Revert "- Updated to 3.0-rc1."

author Jeff Mahoney <jeffm@suse.com>

Tue, 31 May 2011 03:09:44 +0000 (23:09 -0400)

committer Jeff Mahoney <jeffm@suse.com>

Tue, 31 May 2011 03:09:44 +0000 (23:09 -0400)
author Jeff Mahoney <jeffm@suse.com>
Tue, 31 May 2011 03:09:44 +0000 (23:09 -0400)
committer Jeff Mahoney <jeffm@suse.com>
Tue, 31 May 2011 03:09:44 +0000 (23:09 -0400)
diff --cc Documentation/kernel-parameters.txt

index 658476b,cc85a92..6d2c29c
--- 1/Documentation/kernel-parameters.txt
--- 2/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -819,6 -819,6 +819,24 @@@ bytes respectively. Such letter suffixe
         gpt             [EFI] Forces disk with valid GPT signature but
                         invalid Protective MBR to be treated as GPT.
   
++      guestdev=       [PCI,ACPI,XEN]
++                      Format: {<device path>|<sbdf>}[,{<device path>|<sbdf>}[,...]]
++                      Format of device path: <hid>[:<uid>]-<dev>.<func>[-<dev>.<func>[,...]][+iomul]
++                      Format of sbdf: [<segment>:]<bus>:<dev>.<func>[+iomul]
++                      Specifies PCI device for guest domain.
++                      If PCI-PCI bridge is specified, all PCI devices
++                      behind PCI-PCI bridge are reserved.
++                      +iomul means that this PCI function will share
++                      IO ports with other +iomul functions under same
++                      switch. NOTE: if +iomul is specified, all the functions
++                      of the device will share IO ports.
++
++      guestiomuldev=  [PCI,ACPI,XEN]
++                      Format: [<sbd>][,<sbd>][,...]
++                      Format of sbd: [<segment>:]<bus>:<dev>
++                      Note: function shouldn't be specified.
++                      Specifies PCI device for IO port multiplexing driver.
++
         hashdist=       [KNL,NUMA] Large hashes allocated during boot
                         are distributed across NUMA nodes.  Defaults on
                         for 64bit NUMA, off otherwise.
@@@ -2019,6 -2012,6 +2036,13 @@@
                                 off: Turn ECRC off
                                 on: Turn ECRC on.
   
++      pci_reserve=    [PCI]
++                      Format: [<sbdf>[+IO<size>][+MEM<size>]][,<sbdf>...]
++                      Format of sbdf: [<segment>:]<bus>:<dev>.<func>
++                      Specifies the least reserved io size or memory size
++                      which is assigned to PCI bridge even when no child
++                      pci device exists. This is useful with PCI hotplug.
++
         pcie_aspm=      [PCIE] Forcibly enable or disable PCIe Active State Power
                         Management.
                 off     Disable ASPM.
@@@ -2189,6 -2182,6 +2213,10 @@@
                         Run specified binary instead of /init from the ramdisk,
                         used for early userspace startup. See initrd.
   
++      reassign_resources      [PCI,ACPI,XEN]
++                      Use guestdev= parameter to reassign device's
++                      resources, or specify =all here.
++
         reboot=         [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode
                         Format: <reboot_mode>[,<reboot_mode2>[,...]]
                         See arch/*/kernel/reboot.c or arch/*/kernel/process.c
diff --cc Documentation/sysctl/kernel.txt
Simple merge
diff --cc Documentation/transcendent-memory.txt

index 0000000,0000000..f1ac90d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/Documentation/transcendent-memory.txt
@@@ -1,0 -1,0 +1,176 @@@
++Normal memory is directly addressable by the kernel, of a known
++normally-fixed size, synchronously accessible, and persistent (though
++not across a reboot).
++
++What if there was a class of memory that is of unknown and dynamically
++variable size, is addressable only indirectly by the kernel, can be
++configured either as persistent or as "ephemeral" (meaning it will be
++around for a while, but might disappear without warning), and is still
++fast enough to be synchronously accessible?
++
++We call this latter class "transcendent memory" and it provides an
++interesting opportunity to more efficiently utilize RAM in a virtualized
++environment.  However this "memory but not really memory" may also have
++applications in NON-virtualized environments, such as hotplug-memory
++deletion, SSDs, and page cache compression.  Others have suggested ideas
++such as allowing use of highmem memory without a highmem kernel, or use
++of spare video memory.
++
++Transcendent memory, or "tmem" for short, provides a well-defined API to
++access this unusual class of memory.  (A summary of the API is provided
++below.)  The basic operations are page-copy-based and use a flexible
++object-oriented addressing mechanism.  Tmem assumes that some "privileged
++entity" is capable of executing tmem requests and storing pages of data;
++this entity is currently a hypervisor and operations are performed via
++hypercalls, but the entity could be a kernel policy, or perhaps a
++"memory node" in a cluster of blades connected by a high-speed
++interconnect such as hypertransport or QPI.
++
++Since tmem is not directly accessible and because page copying is done
++to/from physical pageframes, it is more suitable for in-kernel memory needs
++than for userland applications.  However, there may be yet undiscovered
++userland possibilities.
++
++With the tmem concept outlined vaguely and its broader potential hinted,
++we will overview two existing examples of how tmem can be used by the
++kernel.
++
++"Cleancache" can be thought of as a page-granularity victim cache for clean
++pages that the kernel's pageframe replacement algorithm (PFRA) would like
++to keep around, but can't since there isn't enough memory.   So when the
++PFRA "evicts" a page, it first puts it into the cleancache via a call to
++tmem.  And any time a filesystem reads a page from disk, it first attempts
++to get the page from cleancache.  If it's there, a disk access is eliminated.
++If not, the filesystem just goes to the disk like normal.  Cleancache is
++"ephemeral" so whether a page is kept in cleancache (between the "put" and
++the "get") is dependent on a number of factors that are invisible to
++the kernel.
++
++"Frontswap" is so named because it can be thought of as the opposite of
++a "backing store". Frontswap IS persistent, but for various reasons may not
++always be available for use, again due to factors that may not be visible to
++the kernel. (But, briefly, if the kernel is being "good" and has shared its
++resources nicely, then it will be able to use frontswap, else it will not.)
++Once a page is put, a get on the page will always succeed.  So when the
++kernel finds itself in a situation where it needs to swap out a page, it
++first attempts to use frontswap.  If the put works, a disk write and
++(usually) a disk read are avoided.  If it doesn't, the page is written
++to swap as usual.  Unlike cleancache, whether a page is stored in frontswap
++vs swap is recorded in kernel data structures, so when a page needs to
++be fetched, the kernel does a get if it is in frontswap and reads from
++swap if it is not in frontswap.
++
++Both cleancache and frontswap may be optionally compressed, trading off 2x
++space reduction vs 10x performance for access.  Cleancache also has a
++sharing feature, which allows different nodes in a "virtual cluster"
++to share a local page cache.
++
++Tmem has some similarity to IBM's Collaborative Memory Management, but
++creates more of a partnership between the kernel and the "privileged
++entity" and is not very invasive.  Tmem may be applicable for KVM and
++containers; there is some disagreement on the extent of its value.
++Tmem is highly complementary to ballooning (aka page granularity hot
++plug) and memory deduplication (aka transparent content-based page
++sharing) but still has value when neither is present.
++
++Performance is difficult to quantify because some benchmarks respond
++very favorably to increases in memory and tmem may do quite well on
++those, depending on how much tmem is available which may vary widely
++and dynamically, depending on conditions completely outside of the
++system being measured.  Ideas on how best to provide useful metrics
++would be appreciated.
++
++Tmem is supported starting in Xen 4.0 and is in Xen's Linux 2.6.18-xen
++source tree.  It is also released as a technology preview in Oracle's
++Xen-based virtualization product, Oracle VM 2.2.  Again, Xen is not
++necessarily a requirement, but currently provides the only existing
++implementation of tmem.
++
++Lots more information about tmem can be found at:
++  http://oss.oracle.com/projects/tmem
++and there was a talk about it on the first day of Linux Symposium in
++July 2009; an updated talk is planned at linux.conf.au in January 2010.
++Tmem is the result of a group effort, including Dan Magenheimer,
++Chris Mason, Dave McCracken, Kurt Hackel and Zhigang Wang, with helpful
++input from Jeremy Fitzhardinge, Keir Fraser, Ian Pratt, Sunil Mushran,
++Joel Becker, and Jan Beulich.
++
++THE TRANSCENDENT MEMORY API
++
++Transcendent memory is made up of a set of pools.  Each pool is made
++up of a set of objects.  And each object contains a set of pages.
++The combination of a 32-bit pool id, a 64-bit object id, and a 32-bit
++page id uniquely identifies a page of tmem data, and this tuple is called
++a "handle." Commonly, the three parts of a handle are used to address
++a filesystem, a file within that filesystem, and a page within that file;
++however an OS can use any values as long as they uniquely identify
++a page of data.
++
++When a tmem pool is created, it is given certain attributes: It can
++be private or shared, and it can be persistent or ephemeral.  Each
++combination of these attributes provides a different set of useful
++functionality and also defines a slightly different set of semantics
++for the various operations on the pool.  Other pool attributes include
++the size of the page and a version number.
++
++Once a pool is created, operations are performed on the pool.  Pages
++are copied between the OS and tmem and are addressed using a handle.
++Pages and/or objects may also be flushed from the pool.  When all
++operations are completed, a pool can be destroyed.
++
++The specific tmem functions are called in Linux through a set of
++accessor functions:
++
++int (*new_pool)(struct tmem_pool_uuid uuid, u32 flags);
++int (*destroy_pool)(u32 pool_id);
++int (*put_page)(u32 pool_id, u64 object, u32 index, unsigned long pfn);
++int (*get_page)(u32 pool_id, u64 object, u32 index, unsigned long pfn);
++int (*flush_page)(u32 pool_id, u64 object, u32 index);
++int (*flush_object)(u32 pool_id, u64 object);
++
++The new_pool accessor creates a new pool and returns a pool id
++which is a non-negative 32-bit integer.  If the flags parameter
++specifies that the pool is to be shared, the uuid is a 128-bit "shared
++secret" else it is ignored.  The destroy_pool accessor destroys the pool.
++(Note: shared pools are not supported until security implications
++are better understood.)
++
++The put_page accessor copies a page of data from the specified pageframe
++and associates it with the specified handle.
++
++The get_page accessor looks up a page of data in tmem associated with
++the specified handle and, if found, copies it to the specified pageframe.
++
++The flush_page accessor ensures that subsequent gets of a page with
++the specified handle will fail.  The flush_object accessor ensures
++that subsequent gets of any page matching the pool id and object
++will fail.
++
++There are many subtle but critical behaviors for get_page and put_page:
++- Any put_page (with one notable exception) may be rejected and the client
++  must be prepared to deal with that failure.  A put_page copies, NOT moves,
++  data; that is the data exists in both places.  Linux is responsible for
++  destroying or overwriting its own copy, or alternately managing any
++  coherency between the copies.
++- Every page successfully put to a persistent pool must be found by a
++  subsequent get_page that specifies the same handle.  A page successfully
++  put to an ephemeral pool has an indeterminate lifetime and even an
++  immediately subsequent get_page may fail.
++- A get_page to a private pool is destructive, that is it behaves as if
++  the get_page were atomically followed by a flush_page.  A get_page
++  to a shared pool is non-destructive.  A flush_page behaves just like
++  a get_page to a private pool except the data is thrown away.
++- Put-put-get coherency is guaranteed.  For example, after the sequence:
++        put_page(ABC,D1);
++        put_page(ABC,D2);
++        get_page(ABC,E)
++  E may never contain the data from D1.  However, even for a persistent
++  pool, the get_page may fail if the second put_page indicates failure.
++- Get-get coherency is guaranteed.  For example, in the sequence:
++        put_page(ABC,D);
++        get_page(ABC,E1);
++        get_page(ABC,E2)
++  if the first get_page fails, the second must also fail.
++- A tmem implementation provides no serialization guarantees (e.g. to
++  an SMP Linux).  So if different Linux threads are putting and flushing
++  the same page, the results are indeterminate.
diff --cc Makefile

index b630e26,123d858..92f3bc3
--- 1/Makefile
--- 2/Makefile
+++ b/Makefile
@@@ -411,8 -382,6 +402,7 @@@ export KBUILD_CFLAGS CFLAGS_KERNEL CFLA
   export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
   export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
   export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
- export KBUILD_ARFLAGS
+ +export KBUILD_KMSG_CHECK KMSG_CHECK
   
   # When compiling out-of-tree modules, put MODVERDIR in the module
   # tree rather than in the kernel tree. The kernel tree might
diff --cc arch/ia64/Kconfig

index 8f9000f,e5cc56a..5c80a77
--- 1/arch/ia64/Kconfig
--- 2/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@@ -227,7 -231,7 +231,7 @@@ config IA64_HP_SI
   config IA64_XEN_GUEST
         bool "Xen guest"
         select SWIOTLB
--      depends on XEN
++      depends on PARAVIRT_XEN
         help
           Build a kernel that runs on Xen guest domain. At this moment only
           16KB page size in supported.
diff --cc arch/ia64/Makefile

index be7bfa1,be7bfa1..342907d
--- 1/arch/ia64/Makefile
--- 2/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@@ -55,7 -55,7 +55,7 @@@ core-$(CONFIG_IA64_XEN_GUEST) += arch/i
   core-$(CONFIG_IA64_SGI_SN2)   += arch/ia64/sn/
   core-$(CONFIG_IA64_SGI_UV)    += arch/ia64/uv/
   core-$(CONFIG_KVM)            += arch/ia64/kvm/
--core-$(CONFIG_XEN)            += arch/ia64/xen/
++core-$(CONFIG_PARAVIRT_XEN)   += arch/ia64/xen/
   
   drivers-$(CONFIG_PCI)         += arch/ia64/pci/
   drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/
diff --cc arch/ia64/include/asm/xen/hypervisor.h

index 67455c2,67455c2..aacad12
--- 1/arch/ia64/include/asm/xen/hypervisor.h
--- 2/arch/ia64/include/asm/xen/hypervisor.h
+++ b/arch/ia64/include/asm/xen/hypervisor.h
@@@ -34,13 -34,13 +34,13 @@@
   #define _ASM_IA64_XEN_HYPERVISOR_H
   
   #include <linux/err.h>
++#include <xen/xen.h>
++#ifdef CONFIG_PARAVIRT_XEN
   #include <xen/interface/xen.h>
   #include <xen/interface/version.h>    /* to compile feature.c */
   #include <xen/features.h>             /* to comiple xen-netfront.c */
--#include <xen/xen.h>
   #include <asm/xen/hypercall.h>
   
--#ifdef CONFIG_XEN
   extern struct shared_info *HYPERVISOR_shared_info;
   extern struct start_info *xen_start_info;
   
diff --cc arch/ia64/include/asm/xen/interface.h

index e951e74,e951e74..5d94af7
--- 1/arch/ia64/include/asm/xen/interface.h
--- 2/arch/ia64/include/asm/xen/interface.h
+++ b/arch/ia64/include/asm/xen/interface.h
@@@ -56,29 -56,29 +56,21 @@@
   #ifndef _ASM_IA64_XEN_INTERFACE_H
   #define _ASM_IA64_XEN_INTERFACE_H
   
--#define __DEFINE_GUEST_HANDLE(name, type)     \
++#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
         typedef struct { type *p; } __guest_handle_ ## name
   
   #define DEFINE_GUEST_HANDLE_STRUCT(name)      \
--      __DEFINE_GUEST_HANDLE(name, struct name)
--#define DEFINE_GUEST_HANDLE(name)     __DEFINE_GUEST_HANDLE(name, name)
--#define GUEST_HANDLE(name)            __guest_handle_ ## name
--#define GUEST_HANDLE_64(name)         GUEST_HANDLE(name)
++      __DEFINE_XEN_GUEST_HANDLE(name, struct name)
++#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
++#define XEN_GUEST_HANDLE(name)                __guest_handle_ ## name
++#define XEN_GUEST_HANDLE_64(name)     XEN_GUEST_HANDLE(name)
   #define set_xen_guest_handle(hnd, val)        do { (hnd).p = val; } while (0)
   
   #ifndef __ASSEMBLY__
--/* Guest handles for primitive C types. */
--__DEFINE_GUEST_HANDLE(uchar, unsigned char);
--__DEFINE_GUEST_HANDLE(uint, unsigned int);
--__DEFINE_GUEST_HANDLE(ulong, unsigned long);
--__DEFINE_GUEST_HANDLE(u64, unsigned long);
--DEFINE_GUEST_HANDLE(char);
--DEFINE_GUEST_HANDLE(int);
--DEFINE_GUEST_HANDLE(long);
--DEFINE_GUEST_HANDLE(void);
++__DEFINE_XEN_GUEST_HANDLE(u64, unsigned long);
   
++typedef unsigned long xen_ulong_t;
   typedef unsigned long xen_pfn_t;
--DEFINE_GUEST_HANDLE(xen_pfn_t);
   #define PRI_xen_pfn   "lx"
   #endif
   
@@@ -90,7 -90,7 +82,7 @@@
   /* Maximum number of virtual CPUs in multi-processor guests. */
   /* keep sizeof(struct shared_page) <= PAGE_SIZE.
    * this is checked in arch/ia64/xen/hypervisor.c. */
--#define MAX_VIRT_CPUS 64
++#define XEN_LEGACY_MAX_VCPUS 64
   
   #ifndef __ASSEMBLY__
   
diff --cc arch/ia64/kernel/asm-offsets.c

index af56501,af56501..166ced4
--- 1/arch/ia64/kernel/asm-offsets.c
--- 2/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@@ -290,7 -290,7 +290,7 @@@ void foo(void
         DEFINE(IA64_ITC_LASTCYCLE_OFFSET,
                 offsetof (struct itc_jitter_data_t, itc_lastcycle));
   
--#ifdef CONFIG_XEN
++#ifdef CONFIG_PARAVIRT_XEN
         BLANK();
   
         DEFINE(XEN_NATIVE_ASM, XEN_NATIVE);
diff --cc arch/ia64/kernel/vmlinux.lds.S

index 53c0ba0,787de4a..7d5aa7f
--- 1/arch/ia64/kernel/vmlinux.lds.S
--- 2/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@@ -183,7 -183,7 +183,7 @@@ SECTIONS 
                 __start_gate_section = .;
                 *(.data..gate)
                 __stop_gate_section = .;
--#ifdef CONFIG_XEN
++#ifdef CONFIG_PARAVIRT_XEN
                 . = ALIGN(PAGE_SIZE);
                 __xen_start_gate_section = .;
                 *(.data..gate.xen)
diff --cc arch/ia64/xen/Kconfig

index 515e082,515e082..14d8ac6
--- 1/arch/ia64/xen/Kconfig
--- 2/arch/ia64/xen/Kconfig
+++ b/arch/ia64/xen/Kconfig
@@@ -2,7 -2,7 +2,7 @@@
   # This Kconfig describes xen/ia64 options
   #
   
--config XEN
++config PARAVIRT_XEN
         bool "Xen hypervisor support"
         default y
         depends on PARAVIRT && MCKINLEY && IA64_PAGE_SIZE_16KB && EXPERIMENTAL
@@@ -16,10 -16,10 +16,6 @@@
           Enable Xen hypervisor support.  Resulting kernel runs
           both as a guest OS on Xen and natively on hardware.
   
--config XEN_XENCOMM
--      depends on XEN
--      bool
--
   config NO_IDLE_HZ
--      depends on XEN
++      depends on PARAVIRT_XEN
         bool
diff --cc arch/ia64/xen/xcom_hcall.c

index ccaf743,ccaf743..7690fc3
--- 1/arch/ia64/xen/xcom_hcall.c
--- 2/arch/ia64/xen/xcom_hcall.c
+++ b/arch/ia64/xen/xcom_hcall.c
@@@ -343,7 -343,7 +343,7 @@@ xencommize_memory_reservation(struct xe
   int
   xencomm_hypercall_memory_op(unsigned int cmd, void *arg)
   {
--      GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} };
++      XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} };
         struct xen_memory_reservation *xmr = NULL;
         int rc;
         struct xencomm_handle *desc;
diff --cc arch/powerpc/kernel/prom_init.c
Simple merge
diff --cc arch/powerpc/platforms/pseries/setup.c
Simple merge
diff --cc arch/powerpc/xmon/xmon.c
Simple merge
diff --cc arch/s390/Kconfig
Simple merge
diff --cc arch/x86/Kbuild

index 0e9dec6,0e10323..30f3ffc
--- 1/arch/x86/Kbuild
--- 2/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@@ -2,7 -2,7 +2,7 @@@
   obj-$(CONFIG_KVM) += kvm/
   
   # Xen paravirtualization support
--obj-$(CONFIG_XEN) += xen/
++obj-$(CONFIG_PARAVIRT_XEN) += xen/
   
   # lguest paravirtualization support
   obj-$(CONFIG_LGUEST_GUEST) += lguest/
diff --cc arch/x86/Kconfig

index 686c6d7,cc6c53a..53625c1
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -39,8 -40,8 +40,8 @@@ config X8
         select HAVE_FUNCTION_TRACE_MCOUNT_TEST
         select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
         select HAVE_SYSCALL_TRACEPOINTS
--      select HAVE_KVM
--      select HAVE_ARCH_KGDB
++      select HAVE_KVM if !XEN
++      select HAVE_ARCH_KGDB if !XEN
         select HAVE_ARCH_TRACEHOOK
         select HAVE_GENERIC_DMA_COHERENT if X86_32
         select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@@ -48,14 -49,14 +49,14 @@@
         select HAVE_REGS_AND_STACK_ACCESS_API
         select HAVE_DMA_API_DEBUG
         select HAVE_KERNEL_GZIP
--      select HAVE_KERNEL_BZIP2
--      select HAVE_KERNEL_LZMA
--      select HAVE_KERNEL_XZ
--      select HAVE_KERNEL_LZO
++      select HAVE_KERNEL_BZIP2 if !XEN
++      select HAVE_KERNEL_LZMA if !XEN
++      select HAVE_KERNEL_XZ if !XEN
++      select HAVE_KERNEL_LZO if !XEN
         select HAVE_HW_BREAKPOINT
         select HAVE_MIXED_BREAKPOINTS_REGS
         select PERF_EVENTS
--      select HAVE_PERF_EVENTS_NMI
++      select HAVE_PERF_EVENTS_NMI if !XEN
         select ANON_INODES
         select HAVE_ARCH_KMEMCHECK
         select HAVE_USER_RETURN_NOTIFIER
@@@ -89,6 -91,6 +91,7 @@@ config GENERIC_CMOS_UPDAT
   
   config CLOCKSOURCE_WATCHDOG
         def_bool y
++      depends on !XEN
   
   config GENERIC_CLOCKEVENTS
         def_bool y
@@@ -96,6 -98,6 +99,7 @@@
   config GENERIC_CLOCKEVENTS_BROADCAST
         def_bool y
         depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
++      depends on !XEN
   
   config LOCKDEP_SUPPORT
         def_bool y
@@@ -123,7 -118,7 +120,7 @@@ config SBU
         bool
   
   config NEED_DMA_MAP_STATE
--       def_bool (X86_64 || DMAR || DMA_API_DEBUG)
++       def_bool (X86_64 || DMAR || DMA_API_DEBUG || SWIOTLB)
   
   config NEED_SG_DMA_LENGTH
         def_bool y
@@@ -190,6 -185,6 +187,7 @@@ config HAVE_CPUMASK_OF_CPU_MA
   
   config ARCH_HIBERNATION_POSSIBLE
         def_bool y
++      depends on !XEN
   
   config ARCH_SUSPEND_POSSIBLE
         def_bool y
@@@ -225,7 -220,7 +223,15 @@@ config X86_64_SM
   
   config X86_HT
         def_bool y
--      depends on SMP
++      depends on SMP && !XEN
++
++config X86_NO_TSS
++      def_bool y
++      depends on XEN
++
++config X86_NO_IDT
++      def_bool y
++      depends on XEN
   
   config X86_32_LAZY_GS
         def_bool y
@@@ -241,7 -236,7 +247,7 @@@ config KTIME_SCALA
   
   config ARCH_CPU_PROBE_RELEASE
         def_bool y
--      depends on HOTPLUG_CPU
++      depends on HOTPLUG_CPU && !XEN
   
   source "init/Kconfig"
   source "kernel/Kconfig.freezer"
@@@ -297,13 -292,13 +303,24 @@@ config X86_MPPARS
           For old smp systems that do not have proper acpi support. Newer systems
           (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
   
++config X86_XEN
++      bool "Xen-compatible"
++      depends on X86_32
++      select XEN
++      select X86_PAE
++      select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
++      select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
++      help
++        Choose this option if you plan to run this kernel on top of the
++        Xen Hypervisor.
++
   config X86_BIGSMP
         bool "Support for big SMP systems with more than 8 CPUs"
--      depends on X86_32 && SMP
++      depends on X86_32 && SMP && !XEN
         ---help---
           This option is needed for the systems that have more than 8 CPUs
   
--if X86_32
++if X86_32 && !XEN
   config X86_EXTENDED_PLATFORM
         bool "Support for extended (non-PC) x86 platforms"
         default y
@@@ -326,7 -321,7 +343,14 @@@
           generic distribution kernel, say Y here - otherwise say N.
   endif
   
--if X86_64
++config X86_64_XEN
++      bool "Enable Xen compatible kernel"
++      depends on X86_64
++      select XEN
++      help
++        This option will compile a kernel compatible with Xen hypervisor
++
++if X86_64 && !XEN
   config X86_EXTENDED_PLATFORM
         bool "Support for extended (non-PC) x86 platforms"
         default y
@@@ -479,7 -485,7 +514,7 @@@ config X86_ES700
   
   config X86_32_IRIS
         tristate "Eurobraille/Iris poweroff module"
--      depends on X86_32
++      depends on X86_32 && !XEN
         ---help---
           The Iris machines from EuroBraille do not have APM or ACPI support
           to shut themselves down properly.  A special I/O sequence is
@@@ -504,6 -510,6 +539,7 @@@ config SCHED_OMIT_FRAME_POINTE
   
   menuconfig PARAVIRT_GUEST
         bool "Paravirtualized guest support"
++      depends on !XEN
         ---help---
           Say Y here to get to see options related to running Linux under
           various hypervisors.  This option alone does not add any kernel code.
@@@ -572,6 -578,6 +608,7 @@@ config NO_BOOTME
   
   config MEMTEST
         bool "Memtest"
++      depends on !XEN
         ---help---
           This option adds a kernel parameter 'memtest', which allows memtest
           to be set.
@@@ -594,6 -600,6 +631,7 @@@ source "arch/x86/Kconfig.cpu
   config HPET_TIMER
         def_bool X86_64
         prompt "HPET Timer Support" if X86_32
++      depends on !XEN
         ---help---
           Use the IA-PC HPET (High Precision Event Timer) to manage
           time in preference to the PIT and RTC, if a HPET is
@@@ -629,6 -635,6 +667,7 @@@ config APB_TIME
   config DMI
         default y
         bool "Enable DMI scanning" if EXPERT
++      depends on !XEN_UNPRIVILEGED_GUEST
         ---help---
           Enabled scanning of DMI to identify machine quirks. Say Y
           here unless you have verified that your setup is not
@@@ -639,7 -645,7 +678,7 @@@ config GART_IOMM
         bool "GART IOMMU support" if EXPERT
         default y
         select SWIOTLB
--      depends on X86_64 && PCI && AMD_NB
++      depends on X86_64 && PCI && AMD_NB && !X86_64_XEN
         ---help---
           Support for full DMA access of devices with 32bit memory access only
           on systems with more than 3GB. This is usually needed for USB,
@@@ -654,7 -660,7 +693,7 @@@
   config CALGARY_IOMMU
         bool "IBM Calgary IOMMU support"
         select SWIOTLB
--      depends on X86_64 && PCI && EXPERIMENTAL
++      depends on X86_64 && PCI && !X86_64_XEN && EXPERIMENTAL
         ---help---
           Support for hardware IOMMUs in IBM's xSeries x366 and x460
           systems. Needed to run systems with more than 3GB of memory
@@@ -684,8 -690,7 +723,7 @@@ config AMD_IOMM
         bool "AMD IOMMU support"
         select SWIOTLB
         select PCI_MSI
-       select PCI_IOV
--      depends on X86_64 && PCI && ACPI
++      depends on X86_64 && PCI && ACPI && !XEN
         ---help---
           With this option you can enable support for AMD IOMMU hardware in
           your system. An IOMMU is a hardware component which provides
@@@ -709,7 -714,7 +747,8 @@@ config AMD_IOMMU_STAT
   
   # need this always selected by IOMMU for the VIA workaround
   config SWIOTLB
--      def_bool y if X86_64
++      def_bool y if X86_64 || XEN
++      prompt "Software I/O TLB" if XEN_UNPRIVILEGED_GUEST && !XEN_PCIDEV_FRONTEND
         ---help---
           Support for software bounce buffers used on x86-64 systems
           which don't have a hardware IOMMU (e.g. the current generation
@@@ -733,11 -738,11 +772,12 @@@ config MAXSM
   
   config NR_CPUS
         int "Maximum number of CPUs" if SMP && !MAXSMP
--      range 2 8 if SMP && X86_32 && !X86_BIGSMP
++      range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN
         range 2 512 if SMP && !MAXSMP
         default "1" if !SMP
         default "4096" if MAXSMP
         default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
++      default "16" if X86_64_XEN
         default "8" if SMP
         ---help---
           This allows you to specify the maximum number of CPUs which this
@@@ -780,7 -785,7 +820,7 @@@ source "kernel/Kconfig.preempt
   
   config X86_UP_APIC
         bool "Local APIC support on uniprocessors"
--      depends on X86_32 && !SMP && !X86_32_NON_STANDARD
++      depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !XEN_UNPRIVILEGED_GUEST
         ---help---
           A local APIC (Advanced Programmable Interrupt Controller) is an
           integrated interrupt controller in the CPU. If you have a single-CPU
@@@ -806,10 -811,10 +846,12 @@@ config X86_UP_IOAPI
   config X86_LOCAL_APIC
         def_bool y
         depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
++      depends on !XEN_UNPRIVILEGED_GUEST
   
   config X86_IO_APIC
         def_bool y
         depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
++      depends on !XEN_UNPRIVILEGED_GUEST
   
   config X86_VISWS_APIC
         def_bool y
@@@ -817,7 -822,7 +859,7 @@@
   
   config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
         bool "Reroute for broken boot IRQs"
--      depends on X86_IO_APIC
++      depends on X86_IO_APIC && !XEN
         ---help---
           This option enables a workaround that fixes a source of
           spurious interrupts. This is recommended when threaded
@@@ -840,6 -845,6 +882,7 @@@
   
   config X86_MCE
         bool "Machine Check / overheating reporting"
++      depends on !XEN_UNPRIVILEGED_GUEST
         ---help---
           Machine Check support allows the processor to notify the
           kernel if it detects a problem (e.g. overheating, data corruption).
@@@ -849,7 -854,7 +892,7 @@@
   config X86_MCE_INTEL
         def_bool y
         prompt "Intel MCE features"
--      depends on X86_MCE && X86_LOCAL_APIC
++      depends on X86_MCE && X86_LOCAL_APIC && !XEN
         ---help---
            Additional support for intel specific MCE features such as
            the thermal monitor.
@@@ -857,14 -862,14 +900,14 @@@
   config X86_MCE_AMD
         def_bool y
         prompt "AMD MCE features"
--      depends on X86_MCE && X86_LOCAL_APIC
++      depends on X86_MCE && X86_LOCAL_APIC && !XEN
         ---help---
            Additional support for AMD specific MCE features such as
            the DRAM Error Threshold.
   
   config X86_ANCIENT_MCE
         bool "Support for old Pentium 5 / WinChip machine checks"
--      depends on X86_32 && X86_MCE
++      depends on X86_32 && X86_MCE && !XEN
         ---help---
           Include support for machine check handling on old Pentium 5 or WinChip
           systems. These typically need to be enabled explicitely on the command
@@@ -882,6 -887,6 +925,10 @@@ config X86_MCE_INJEC
           If you don't know what a machine check is and you don't do kernel
           QA it is safe to say n.
   
++config X86_XEN_MCE
++      def_bool y
++      depends on XEN && X86_MCE
++
   config X86_THERMAL_VECTOR
         def_bool y
         depends on X86_MCE_INTEL
@@@ -935,7 -939,7 +981,7 @@@ config I8
   
   config X86_REBOOTFIXUPS
         bool "Enable X86 board specific fixups for reboot"
--      depends on X86_32
++      depends on X86_32 && !XEN
         ---help---
           This enables chipset and/or board specific fixups to be done
           in order to get reboot to work correctly. This is only needed on
@@@ -952,6 -956,6 +998,7 @@@
   
   config MICROCODE
         tristate "/dev/cpu/microcode - microcode support"
++      depends on !XEN_UNPRIVILEGED_GUEST
         select FW_LOADER
         ---help---
           If you say Y here, you will be able to update the microcode on
@@@ -970,7 -974,7 +1017,7 @@@
   
   config MICROCODE_INTEL
         bool "Intel microcode patch loading support"
--      depends on MICROCODE
++      depends on MICROCODE && !XEN
         default MICROCODE
         select FW_LOADER
         ---help---
@@@ -983,7 -987,7 +1030,7 @@@
   
   config MICROCODE_AMD
         bool "AMD microcode patch loading support"
--      depends on MICROCODE
++      depends on MICROCODE && !XEN
         select FW_LOADER
         ---help---
           If you select this option, microcode patch loading support for AMD
@@@ -995,6 -999,6 +1042,7 @@@ config MICROCODE_OLD_INTERFAC
   
   config X86_MSR
         tristate "/dev/cpu/*/msr - Model-specific register support"
++      select XEN_DOMCTL if XEN_PRIVILEGED_GUEST
         ---help---
           This device gives privileged processes access to the x86
           Model-Specific Registers (MSRs).  It is a character device with
@@@ -1012,7 -1016,7 +1060,7 @@@ config X86_CPUI
   
   choice
         prompt "High Memory Support"
--      default HIGHMEM64G if X86_NUMAQ
++      default HIGHMEM64G if X86_NUMAQ || XEN
         default HIGHMEM4G
         depends on X86_32
   
@@@ -1055,7 -1059,7 +1103,7 @@@ config NOHIGHME
   
   config HIGHMEM4G
         bool "4GB"
--      depends on !X86_NUMAQ
++      depends on !X86_NUMAQ && !XEN
         ---help---
           Select this if you have a 32-bit processor and between 1 and 4
           gigabytes of physical RAM.
@@@ -1131,12 -1135,12 +1179,12 @@@ config ARCH_PHYS_ADDR_T_64BI
         def_bool X86_64 || X86_PAE
   
   config ARCH_DMA_ADDR_T_64BIT
--      def_bool X86_64 || HIGHMEM64G
++      def_bool X86_64 || XEN || HIGHMEM64G
   
   config DIRECT_GBPAGES
         bool "Enable 1GB pages for kernel pagetables" if EXPERT
         default y
--      depends on X86_64
++      depends on X86_64 && !XEN
         ---help---
           Allow the kernel linear mapping to use 1GB pages on CPUs that
           support it. This can improve the kernel's performance a tiny bit by
@@@ -1145,7 -1149,7 +1193,7 @@@
   # Common NUMA Features
   config NUMA
         bool "Numa Memory Allocation and Scheduler Support"
--      depends on SMP
++      depends on SMP && !XEN
         depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
         default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
         ---help---
@@@ -1243,9 -1247,17 +1291,18 @@@ config ARCH_DISCONTIGMEM_DEFAUL
         def_bool y
         depends on NUMA && X86_32
   
+ config ARCH_PROC_KCORE_TEXT
+       def_bool y
+       depends on X86_64 && PROC_KCORE
+ 
+ config ARCH_SPARSEMEM_DEFAULT
+       def_bool y
- -      depends on X86_64
++      depends on X86_64 && !X86_64_XEN
+ 
   config ARCH_SPARSEMEM_ENABLE
         def_bool y
         depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
++      depends on !XEN
         select SPARSEMEM_STATIC if X86_32
         select SPARSEMEM_VMEMMAP_ENABLE if X86_64
   
@@@ -1283,6 -1287,6 +1332,7 @@@ config HIGHPT
   
   config X86_CHECK_BIOS_CORRUPTION
         bool "Check for low memory corruption"
++      depends on !XEN
         ---help---
           Periodically check for memory corruption in low memory, which
           is suspected to be caused by BIOS.  Even when enabled in the
@@@ -1313,6 -1317,6 +1363,7 @@@ config X86_BOOTPARAM_MEMORY_CORRUPTION_
   
   config X86_RESERVE_LOW
         int "Amount of low memory, in kilobytes, to reserve for the BIOS"
++      depends on !XEN
         default 64
         range 4 640
         ---help---
@@@ -1343,6 -1347,6 +1394,7 @@@
   config MATH_EMULATION
         bool
         prompt "Math emulation" if X86_32
++      depends on !XEN
         ---help---
           Linux can emulate a math coprocessor (used for floating point
           operations) if you don't have one. 486DX and Pentium processors have
@@@ -1369,6 -1373,6 +1421,7 @@@
   config MTRR
         def_bool y
         prompt "MTRR (Memory Type Range Register) support" if EXPERT
++      depends on !XEN_UNPRIVILEGED_GUEST
         ---help---
           On Intel P6 family processors (Pentium Pro, Pentium II and later)
           the Memory Type Range Registers (MTRRs) may be used to control
@@@ -1404,7 -1408,7 +1457,7 @@@
   config MTRR_SANITIZER
         def_bool y
         prompt "MTRR cleanup support"
--      depends on MTRR
++      depends on MTRR && !XEN
         ---help---
           Convert MTRR layout from continuous to discrete, so X drivers can
           add writeback entries.
@@@ -1434,8 -1438,8 +1487,8 @@@ config MTRR_SANITIZER_SPARE_REG_NR_DEFA
   
   config X86_PAT
         def_bool y
--      prompt "x86 PAT support" if EXPERT
--      depends on MTRR
++      prompt "x86 PAT support" if EXPERT || XEN_UNPRIVILEGED_GUEST
++      depends on MTRR || (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND)
         ---help---
           Use PAT attributes to setup page level cache control.
   
@@@ -1453,7 -1457,7 +1506,7 @@@ config ARCH_USES_PG_UNCACHE
   
   config EFI
         bool "EFI runtime service support"
--      depends on ACPI
++      depends on ACPI && !XEN
         ---help---
           This enables the kernel to use EFI runtime services that are
           available (such as the EFI variable services).
@@@ -1501,6 -1505,6 +1554,7 @@@ source kernel/Kconfig.h
   
   config KEXEC
         bool "kexec system call"
++      depends on !XEN_UNPRIVILEGED_GUEST
         ---help---
           kexec is a system call that implements the ability to shutdown your
           current kernel, and to start another kernel.  It is like a reboot
@@@ -1518,6 -1522,6 +1572,7 @@@
   config CRASH_DUMP
         bool "kernel crash dumps"
         depends on X86_64 || (X86_32 && HIGHMEM)
++      depends on !XEN
         ---help---
           Generate crash dump after being started by kexec.
           This should be normally only set in special crash dump kernels
@@@ -1538,7 -1542,7 +1593,8 @@@ config KEXEC_JUM
           code in physical address mode via KEXEC
   
   config PHYSICAL_START
--      hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
++      hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP || XEN)
++      default 0x100000 if XEN
         default "0x1000000"
         ---help---
           This gives the physical address where the kernel is loaded.
@@@ -1580,6 -1584,6 +1636,7 @@@
   
   config RELOCATABLE
         bool "Build a relocatable kernel"
++      depends on !XEN
         default y
         ---help---
           This builds a kernel image that retains relocation information
@@@ -1601,7 -1605,7 +1658,8 @@@ config X86_NEED_RELOC
         depends on X86_32 && RELOCATABLE
   
   config PHYSICAL_ALIGN
--      hex "Alignment value to which kernel should be aligned" if X86_32
++      hex "Alignment value to which kernel should be aligned" if X86_32 && !XEN
++      default 0x2000 if XEN
         default "0x1000000"
         range 0x2000 0x1000000
         ---help---
@@@ -1694,6 -1698,6 +1752,7 @@@ endmen
   config ARCH_ENABLE_MEMORY_HOTPLUG
         def_bool y
         depends on X86_64 || (X86_32 && HIGHMEM)
++      depends on !XEN
   
   config ARCH_ENABLE_MEMORY_HOTREMOVE
         def_bool y
@@@ -1711,6 -1719,6 +1774,8 @@@ config ARCH_HIBERNATION_HEADE
   
   source "kernel/power/Kconfig"
   
++if !XEN_UNPRIVILEGED_GUEST
++
   source "drivers/acpi/Kconfig"
   
   source "drivers/sfi/Kconfig"
@@@ -1721,7 -1729,7 +1786,7 @@@ config X86_APM_BOO
   
   menuconfig APM
         tristate "APM (Advanced Power Management) BIOS support"
--      depends on X86_32 && PM_SLEEP
++      depends on X86_32 && PM_SLEEP && !XEN
         ---help---
           APM is a BIOS specification for saving power using several different
           techniques. This is mostly useful for battery powered laptops with
@@@ -1846,6 -1854,6 +1911,8 @@@ source "drivers/cpuidle/Kconfig
   
   source "drivers/idle/Kconfig"
   
++endif # !XEN_UNPRIVILEGED_GUEST
++
   endmenu
   
   
@@@ -1855,6 -1863,6 +1922,7 @@@ config PC
         bool "PCI support"
         default y
         select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
++      select ARCH_SUPPORTS_MSI if (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND)
         ---help---
           Find out whether you have a PCI motherboard. PCI is the name of a
           bus system, i.e. the way the CPU talks to the other stuff inside
@@@ -1882,25 -1890,25 +1950,36 @@@ choic
   
   config PCI_GOBIOS
         bool "BIOS"
++      depends on !XEN
   
   config PCI_GOMMCONFIG
         bool "MMConfig"
++      depends on !XEN_UNPRIVILEGED_GUEST
   
   config PCI_GODIRECT
         bool "Direct"
++      depends on !XEN_UNPRIVILEGED_GUEST
   
   config PCI_GOOLPC
         bool "OLPC XO-1"
--      depends on OLPC
++      depends on OLPC && !XEN_UNPRIVILEGED_GUEST
++
++config PCI_GOXEN_FE
++      bool "Xen PCI Frontend"
++      depends on X86_XEN
++      help
++        The PCI device frontend driver allows the kernel to import arbitrary
++        PCI devices from a PCI backend to support PCI driver domains.
   
   config PCI_GOANY
         bool "Any"
++      depends on !XEN_UNPRIVILEGED_GUEST
   
   endchoice
   
   config PCI_BIOS
         def_bool y
--      depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY)
++      depends on X86_32 && PCI && !XEN && (PCI_GOBIOS || PCI_GOANY)
   
   # x86-64 doesn't support PCI BIOS access from long mode so always go direct.
   config PCI_DIRECT
@@@ -1917,7 -1925,7 +1996,7 @@@ config PCI_OLP
   
   config PCI_XEN
         def_bool y
--      depends on PCI && XEN
++      depends on PCI && PARAVIRT_XEN
         select SWIOTLB_XEN
   
   config PCI_DOMAINS
@@@ -1944,7 -1952,7 +2023,7 @@@ config PCI_CNB20LE_QUIR
   
   config DMAR
         bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
--      depends on PCI_MSI && ACPI && EXPERIMENTAL
++      depends on PCI_MSI && ACPI && !XEN && EXPERIMENTAL
         help
           DMA remapping (DMAR) devices support enables independent address
           translations for Direct Memory Access (DMA) from devices.
@@@ -1985,7 -1993,7 +2064,7 @@@ config DMAR_FLOPPY_W
   
   config INTR_REMAP
         bool "Support for Interrupt Remapping (EXPERIMENTAL)"
--      depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
++      depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && !XEN && EXPERIMENTAL
         ---help---
           Supports Interrupt remapping for IO-APIC and MSI devices.
           To use x2apic mode in the CPU's which support x2APIC enhancements or
@@@ -2007,6 -2015,6 +2086,7 @@@ if X86_3
   
   config ISA
         bool "ISA support"
++      depends on !XEN
         ---help---
           Find out whether you have ISA slots on your motherboard.  ISA is the
           name of a bus system, i.e. the way the CPU talks to the other stuff
@@@ -2034,6 -2042,6 +2114,7 @@@ source "drivers/eisa/Kconfig
   
   config MCA
         bool "MCA support"
++      depends on !XEN
         ---help---
           MicroChannel Architecture is found in some IBM PS/2 machines and
           laptops.  It is a bus system similar to PCI or ISA. See
@@@ -2065,10 -2073,10 +2146,10 @@@ config SCx200HR_TIME
   
   config OLPC
         bool "One Laptop Per Child support"
--      depends on !X86_PAE
++      depends on !X86_PAE && !XEN
         select GPIOLIB
         select OF
-       select OF_PROMTREE
+       select OF_PROMTREE if PROC_DEVICETREE
         ---help---
           Add support for detecting the unique features of the OLPC
           XO hardware.
@@@ -2083,7 -2091,7 +2164,7 @@@ endif # X86_3
   
   config AMD_NB
         def_bool y
--      depends on CPU_SUP_AMD && PCI
++      depends on CPU_SUP_AMD && PCI && !XEN_UNPRIVILEGED_GUEST
   
   source "drivers/pcmcia/Kconfig"
   
@@@ -2153,7 -2161,7 +2234,9 @@@ source "net/Kconfig
   
   source "drivers/Kconfig"
   
++if !XEN_UNPRIVILEGED_GUEST
   source "drivers/firmware/Kconfig"
++endif
   
   source "fs/Kconfig"
   
diff --cc arch/x86/Kconfig.cpu

index 6a7cfdf,d161e93..97fe690
--- 1/arch/x86/Kconfig.cpu
--- 2/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@@ -6,7 -8,7 +8,7 @@@ choic
   
   config M386
         bool "386"
--      depends on X86_32 && !UML
++      depends on X86_32 && !UML && !XEN
         ---help---
           This is the processor type of your CPU. This information is used for
           optimizing purposes. In order to compile a kernel that can run on
@@@ -47,7 -49,7 +49,7 @@@
   
   config M486
         bool "486"
--      depends on X86_32
++      depends on X86_32 && !XEN
         ---help---
           Select this for a 486 series processor, either Intel or one of the
           compatible processors from AMD, Cyrix, IBM, or Intel.  Includes DX,
@@@ -56,7 -58,7 +58,7 @@@
   
   config M586
         bool "586/K5/5x86/6x86/6x86MX"
--      depends on X86_32
++      depends on X86_32 && !XEN
         ---help---
           Select this for an 586 or 686 series processor such as the AMD K5,
           the Cyrix 5x86, 6x86 and 6x86MX.  This choice does not
@@@ -64,14 -66,14 +66,14 @@@
   
   config M586TSC
         bool "Pentium-Classic"
--      depends on X86_32
++      depends on X86_32 && !XEN
         ---help---
           Select this for a Pentium Classic processor with the RDTSC (Read
           Time Stamp Counter) instruction for benchmarking.
   
   config M586MMX
         bool "Pentium-MMX"
--      depends on X86_32
++      depends on X86_32 && !XEN
         ---help---
           Select this for a Pentium with the MMX graphics/multimedia
           extended instructions.
@@@ -399,6 -395,6 +395,7 @@@ config X86_P6_NO
   config X86_TSC
         def_bool y
         depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64
++      depends on !XEN
   
   config X86_CMPXCHG64
         def_bool y
@@@ -444,7 -440,7 +441,7 @@@ config CPU_SUP_INTE
   config CPU_SUP_CYRIX_32
         default y
         bool "Support Cyrix processors" if PROCESSOR_SELECT
--      depends on !64BIT
++      depends on !64BIT && !XEN
         ---help---
           This enables detection, tunings and quirks for Cyrix processors
   
@@@ -498,7 -494,7 +495,7 @@@ config CPU_SUP_TRANSMETA_3
   config CPU_SUP_UMC_32
         default y
         bool "Support UMC processors" if PROCESSOR_SELECT
--      depends on !64BIT
++      depends on !64BIT && !XEN
         ---help---
           This enables detection, tunings and quirks for UMC processors
   
diff --cc arch/x86/Kconfig.debug

index c0f8a5c,615e188..9ce06c1
--- 1/arch/x86/Kconfig.debug
--- 2/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@@ -25,6 -25,6 +25,7 @@@ config STRICT_DEVME
   config X86_VERBOSE_BOOTUP
         bool "Enable verbose x86 bootup info messages"
         default y
++      depends on !XEN
         ---help---
           Enables the informational output from the decompression stage
           (e.g. bzImage) of the boot. If you disable this you will still
@@@ -119,7 -139,7 +140,7 @@@ config DEBUG_NX_TES
   config DOUBLEFAULT
         default y
         bool "Enable doublefault exception handler" if EXPERT
--      depends on X86_32
++      depends on X86_32 && !X86_NO_TSS
         ---help---
           This option allows trapping of rare doublefault exceptions that
           would otherwise cause a system to silently reboot. Disabling this
@@@ -159,6 -179,6 +180,7 @@@ config IOMMU_LEA
   
   config HAVE_MMIOTRACE_SUPPORT
         def_bool y
++      depends on !XEN
   
   config X86_DECODER_SELFTEST
         bool "x86 instruction decoder selftest"
@@@ -247,6 -267,6 +269,7 @@@ config DEBUG_BOOT_PARAM
         bool "Debug boot parameters"
         depends on DEBUG_KERNEL
         depends on DEBUG_FS
++      depends on !XEN
         ---help---
           This option will cause struct boot_params to be exported via debugfs.
   
diff --cc arch/x86/Makefile

index 724c051,b02e509..48595cb
--- 1/arch/x86/Makefile
--- 2/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@@ -151,9 -149,9 +151,28 @@@ boot := arch/x86/boo
   
   BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage
   
--PHONY += bzImage $(BOOT_TARGETS)
++PHONY += bzImage vmlinuz $(BOOT_TARGETS)
+ +
++ifdef CONFIG_XEN
++LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
++      -I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE)
++
++ifdef CONFIG_X86_64
++LDFLAGS_vmlinux := -e startup_64
++endif
+ 
+ # Default kernel to build
++all: vmlinuz
++
++# KBUILD_IMAGE specifies the target image being built
++KBUILD_IMAGE := $(boot)/vmlinuz
++
++vmlinuz: vmlinux
++      $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
++      $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
++      $(Q)ln -fsn ../../x86/boot/$@ $(objtree)/arch/$(UTS_MACHINE)/boot/$@
++else
+ +# Default kernel to build
   all: bzImage
   
   # KBUILD_IMAGE specify target image being built
@@@ -166,6 -164,6 +185,7 @@@ endi
         $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
         $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
         $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
++endif
   
   $(BOOT_TARGETS): vmlinux
         $(Q)$(MAKE) $(build)=$(boot) $@
diff --cc arch/x86/boot/Makefile

index f7cb086,f7cb086..be383e2
--- 1/arch/x86/boot/Makefile
--- 2/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@@ -23,6 -23,6 +23,7 @@@ ROOT_DEV      := CURREN
   SVGA_MODE     := -DSVGA_MODE=NORMAL_VGA
   
   targets               := vmlinux.bin setup.bin setup.elf bzImage
++targets               += vmlinuz vmlinux-stripped
   targets               += fdimage fdimage144 fdimage288 image.iso mtools.conf
   subdir-               := compressed
   
@@@ -195,6 -195,6 +196,20 @@@ bzlilo: $(obj)/bzImag
         cp System.map $(INSTALL_PATH)/
         if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
   
++$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE
++      $(call if_changed,gzip)
++      @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
++
++$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded
++$(obj)/vmlinux-stripped: vmlinux FORCE
++      $(call if_changed,objcopy)
++
++ifndef CONFIG_XEN
++bzImage := bzImage
++else
++bzImage := vmlinuz
++endif
++
   install:
--      sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
++      sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/$(bzImage) \
                 System.map "$(INSTALL_PATH)"
diff --cc arch/x86/ia32/ia32entry-xen.S

index 0000000,0000000..8572d93

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/ia32/ia32entry-xen.S
@@@ -1,0 -1,0 +1,739 @@@
++/*
++ * Compatibility mode system call entry point for x86-64. 
++ *            
++ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
++ */            
++
++#include <asm/dwarf2.h>
++#include <asm/calling.h>
++#include <asm/asm-offsets.h>
++#include <asm/current.h>
++#include <asm/errno.h>
++#include <asm/ia32_unistd.h>  
++#include <asm/thread_info.h>  
++#include <asm/segment.h>
++#include <asm/irqflags.h>
++#include <linux/linkage.h>
++
++/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
++#include <linux/elf-em.h>
++#define AUDIT_ARCH_I386               (EM_386|__AUDIT_ARCH_LE)
++#define __AUDIT_ARCH_LE          0x40000000
++
++      .section .entry.text, "ax"
++
++#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
++
++      .macro IA32_ARG_FIXUP noebp=0
++      movl    %edi,%r8d
++      .if \noebp
++      jmp     .Lia32_common
++      .else
++      movl    %ebp,%r9d
++.Lia32_common:
++      .endif
++      xchg    %ecx,%esi
++      movl    %ebx,%edi
++      movl    %edx,%edx       /* zero extension */
++      .endm 
++
++      /* clobbers %eax */     
++      .macro  CLEAR_RREGS offset=0, _r9=rax
++      xorl    %eax,%eax
++      movq    %rax,\offset+R11(%rsp)
++      movq    %rax,\offset+R10(%rsp)
++      movq    %\_r9,\offset+R9(%rsp)
++      movq    %rax,\offset+R8(%rsp)
++      .endm
++
++      /*
++       * Reload arg registers from stack in case ptrace changed them.
++       * We don't reload %eax because syscall_trace_enter() returned
++       * the %rax value we should see.  Instead, we just truncate that
++       * value to 32 bits again as we did on entry from user mode.
++       * If it's a new value set by user_regset during entry tracing,
++       * this matches the normal truncation of the user-mode value.
++       * If it's -1 to make us punt the syscall, then (u32)-1 is still
++       * an appropriately invalid value.
++       */
++      .macro LOAD_ARGS32 offset, _r9=0
++      .if \_r9
++      movl \offset+16(%rsp),%r9d
++      .endif
++      movl \offset+40(%rsp),%ecx
++      movl \offset+48(%rsp),%edx
++      movl \offset+56(%rsp),%esi
++      movl \offset+64(%rsp),%edi
++      movl %eax,%eax                  /* zero extension */
++      .endm
++
++      .macro CFI_STARTPROC32 simple
++      CFI_STARTPROC   \simple
++      CFI_UNDEFINED   r8
++      CFI_UNDEFINED   r9
++      CFI_UNDEFINED   r10
++      CFI_UNDEFINED   r11
++      CFI_UNDEFINED   r12
++      CFI_UNDEFINED   r13
++      CFI_UNDEFINED   r14
++      CFI_UNDEFINED   r15
++      .endm
++
++#ifdef CONFIG_PARAVIRT
++ENTRY(native_usergs_sysret32)
++      swapgs
++      sysretl
++ENDPROC(native_usergs_sysret32)
++
++ENTRY(native_irq_enable_sysexit)
++      swapgs
++      sti
++      sysexit
++ENDPROC(native_irq_enable_sysexit)
++#endif
++
++/*
++ * 32bit SYSENTER instruction entry.
++ *
++ * Arguments:
++ * %eax       System call number.
++ * %ebx Arg1
++ * %ecx Arg2
++ * %edx Arg3
++ * %esi Arg4
++ * %edi Arg5
++ * %ebp user stack
++ * 0(%ebp) Arg6       
++ *    
++ * Interrupts on.
++ *    
++ * This is purely a fast path. For anything complicated we use the int 0x80
++ * path below.        Set up a complete hardware stack frame to share code
++ * with the int 0x80 path.
++ */   
++ENTRY(ia32_sysenter_target)
++      CFI_STARTPROC32 simple
++      CFI_SIGNAL_FRAME
++      CFI_DEF_CFA     rsp,SS+8-RIP+16
++      /*CFI_REL_OFFSET        ss,SS-RIP+16*/
++      CFI_REL_OFFSET  rsp,RSP-RIP+16
++      /*CFI_REL_OFFSET        rflags,EFLAGS-RIP+16*/
++      /*CFI_REL_OFFSET        cs,CS-RIP+16*/
++      CFI_REL_OFFSET  rip,RIP-RIP+16
++      CFI_REL_OFFSET  r11,8
++      CFI_REL_OFFSET  rcx,0
++      movq    8(%rsp),%r11
++      CFI_RESTORE     r11
++      popq_cfi %rcx
++      CFI_RESTORE     rcx
++      movl    %ebp,%ebp               /* zero extension */
++      movl    %eax,%eax
++      movl    48-THREAD_SIZE+TI_sysenter_return(%rsp),%r10d
++      movl    $__USER32_DS,40(%rsp)
++      movq    %rbp,32(%rsp)
++      movl    $__USER32_CS,16(%rsp)
++      movq    %r10,8(%rsp)
++      movq    %rax,(%rsp)
++      cld
++      SAVE_ARGS 0,0,1
++      /* no need to do an access_ok check here because rbp has been
++         32bit zero extended */ 
++1:    movl    (%rbp),%ebp
++      .section __ex_table,"a"
++      .quad 1b,ia32_badarg
++      .previous       
++      GET_THREAD_INFO(%r10)
++      orl    $TS_COMPAT,TI_status(%r10)
++      testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
++      jnz  sysenter_tracesys
++      jmp .Lia32_check_call
++
++#ifdef CONFIG_AUDITSYSCALL
++      .macro auditsys_entry_common
++      movl %esi,%r9d                  /* 6th arg: 4th syscall arg */
++      movl %edx,%r8d                  /* 5th arg: 3rd syscall arg */
++      /* (already in %ecx)               4th arg: 2nd syscall arg */
++      movl %ebx,%edx                  /* 3rd arg: 1st syscall arg */
++      movl %eax,%esi                  /* 2nd arg: syscall number */
++      movl $AUDIT_ARCH_I386,%edi      /* 1st arg: audit arch */
++      call audit_syscall_entry
++      movl RAX-ARGOFFSET(%rsp),%eax   /* reload syscall number */
++      cmpq $(IA32_NR_syscalls-1),%rax
++      ja ia32_badsys
++      movl %ebx,%edi                  /* reload 1st syscall arg */
++      movl RCX-ARGOFFSET(%rsp),%esi   /* reload 2nd syscall arg */
++      movl RDX-ARGOFFSET(%rsp),%edx   /* reload 3rd syscall arg */
++      movl RSI-ARGOFFSET(%rsp),%ecx   /* reload 4th syscall arg */
++      movl RDI-ARGOFFSET(%rsp),%r8d   /* reload 5th syscall arg */
++      .endm
++
++sysenter_auditsys:
++      auditsys_entry_common
++      movl %ebp,%r9d                  /* reload 6th syscall arg */
++      jmp .Lia32_dispatch
++#endif
++      CFI_ENDPROC
++ENDPROC(ia32_sysenter_target)
++
++/*
++ * 32bit SYSCALL instruction entry.
++ *
++ * Arguments:
++ * %eax       System call number.
++ * %ebx Arg1
++ * %ecx return EIP 
++ * %edx Arg3
++ * %esi Arg4
++ * %edi Arg5
++ * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
++ * %esp user stack 
++ * 0(%esp) Arg6
++ *    
++ * Interrupts on.
++ *    
++ * This is purely a fast path. For anything complicated we use the int 0x80
++ * path below.        Set up a complete hardware stack frame to share code
++ * with the int 0x80 path.    
++ */   
++ENTRY(ia32_cstar_target)
++      CFI_STARTPROC32 simple
++      CFI_SIGNAL_FRAME
++      CFI_DEF_CFA     rsp,SS+8-RIP+16
++      /*CFI_REL_OFFSET        ss,SS-RIP+16*/
++      CFI_REL_OFFSET  rsp,RSP-RIP+16
++      /*CFI_REL_OFFSET        rflags,EFLAGS-RIP+16*/
++      /*CFI_REL_OFFSET        cs,CS-RIP+16*/
++      CFI_REL_OFFSET  rip,RIP-RIP+16
++      movl    %eax,%eax       /* zero extension */
++      movl    RSP-RIP+16(%rsp),%r8d
++      SAVE_ARGS -8,1,1
++      movq    %rax,ORIG_RAX-ARGOFFSET(%rsp)
++      movq    %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
++      movl    %ebp,%ecx
++      movl    $__USER32_CS,CS-ARGOFFSET(%rsp)
++      movl    $__USER32_DS,SS-ARGOFFSET(%rsp)
++      /* no need to do an access_ok check here because r8 has been
++         32bit zero extended */ 
++      /* hardware stack frame is complete now */      
++1:    movl    (%r8),%r9d
++      .section __ex_table,"a"
++      .quad 1b,ia32_badarg
++      .previous       
++      GET_THREAD_INFO(%r10)
++      orl   $TS_COMPAT,TI_status(%r10)
++      testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
++      jnz   cstar_tracesys
++      cmpq $IA32_NR_syscalls-1,%rax
++      ja  ia32_badsys
++cstar_do_call:
++      IA32_ARG_FIXUP 1
++      
++#ifdef CONFIG_AUDITSYSCALL
++cstar_auditsys:
++      movl %r9d,R9-ARGOFFSET(%rsp)    /* register to be clobbered by call */
++      auditsys_entry_common
++      movl R9-ARGOFFSET(%rsp),%r9d    /* reload 6th syscall arg */
++      jmp .Lia32_dispatch
++#endif
++
++cstar_tracesys:
++#ifdef CONFIG_AUDITSYSCALL
++      testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
++      jz cstar_auditsys
++#endif
++      xchgl %r9d,%ebp
++      SAVE_REST
++      CLEAR_RREGS 0, r9
++      movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
++      movq %rsp,%rdi        /* &pt_regs -> arg1 */
++      call syscall_trace_enter
++      LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace changed it */
++      RESTORE_REST
++      xchgl %ebp,%r9d
++      cmpq $(IA32_NR_syscalls-1),%rax
++      ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
++      jmp cstar_do_call
++END(ia32_cstar_target)
++                              
++ia32_badarg:
++      movq $-EFAULT,%rax
++      jmp ia32_sysret
++      CFI_ENDPROC
++
++/* 
++ * Emulated IA32 system calls via int 0x80. 
++ *
++ * Arguments:  
++ * %eax       System call number.
++ * %ebx Arg1
++ * %ecx Arg2
++ * %edx Arg3
++ * %esi Arg4
++ * %edi Arg5
++ * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
++ *
++ * Notes:
++ * Uses the same stack frame as the x86-64 version.   
++ * All registers except %eax must be saved (but ptrace may violate that)
++ * Arguments are zero extended. For system calls that want sign extension and
++ * take long arguments a wrapper is needed. Most calls can just be called
++ * directly.
++ * Assumes it is only called from user space and entered with interrupts on.
++ */                           
++
++ENTRY(ia32_syscall)
++      CFI_STARTPROC32 simple
++      CFI_SIGNAL_FRAME
++      CFI_DEF_CFA     rsp,SS+8-RIP+16
++      /*CFI_REL_OFFSET        ss,SS-RIP+16*/
++      CFI_REL_OFFSET  rsp,RSP-RIP+16
++      /*CFI_REL_OFFSET        rflags,EFLAGS-RIP+16*/
++      /*CFI_REL_OFFSET        cs,CS-RIP+16*/
++      CFI_REL_OFFSET  rip,RIP-RIP+16
++      CFI_REL_OFFSET  r11,8
++      CFI_REL_OFFSET  rcx,0
++      movq 8(%rsp),%r11
++      CFI_RESTORE     r11
++      popq_cfi %rcx
++      CFI_RESTORE     rcx
++      movl %eax,%eax
++      movq %rax,(%rsp)
++      cld
++      /* note the registers are not zero extended to the stack frame.
++         this could be a problem. */
++      SAVE_ARGS 0,0,1
++      GET_THREAD_INFO(%r10)
++      orl   $TS_COMPAT,TI_status(%r10)
++      testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
++      jnz ia32_tracesys
++.Lia32_check_call:
++      cmpq $(IA32_NR_syscalls-1),%rax
++      ja ia32_badsys
++ia32_do_call:
++      IA32_ARG_FIXUP
++.Lia32_dispatch:
++      call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
++ia32_sysret:
++      movq %rax,RAX-ARGOFFSET(%rsp)
++      CLEAR_RREGS -ARGOFFSET
++      jmp int_ret_from_sys_call 
++
++sysenter_tracesys:
++#ifdef CONFIG_AUDITSYSCALL
++      testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
++      jz sysenter_auditsys
++#endif
++ia32_tracesys:                         
++      SAVE_REST
++      CLEAR_RREGS
++      movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
++      movq %rsp,%rdi        /* &pt_regs -> arg1 */
++      call syscall_trace_enter
++      LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
++      RESTORE_REST
++      cmpq $(IA32_NR_syscalls-1),%rax
++      ja  int_ret_from_sys_call       /* ia32_tracesys has set RAX(%rsp) */
++      jmp ia32_do_call
++END(ia32_syscall)
++
++ia32_badsys:
++      movq $0,ORIG_RAX-ARGOFFSET(%rsp)
++      movq $-ENOSYS,%rax
++      jmp ia32_sysret
++
++quiet_ni_syscall:
++      movq $-ENOSYS,%rax
++      ret
++      CFI_ENDPROC
++      
++      .macro PTREGSCALL label, func, arg
++      .globl \label
++\label:
++      leaq \func(%rip),%rax
++      leaq -ARGOFFSET+8(%rsp),\arg    /* 8 for return address */
++      jmp  ia32_ptregs_common 
++      .endm
++
++      CFI_STARTPROC32
++
++      PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
++      PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
++      PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
++      PTREGSCALL stub32_execve, sys32_execve, %rcx
++      PTREGSCALL stub32_fork, sys_fork, %rdi
++      PTREGSCALL stub32_clone, sys32_clone, %rdx
++      PTREGSCALL stub32_vfork, sys_vfork, %rdi
++      PTREGSCALL stub32_iopl, sys_iopl, %rsi
++
++ENTRY(ia32_ptregs_common)
++      popq %r11
++      CFI_ENDPROC
++      CFI_STARTPROC32 simple
++      CFI_SIGNAL_FRAME
++      CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
++      CFI_REL_OFFSET  rax,RAX-ARGOFFSET
++      CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
++      CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
++      CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
++      CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
++      CFI_REL_OFFSET  rip,RIP-ARGOFFSET
++/*    CFI_REL_OFFSET  cs,CS-ARGOFFSET*/
++/*    CFI_REL_OFFSET  rflags,EFLAGS-ARGOFFSET*/
++      CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
++/*    CFI_REL_OFFSET  ss,SS-ARGOFFSET*/
++      SAVE_REST
++      call *%rax
++      RESTORE_REST
++      jmp  ia32_sysret        /* misbalances the return cache */
++      CFI_ENDPROC
++END(ia32_ptregs_common)
++
++      .section .rodata,"a"    /* the table is placed in read-only data */
++      .align 8                /* 8-byte alignment, matching the .quad entries */
++ia32_sys_call_table:            /* one .quad handler address per ia32 syscall number, starting at 0; quiet_ni_syscall fills retired slots */
++      .quad sys_restart_syscall       /* 0 */
++      .quad sys_exit
++      .quad stub32_fork
++      .quad sys_read
++      .quad sys_write
++      .quad compat_sys_open           /* 5 */
++      .quad sys_close
++      .quad sys32_waitpid
++      .quad sys_creat
++      .quad sys_link
++      .quad sys_unlink                /* 10 */
++      .quad stub32_execve
++      .quad sys_chdir
++      .quad compat_sys_time
++      .quad sys_mknod
++      .quad sys_chmod         /* 15 */
++      .quad sys_lchown16
++      .quad quiet_ni_syscall                  /* old break syscall holder */
++      .quad sys_stat
++      .quad sys32_lseek
++      .quad sys_getpid                /* 20 */
++      .quad compat_sys_mount  /* mount  */
++      .quad sys_oldumount     /* old_umount  */
++      .quad sys_setuid16
++      .quad sys_getuid16
++      .quad compat_sys_stime  /* stime */             /* 25 */
++      .quad compat_sys_ptrace /* ptrace */
++      .quad sys_alarm
++      .quad sys_fstat /* (old)fstat */
++      .quad sys_pause
++      .quad compat_sys_utime  /* 30 */
++      .quad quiet_ni_syscall  /* old stty syscall holder */
++      .quad quiet_ni_syscall  /* old gtty syscall holder */
++      .quad sys_access
++      .quad sys_nice  
++      .quad quiet_ni_syscall  /* 35 */        /* old ftime syscall holder */
++      .quad sys_sync
++      .quad sys32_kill
++      .quad sys_rename
++      .quad sys_mkdir
++      .quad sys_rmdir         /* 40 */
++      .quad sys_dup
++      .quad sys_pipe
++      .quad compat_sys_times
++      .quad quiet_ni_syscall                  /* old prof syscall holder */
++      .quad sys_brk           /* 45 */
++      .quad sys_setgid16
++      .quad sys_getgid16
++      .quad sys_signal
++      .quad sys_geteuid16
++      .quad sys_getegid16     /* 50 */
++      .quad sys_acct
++      .quad sys_umount                        /* new_umount */
++      .quad quiet_ni_syscall                  /* old lock syscall holder */
++      .quad compat_sys_ioctl
++      .quad compat_sys_fcntl64                /* 55 */
++      .quad quiet_ni_syscall                  /* old mpx syscall holder */
++      .quad sys_setpgid
++      .quad quiet_ni_syscall                  /* old ulimit syscall holder */
++      .quad sys_olduname
++      .quad sys_umask         /* 60 */
++      .quad sys_chroot
++      .quad compat_sys_ustat
++      .quad sys_dup2
++      .quad sys_getppid
++      .quad sys_getpgrp               /* 65 */
++      .quad sys_setsid
++      .quad sys32_sigaction
++      .quad sys_sgetmask
++      .quad sys_ssetmask
++      .quad sys_setreuid16    /* 70 */
++      .quad sys_setregid16
++      .quad sys32_sigsuspend
++      .quad compat_sys_sigpending
++      .quad sys_sethostname
++      .quad compat_sys_setrlimit      /* 75 */
++      .quad compat_sys_old_getrlimit  /* old_getrlimit */
++      .quad compat_sys_getrusage
++      .quad compat_sys_gettimeofday
++      .quad compat_sys_settimeofday
++      .quad sys_getgroups16   /* 80 */
++      .quad sys_setgroups16
++      .quad compat_sys_old_select
++      .quad sys_symlink
++      .quad sys_lstat
++      .quad sys_readlink              /* 85 */
++      .quad sys_uselib
++      .quad sys_swapon
++      .quad sys_reboot
++      .quad compat_sys_old_readdir
++      .quad sys32_mmap                /* 90 */
++      .quad sys_munmap
++      .quad sys_truncate
++      .quad sys_ftruncate
++      .quad sys_fchmod
++      .quad sys_fchown16              /* 95 */
++      .quad sys_getpriority
++      .quad sys_setpriority
++      .quad quiet_ni_syscall                  /* old profil syscall holder */
++      .quad compat_sys_statfs
++      .quad compat_sys_fstatfs                /* 100 */
++      .quad sys_ioperm
++      .quad compat_sys_socketcall
++      .quad sys_syslog
++      .quad compat_sys_setitimer
++      .quad compat_sys_getitimer      /* 105 */
++      .quad compat_sys_newstat
++      .quad compat_sys_newlstat
++      .quad compat_sys_newfstat
++      .quad sys_uname
++      .quad stub32_iopl               /* 110 */
++      .quad sys_vhangup
++      .quad quiet_ni_syscall  /* old "idle" system call */
++      .quad sys32_vm86_warning        /* vm86old */ 
++      .quad compat_sys_wait4
++      .quad sys_swapoff               /* 115 */
++      .quad compat_sys_sysinfo
++      .quad sys32_ipc
++      .quad sys_fsync
++      .quad stub32_sigreturn
++      .quad stub32_clone              /* 120 */
++      .quad sys_setdomainname
++      .quad sys_newuname
++      .quad sys_modify_ldt
++      .quad compat_sys_adjtimex
++      .quad sys32_mprotect            /* 125 */
++      .quad compat_sys_sigprocmask
++      .quad quiet_ni_syscall          /* create_module */
++      .quad sys_init_module
++      .quad sys_delete_module
++      .quad quiet_ni_syscall          /* 130  get_kernel_syms */
++      .quad sys32_quotactl
++      .quad sys_getpgid
++      .quad sys_fchdir
++      .quad quiet_ni_syscall  /* bdflush */
++      .quad sys_sysfs         /* 135 */
++      .quad sys_personality
++      .quad quiet_ni_syscall  /* for afs_syscall */
++      .quad sys_setfsuid16
++      .quad sys_setfsgid16
++      .quad sys_llseek                /* 140 */
++      .quad compat_sys_getdents
++      .quad compat_sys_select
++      .quad sys_flock
++      .quad sys_msync
++      .quad compat_sys_readv          /* 145 */
++      .quad compat_sys_writev
++      .quad sys_getsid
++      .quad sys_fdatasync
++      .quad compat_sys_sysctl /* sysctl */
++      .quad sys_mlock         /* 150 */
++      .quad sys_munlock
++      .quad sys_mlockall
++      .quad sys_munlockall
++      .quad sys_sched_setparam
++      .quad sys_sched_getparam   /* 155 */
++      .quad sys_sched_setscheduler
++      .quad sys_sched_getscheduler
++      .quad sys_sched_yield
++      .quad sys_sched_get_priority_max
++      .quad sys_sched_get_priority_min  /* 160 */
++      .quad sys32_sched_rr_get_interval
++      .quad compat_sys_nanosleep
++      .quad sys_mremap
++      .quad sys_setresuid16
++      .quad sys_getresuid16   /* 165 */
++      .quad sys32_vm86_warning        /* vm86 */ 
++      .quad quiet_ni_syscall  /* query_module */
++      .quad sys_poll
++      .quad compat_sys_nfsservctl
++      .quad sys_setresgid16   /* 170 */
++      .quad sys_getresgid16
++      .quad sys_prctl
++      .quad stub32_rt_sigreturn
++      .quad sys32_rt_sigaction
++      .quad sys32_rt_sigprocmask      /* 175 */
++      .quad sys32_rt_sigpending
++      .quad compat_sys_rt_sigtimedwait
++      .quad sys32_rt_sigqueueinfo
++      .quad sys_rt_sigsuspend
++      .quad sys32_pread               /* 180 */
++      .quad sys32_pwrite
++      .quad sys_chown16
++      .quad sys_getcwd
++      .quad sys_capget
++      .quad sys_capset                /* 185 */
++      .quad stub32_sigaltstack
++      .quad sys32_sendfile
++      .quad quiet_ni_syscall          /* streams1 */
++      .quad quiet_ni_syscall          /* streams2 */
++      .quad stub32_vfork            /* 190 */
++      .quad compat_sys_getrlimit
++      .quad sys_mmap_pgoff
++      .quad sys32_truncate64
++      .quad sys32_ftruncate64
++      .quad sys32_stat64              /* 195 */
++      .quad sys32_lstat64
++      .quad sys32_fstat64
++      .quad sys_lchown
++      .quad sys_getuid
++      .quad sys_getgid                /* 200 */
++      .quad sys_geteuid
++      .quad sys_getegid
++      .quad sys_setreuid
++      .quad sys_setregid
++      .quad sys_getgroups     /* 205 */
++      .quad sys_setgroups
++      .quad sys_fchown
++      .quad sys_setresuid
++      .quad sys_getresuid
++      .quad sys_setresgid     /* 210 */
++      .quad sys_getresgid
++      .quad sys_chown
++      .quad sys_setuid
++      .quad sys_setgid
++      .quad sys_setfsuid              /* 215 */
++      .quad sys_setfsgid
++      .quad sys_pivot_root
++      .quad sys_mincore
++      .quad sys_madvise
++      .quad compat_sys_getdents64     /* 220 getdents64 */
++      .quad compat_sys_fcntl64        
++      .quad quiet_ni_syscall          /* tux */
++      .quad quiet_ni_syscall          /* security */
++      .quad sys_gettid        
++      .quad sys32_readahead   /* 225 */
++      .quad sys_setxattr
++      .quad sys_lsetxattr
++      .quad sys_fsetxattr
++      .quad sys_getxattr
++      .quad sys_lgetxattr     /* 230 */
++      .quad sys_fgetxattr
++      .quad sys_listxattr
++      .quad sys_llistxattr
++      .quad sys_flistxattr
++      .quad sys_removexattr   /* 235 */
++      .quad sys_lremovexattr
++      .quad sys_fremovexattr
++      .quad sys_tkill
++      .quad sys_sendfile64 
++      .quad compat_sys_futex          /* 240 */
++      .quad compat_sys_sched_setaffinity
++      .quad compat_sys_sched_getaffinity
++      .quad sys_set_thread_area
++      .quad sys_get_thread_area
++      .quad compat_sys_io_setup       /* 245 */
++      .quad sys_io_destroy
++      .quad compat_sys_io_getevents
++      .quad compat_sys_io_submit
++      .quad sys_io_cancel
++      .quad sys32_fadvise64           /* 250 */
++      .quad quiet_ni_syscall  /* free_huge_pages */
++      .quad sys_exit_group
++      .quad sys32_lookup_dcookie
++      .quad sys_epoll_create
++      .quad sys_epoll_ctl             /* 255 */
++      .quad sys_epoll_wait
++      .quad sys_remap_file_pages
++      .quad sys_set_tid_address
++      .quad compat_sys_timer_create
++      .quad compat_sys_timer_settime  /* 260 */
++      .quad compat_sys_timer_gettime
++      .quad sys_timer_getoverrun
++      .quad sys_timer_delete
++      .quad compat_sys_clock_settime
++      .quad compat_sys_clock_gettime  /* 265 */
++      .quad compat_sys_clock_getres
++      .quad compat_sys_clock_nanosleep
++      .quad compat_sys_statfs64
++      .quad compat_sys_fstatfs64
++      .quad sys_tgkill                /* 270 */
++      .quad compat_sys_utimes
++      .quad sys32_fadvise64_64
++      .quad quiet_ni_syscall  /* sys_vserver */
++      .quad sys_mbind
++      .quad compat_sys_get_mempolicy  /* 275 */
++      .quad sys_set_mempolicy
++      .quad compat_sys_mq_open
++      .quad sys_mq_unlink
++      .quad compat_sys_mq_timedsend
++      .quad compat_sys_mq_timedreceive        /* 280 */
++      .quad compat_sys_mq_notify
++      .quad compat_sys_mq_getsetattr
++      .quad compat_sys_kexec_load     /* reserved for kexec */
++      .quad compat_sys_waitid
++      .quad quiet_ni_syscall          /* 285: sys_altroot */
++      .quad sys_add_key
++      .quad sys_request_key
++      .quad sys_keyctl
++      .quad sys_ioprio_set
++      .quad sys_ioprio_get            /* 290 */
++      .quad sys_inotify_init
++      .quad sys_inotify_add_watch
++      .quad sys_inotify_rm_watch
++      .quad sys_migrate_pages
++      .quad compat_sys_openat         /* 295 */
++      .quad sys_mkdirat
++      .quad sys_mknodat
++      .quad sys_fchownat
++      .quad compat_sys_futimesat
++      .quad sys32_fstatat             /* 300 */
++      .quad sys_unlinkat
++      .quad sys_renameat
++      .quad sys_linkat
++      .quad sys_symlinkat
++      .quad sys_readlinkat            /* 305 */
++      .quad sys_fchmodat
++      .quad sys_faccessat
++      .quad compat_sys_pselect6
++      .quad compat_sys_ppoll
++      .quad sys_unshare               /* 310 */
++      .quad compat_sys_set_robust_list
++      .quad compat_sys_get_robust_list
++      .quad sys_splice
++      .quad sys32_sync_file_range
++      .quad sys_tee                   /* 315 */
++      .quad compat_sys_vmsplice
++      .quad compat_sys_move_pages
++      .quad sys_getcpu
++      .quad sys_epoll_pwait
++      .quad compat_sys_utimensat      /* 320 */
++      .quad compat_sys_signalfd
++      .quad sys_timerfd_create
++      .quad sys_eventfd
++      .quad sys32_fallocate
++      .quad compat_sys_timerfd_settime        /* 325 */
++      .quad compat_sys_timerfd_gettime
++      .quad compat_sys_signalfd4
++      .quad sys_eventfd2
++      .quad sys_epoll_create1
++      .quad sys_dup3                          /* 330 */
++      .quad sys_pipe2
++      .quad sys_inotify_init1
++      .quad compat_sys_preadv
++      .quad compat_sys_pwritev
++      .quad compat_sys_rt_tgsigqueueinfo      /* 335 */
++      .quad sys_perf_event_open
++      .quad compat_sys_recvmmsg
++      .quad sys_fanotify_init
++      .quad sys32_fanotify_mark
++      .quad sys_prlimit64             /* 340 */
++      .quad sys_name_to_handle_at
++      .quad compat_sys_open_by_handle_at
++      .quad compat_sys_clock_adjtime
++      .quad sys_syncfs
++ia32_syscall_end:               /* label placed directly after the last entry (344, sys_syncfs) */
diff --cc arch/x86/include/asm/acpi.h

index 610001d,12e0e7d..6de2979
--- 1/arch/x86/include/asm/acpi.h
--- 2/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@@ -31,6 -31,6 +31,10 @@@
   #include <asm/mpspec.h>
   #include <asm/trampoline.h>
   
++#ifdef CONFIG_XEN
++#include <xen/interface/platform.h>
++#endif
++
   #define COMPILER_DEPENDENT_INT64   long long
   #define COMPILER_DEPENDENT_UINT64  unsigned long long
   
@@@ -115,7 -115,7 +119,11 @@@ static inline void acpi_disable_pci(voi
   }
   
   /* Low-level suspend routine. */
++#ifdef CONFIG_ACPI_PV_SLEEP
++#define acpi_suspend_lowlevel() acpi_enter_sleep_state(ACPI_STATE_S3)
++#else
   extern int acpi_suspend_lowlevel(void);
++#endif
   
   extern const unsigned char acpi_wakeup_code[];
   #define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
@@@ -123,11 -123,11 +131,33 @@@
   /* early initialization routine */
   extern void acpi_reserve_wakeup_memory(void);
   
++#ifdef CONFIG_XEN
++static inline int acpi_notify_hypervisor_state(u8 sleep_state,
++                                             u32 pm1a_cnt_val,
++                                             u32 pm1b_cnt_val)
++{     /* Packs the three arguments into a XENPF_enter_acpi_sleep platform op and submits it. */
++      struct xen_platform_op op = {
++              .cmd = XENPF_enter_acpi_sleep,
++              .interface_version = XENPF_INTERFACE_VERSION,
++              .u = {
++                      .enter_acpi_sleep = {   /* arguments are copied through unmodified */
++                              .pm1a_cnt_val = pm1a_cnt_val,
++                              .pm1b_cnt_val = pm1b_cnt_val,
++                              .sleep_state = sleep_state,
++                      },
++              },
++      };
++
++      return HYPERVISOR_platform_op(&op);     /* result of HYPERVISOR_platform_op() is returned to the caller unchanged */
++}
++#endif /* CONFIG_XEN */
++
   /*
    * Check if the CPU can handle C2 and deeper
    */
   static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
   {
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         /*
          * Early models (<=5) of AMD Opterons are not supposed to go into
          * C2 state.
@@@ -139,9 -139,9 +169,10 @@@
             boot_cpu_data.x86_model <= 0x05 &&
             boot_cpu_data.x86_mask < 0x0A)
                 return 1;
-       else if (amd_e400_c1e_detected)
+       else if (c1e_detected)
                 return 1;
         else
++#endif
                 return max_cstate;
   }
   
@@@ -181,7 -181,9 +212,11 @@@ static inline void disable_acpi(void) 
   
   #endif /* !CONFIG_ACPI */
   
++#ifndef CONFIG_XEN
   #define ARCH_HAS_POWER_INIT   1
++#endif
+ 
+ struct bootnode;
   
   #ifdef CONFIG_ACPI_NUMA
   extern int acpi_numa;
diff --cc arch/x86/include/asm/agp.h

index eec2a70,eec2a70..91e72c0
--- 1/arch/x86/include/asm/agp.h
--- 2/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@@ -15,6 -15,6 +15,9 @@@
   #define map_page_into_agp(page) set_pages_uc(page, 1)
   #define unmap_page_from_agp(page) set_pages_wb(page, 1)
   
++#define map_pages_into_agp set_pages_array_uc
++#define unmap_pages_from_agp set_pages_array_wb
++
   /*
    * Could use CLFLUSH here if the cpu supports it. But then it would
    * need to be called for each cacheline of the whole page so it may
diff --cc arch/x86/include/asm/apic.h

index 4a0b7c7,2b7d573..248cb8f
--- 1/arch/x86/include/asm/apic.h
--- 2/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@@ -9,12 -9,12 +9,16 @@@
   #include <asm/processor.h>
   #include <asm/apicdef.h>
   #include <asm/atomic.h>
++#ifndef CONFIG_XEN
   #include <asm/fixmap.h>
++#endif
   #include <asm/mpspec.h>
   #include <asm/system.h>
   #include <asm/msr.h>
   
++#ifndef CONFIG_XEN
   #define ARCH_APICTIMER_STOPS_ON_C3    1
++#endif
   
   /*
    * Debugging macros
@@@ -46,6 -46,6 +50,7 @@@ static inline void generic_apic_probe(v
   #ifdef CONFIG_X86_LOCAL_APIC
   
   extern unsigned int apic_verbosity;
++#ifndef CONFIG_XEN
   extern int local_apic_timer_c2_ok;
   
   extern int disable_apic;
@@@ -118,6 -118,6 +123,8 @@@ extern u64 native_apic_icr_read(void)
   
   extern int x2apic_mode;
   
++#endif /* CONFIG_XEN */
++
   #ifdef CONFIG_X86_X2APIC
   /*
    * Make previous memory operations globally visible before
@@@ -232,7 -232,7 +239,11 @@@ extern void setup_local_APIC(void)
   extern void end_local_APIC_setup(void);
   extern void bsp_end_local_APIC_setup(void);
   extern void init_apic_mappings(void);
++#ifndef CONFIG_XEN
   void register_lapic_address(unsigned long address);
++#else
++#define register_lapic_address(address)
++#endif
   extern void setup_boot_APIC_clock(void);
   extern void setup_secondary_APIC_clock(void);
   extern int APIC_init_uniprocessor(void);
@@@ -280,15 -280,15 +291,18 @@@ static inline void disable_local_APIC(v
   struct apic {
         char *name;
   
++#ifndef CONFIG_XEN
         int (*probe)(void);
         int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
         int (*apic_id_registered)(void);
++#endif
   
         u32 irq_delivery_mode;
         u32 irq_dest_mode;
   
         const struct cpumask *(*target_cpus)(void);
   
++#ifndef CONFIG_XEN
         int disable_esr;
   
         int dest_logical;
@@@ -307,8 -307,8 +321,10 @@@
         void (*setup_portio_remap)(void);
         int (*check_phys_apicid_present)(int phys_apicid);
         void (*enable_apic_mode)(void);
++#endif
         int (*phys_pkg_id)(int cpuid_apic, int index_msb);
   
++#ifndef CONFIG_XEN
         /*
          * When one of the next two hooks returns 1 the apic
          * is switched to this. Essentially they are additional
@@@ -323,6 -323,6 +339,7 @@@
         unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
         unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
                                                const struct cpumask *andmask);
++#endif
   
         /* ipi */
         void (*send_IPI_mask)(const struct cpumask *mask, int vector);
@@@ -332,6 -332,6 +349,7 @@@
         void (*send_IPI_all)(int vector);
         void (*send_IPI_self)(int vector);
   
++#ifndef CONFIG_XEN
         /* wakeup_secondary_cpu */
         int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
   
@@@ -363,14 -363,9 +381,10 @@@
          */
         int (*x86_32_early_logical_apicid)(int cpu);
   
-       /*
-        * Optional method called from setup_local_APIC() after logical
-        * apicid is guaranteed to be known to initialize apicid -> node
-        * mapping if NUMA initialization hasn't done so already.  Don't
-        * add new users.
-        */
+       /* determine CPU -> NUMA node mapping */
         int (*x86_32_numa_cpu_node)(int cpu);
   #endif
++#endif /* CONFIG_XEN */
   };
   
   /*
@@@ -380,26 -375,6 +394,8 @@@
    */
   extern struct apic *apic;
   
- /*
-  * APIC drivers are probed based on how they are listed in the .apicdrivers
-  * section. So the order is important and enforced by the ordering
-  * of different apic driver files in the Makefile.
-  *
-  * For the files having two apic drivers, we use apic_drivers()
-  * to enforce the order with in them.
-  */
- #define apic_driver(sym)                                      \
-       static struct apic *__apicdrivers_##sym __used          \
-       __aligned(sizeof(struct apic *))                        \
-       __section(.apicdrivers) = { &sym }
- 
- #define apic_drivers(sym1, sym2)                                      \
-       static struct apic *__apicdrivers_##sym1##sym2[2] __used        \
-       __aligned(sizeof(struct apic *))                                \
-       __section(.apicdrivers) = { &sym1, &sym2 }
- 
- extern struct apic *__apicdrivers[], *__apicdrivers_end[];
++#ifndef CONFIG_XEN
+ +
   /*
    * APIC functionality to boot other CPUs - only used on SMP:
    */
@@@ -495,8 -475,8 +496,9 @@@ static inline void default_wait_for_ini
         return;
   }
   
- extern struct apic *generic_bigsmp_probe(void);
+ extern void generic_bigsmp_probe(void);
   
++#endif /* CONFIG_XEN */
   
   #ifdef CONFIG_X86_LOCAL_APIC
   
@@@ -513,6 -493,6 +515,8 @@@ static inline const struct cpumask *def
   #endif
   }
   
++#ifndef CONFIG_XEN
++
   DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
   
   
@@@ -619,6 -603,6 +627,8 @@@ extern int default_cpu_present_to_apici
   extern int default_check_phys_apicid_present(int phys_apicid);
   #endif
   
++#endif /* CONFIG_XEN */
++
   #endif /* CONFIG_X86_LOCAL_APIC */
   
   #endif /* _ASM_X86_APIC_H */
diff --cc arch/x86/include/asm/apicdef.h

index 34595d5,34595d5..5bbc175
--- 1/arch/x86/include/asm/apicdef.h
--- 2/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@@ -17,6 -17,6 +17,8 @@@
    */
   #define IO_APIC_SLOT_SIZE             1024
   
++#ifndef CONFIG_XEN
++
   #define       APIC_ID         0x20
   
   #define       APIC_LVR        0x30
@@@ -144,6 -144,6 +146,16 @@@
   #define APIC_BASE_MSR 0x800
   #define X2APIC_ENABLE (1UL << 10)
   
++#else /* CONFIG_XEN */
++
++enum {        /* only defined in the CONFIG_XEN branch of the surrounding #ifndef/#else */
++      APIC_DEST_ALLBUT = 0x1,
++      APIC_DEST_SELF,         /* 0x2, implicit: previous enumerator + 1 */
++      APIC_DEST_ALLINC        /* 0x3, implicit: previous enumerator + 1 */
++};
++
++#endif /* CONFIG_XEN */
++
   #ifdef CONFIG_X86_32
   # define MAX_IO_APICS 64
   # define MAX_LOCAL_APIC 256
@@@ -152,6 -152,6 +164,8 @@@
   # define MAX_LOCAL_APIC 32768
   #endif
   
++#ifndef CONFIG_XEN
++
   /*
    * All x86-64 systems are xAPIC compatible.
    * In the following, "apicid" is a physical APIC ID.
@@@ -422,6 -422,6 +436,8 @@@ struct local_apic 
   
   #undef u32
   
++#endif /* CONFIG_XEN */
++
   #ifdef CONFIG_X86_32
    #define BAD_APICID 0xFFu
   #else
diff --cc arch/x86/include/asm/boot.h

index 5e1a2ee,5e1a2ee..2d2275a
--- 1/arch/x86/include/asm/boot.h
--- 2/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@@ -16,7 -16,7 +16,7 @@@
                                 & ~(CONFIG_PHYSICAL_ALIGN - 1))
   
   /* Minimum kernel alignment, as a power of two */
--#ifdef CONFIG_X86_64
++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
   #define MIN_KERNEL_ALIGN_LG2  PMD_SHIFT
   #else
   #define MIN_KERNEL_ALIGN_LG2  (PAGE_SHIFT + THREAD_ORDER)
diff --cc arch/x86/include/asm/cpufeature.h

index 71cc380,91f3e08..1fb6f9d
--- 1/arch/x86/include/asm/cpufeature.h
--- 2/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@@ -284,7 -277,7 +277,11 @@@ extern const char * const x86_power_fla
   #define cpu_has_xmm4_1                boot_cpu_has(X86_FEATURE_XMM4_1)
   #define cpu_has_xmm4_2                boot_cpu_has(X86_FEATURE_XMM4_2)
   #define cpu_has_x2apic                boot_cpu_has(X86_FEATURE_X2APIC)
++#ifndef CONFIG_XEN
   #define cpu_has_xsave         boot_cpu_has(X86_FEATURE_XSAVE)
++#else
++#define cpu_has_xsave         boot_cpu_has(X86_FEATURE_OSXSAVE)
++#endif
   #define cpu_has_hypervisor    boot_cpu_has(X86_FEATURE_HYPERVISOR)
   #define cpu_has_pclmulqdq     boot_cpu_has(X86_FEATURE_PCLMULQDQ)
   #define cpu_has_perfctr_core  boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
diff --cc arch/x86/include/asm/e820.h

index 908b969,908b969..86ebd6b
--- 1/arch/x86/include/asm/e820.h
--- 2/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@@ -66,7 -66,7 +66,11 @@@ struct e820map 
         struct e820entry map[E820_X_MAX];
   };
   
++#ifndef CONFIG_XEN
   #define ISA_START_ADDRESS     0xa0000
++#else
++#define ISA_START_ADDRESS     0
++#endif
   #define ISA_END_ADDRESS               0x100000
   
   #define BIOS_BEGIN            0x000a0000
diff --cc arch/x86/include/asm/hardirq.h

index 55e4de6,55e4de6..53f24fb
--- 1/arch/x86/include/asm/hardirq.h
--- 2/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@@ -18,7 -18,7 +18,11 @@@ typedef struct 
   #ifdef CONFIG_SMP
         unsigned int irq_resched_count;
         unsigned int irq_call_count;
++#ifndef CONFIG_XEN
         unsigned int irq_tlb_count;
++#else
++      unsigned int irq_lock_count;
++#endif
   #endif
   #ifdef CONFIG_X86_THERMAL_VECTOR
         unsigned int irq_thermal_count;
diff --cc arch/x86/include/asm/hw_irq.h

index bb9efe8,bb9efe8..89ef4db
--- 1/arch/x86/include/asm/hw_irq.h
--- 2/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@@ -102,6 -102,6 +102,7 @@@ static inline void set_io_apic_irq_attr
         irq_attr->polarity      = polarity;
   }
   
++#ifndef CONFIG_XEN
   struct irq_2_iommu {
         struct intel_iommu *iommu;
         u16 irte_index;
@@@ -124,6 -124,6 +125,9 @@@ struct irq_cfg 
         struct irq_2_iommu      irq_2_iommu;
   #endif
   };
++#else
++struct irq_cfg;
++#endif
   
   extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
   extern void send_cleanup_vector(struct irq_cfg *);
@@@ -160,9 -160,9 +164,15 @@@ extern void smp_invalidate_interrupt(st
   #else
   extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
   #endif
++extern void smp_irq_work_interrupt(struct pt_regs *);
++#ifdef CONFIG_XEN
++extern void smp_reboot_interrupt(struct pt_regs *);
++#endif
   #endif
   
++#ifndef CONFIG_XEN
   extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
++#endif
   
   typedef int vector_irq_t[NR_VECTORS];
   DECLARE_PER_CPU(vector_irq_t, vector_irq);
diff --cc arch/x86/include/asm/hypervisor.h

index 7a15153,7a15153..2bb1d90
--- 1/arch/x86/include/asm/hypervisor.h
--- 2/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@@ -60,3 -60,3 +60,7 @@@ static inline bool hypervisor_x2apic_av
   }
   
   #endif
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include_next <asm/hypervisor.h>
++#endif
diff --cc arch/x86/include/asm/i8253.h

index 65aaa91,fc1f579..2cf3c9a
--- 1/arch/x86/include/asm/i8253.h
--- 2/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@@ -6,14 -6,12 +6,16 @@@
   #define PIT_CH0                       0x40
   #define PIT_CH2                       0x42
   
- #define PIT_LATCH     LATCH
- 
   extern raw_spinlock_t i8253_lock;
   
++#ifdef CONFIG_GENERIC_CLOCKEVENTS
++
   extern struct clock_event_device *global_clock_event;
   
   extern void setup_pit_timer(void);
   
++#endif
++
   #define inb_pit               inb_p
   #define outb_pit      outb_p
   
diff --cc arch/x86/include/asm/i8259.h

index a203659,a203659..b9daf61
--- 1/arch/x86/include/asm/i8259.h
--- 2/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@@ -54,6 -54,6 +54,7 @@@ extern struct irq_chip i8259A_chip
   
   struct legacy_pic {
         int nr_legacy_irqs;
++#ifndef CONFIG_XEN
         struct irq_chip *chip;
         void (*mask)(unsigned int irq);
         void (*unmask)(unsigned int irq);
@@@ -61,6 -61,6 +62,7 @@@
         void (*restore_mask)(void);
         void (*init)(int auto_eoi);
         int (*irq_pending)(unsigned int irq);
++#endif
         void (*make_irq)(unsigned int irq);
   };
   
diff --cc arch/x86/include/asm/io.h

index d02804d,0722730..b4f6d16
--- 1/arch/x86/include/asm/io.h
--- 2/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@@ -333,7 -353,7 +353,7 @@@ extern void early_iounmap(void __iomem 
   extern void fixup_early_ioremap(void);
   extern bool is_early_ioremap_ptep(pte_t *ptep);
   
--#ifdef CONFIG_XEN
++#ifdef CONFIG_PARAVIRT_XEN
   struct bio_vec;
   
   extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
@@@ -342,7 -362,7 +362,7 @@@
   #define BIOVEC_PHYS_MERGEABLE(vec1, vec2)                             \
         (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&                         \
          (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
--#endif        /* CONFIG_XEN */
++#endif        /* CONFIG_PARAVIRT_XEN */
   
   #define IO_SPACE_LIMIT 0xffff
   
diff --cc arch/x86/include/asm/kexec.h

index 317ff17,317ff17..2e587bc
--- 1/arch/x86/include/asm/kexec.h
--- 2/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@@ -5,14 -5,14 +5,30 @@@
   # define PA_CONTROL_PAGE      0
   # define VA_CONTROL_PAGE      1
   # define PA_PGD                       2
++# ifndef CONFIG_XEN
   # define PA_SWAP_PAGE         3
   # define PAGES_NR             4
++# else /* CONFIG_XEN */
++/*
++ * The hypervisor interface implicitly requires that all entries (except
++ * for possibly the final one) are arranged in matching PA_/VA_ pairs.
++#  define VA_PGD              3
++ */
++#  define PA_SWAP_PAGE                4
++#  define PAGES_NR            5
++# endif /* CONFIG_XEN */
   #else
   # define PA_CONTROL_PAGE      0
   # define VA_CONTROL_PAGE      1
   # define PA_TABLE_PAGE                2
++# ifndef CONFIG_XEN
   # define PA_SWAP_PAGE         3
   # define PAGES_NR             4
++# else /* CONFIG_XEN, see comment above
++#  define VA_TABLE_PAGE               3 */
++#  define PA_SWAP_PAGE                4
++#  define PAGES_NR            5
++# endif /* CONFIG_XEN */
   #endif
   
   # define KEXEC_CONTROL_CODE_MAX_SIZE  2048
@@@ -163,6 -163,6 +179,19 @@@ struct kimage_arch 
   };
   #endif
   
++/* Under Xen we need to work with machine addresses. These macros give the
++ * machine address of a certain page to the generic kexec code instead of
++ * the pseudo physical address which would be given by the default macros.
++ */
++
++#ifdef CONFIG_XEN
++#define KEXEC_ARCH_HAS_PAGE_MACROS /* NOTE(review): presumably tells generic kexec code not to use its default macros - confirm */
++#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page)) /* page -> pfn -> mfn */
++#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn)) /* inverse: the argument is passed through mfn_to_pfn() first */
++#define kexec_virt_to_phys(addr) virt_to_machine(addr) /* yields a machine address, per the comment above */
++#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr)) /* inverse: machine -> pseudo-physical -> virtual */
++#endif
++
   #endif /* __ASSEMBLY__ */
   
   #endif /* _ASM_X86_KEXEC_H */
diff --cc arch/x86/include/asm/mmu.h

index 5f55e69,aeff3e8..f2fc35b
--- 1/arch/x86/include/asm/mmu.h
--- 2/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@@ -11,17 -11,17 +11,20 @@@
   typedef struct {
         void *ldt;
         int size;
++#ifdef CONFIG_XEN
++      unsigned has_foreign_mappings:1;
++#endif
+       struct mutex lock;
+       void *vdso;
   
   #ifdef CONFIG_X86_64
         /* True if mm supports a task running in 32 bit compatibility mode. */
         unsigned short ia32_compat;
   #endif
   
-       struct mutex lock;
-       void *vdso;
   } mm_context_t;
   
--#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
   void leave_mm(int cpu);
   #else
   static inline void leave_mm(int cpu)
diff --cc arch/x86/include/asm/nmi.h

index 4886a68,4886a68..e45591a
--- 1/arch/x86/include/asm/nmi.h
--- 2/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@@ -17,7 -17,7 +17,10 @@@ struct ctl_table
   extern int proc_nmi_enabled(struct ctl_table *, int ,
                         void __user *, size_t *, loff_t *);
   extern int unknown_nmi_panic;
++#endif
   
++#if (defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)) || \
++    (defined(CONFIG_XEN_SMPBOOT) && CONFIG_XEN_COMPAT >= 0x030200)
   void arch_trigger_all_cpu_backtrace(void);
   #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
   #endif
diff --cc arch/x86/include/asm/page_64_types.h

index 7639dbf,7639dbf..63775df
--- 1/arch/x86/include/asm/page_64_types.h
--- 2/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@@ -69,7 -69,7 +69,15 @@@ extern void init_extra_mapping_wb(unsig
   #endif        /* !__ASSEMBLY__ */
   
   #ifdef CONFIG_FLATMEM
++/*
++ * While max_pfn is not exported, max_mapnr never gets initialized for non-Xen
++ * other than for hotplugged memory.
++ */
++#ifndef CONFIG_XEN
   #define pfn_valid(pfn)          ((pfn) < max_pfn)
++#else
++#define pfn_valid(pfn)          ((pfn) < max_mapnr)
++#endif
   #endif
   
   #endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --cc arch/x86/include/asm/percpu.h

index a0a9779,d475b43..614999c
--- 1/arch/x86/include/asm/percpu.h
--- 2/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@@ -311,6 -311,6 +311,40 @@@ do {                                                                      
         pxo_ret__;                                                      \
   })
   
++#define percpu_exchange_op(op, var, val)              \
++({                                                    \
++      typedef typeof(var) pxo_T__;                    \
++      pxo_T__ pxo_ret__;                              \
++      if (0) {                                        \
++              pxo_ret__ = (val);                      \
++              (void)pxo_ret__;                        \
++      }                                               \
++      switch (sizeof(var)) {                          \
++      case 1:                                         \
++              asm(op "b %0,"__percpu_arg(1)           \
++                  : "=q" (pxo_ret__), "+m" (var)      \
++                  : "0" ((pxo_T__)(val)));            \
++              break;                                  \
++      case 2:                                         \
++              asm(op "w %0,"__percpu_arg(1)           \
++                  : "=r" (pxo_ret__), "+m" (var)      \
++                  : "0" ((pxo_T__)(val)));            \
++              break;                                  \
++      case 4:                                         \
++              asm(op "l %0,"__percpu_arg(1)           \
++                  : "=r" (pxo_ret__), "+m" (var)      \
++                  : "0" ((pxo_T__)(val)));            \
++              break;                                  \
++      case 8:                                         \
++              asm(op "q %0,"__percpu_arg(1)           \
++                  : "=r" (pxo_ret__), "+m" (var)      \
++                  : "0" ((pxo_T__)(val)));            \
++              break;                                  \
++      default: __bad_percpu_size();                   \
++      }                                               \
++      pxo_ret__;                                      \
++})
++
   /*
    * cmpxchg has no such implied lock semantics as a result it is much
    * more efficient for cpu local operations.
@@@ -368,6 -368,6 +402,10 @@@
   #define percpu_or(var, val)           percpu_to_op("or", var, val)
   #define percpu_xor(var, val)          percpu_to_op("xor", var, val)
   #define percpu_inc(var)               percpu_unary_op("inc", var)
++#define percpu_xchg(var, val)         percpu_exchange_op("xchg", var, val)
++#ifdef CONFIG_X86_XADD
++#define percpu_xadd(var, val)         percpu_exchange_op("xadd", var, val)
++#endif
   
   #define __this_cpu_read_1(pcp)                percpu_from_op("mov", (pcp), "m"(pcp))
   #define __this_cpu_read_2(pcp)                percpu_from_op("mov", (pcp), "m"(pcp))
diff --cc arch/x86/include/asm/ptrace.h

index 94e7618,1babf8a..e2c83b4
--- 1/arch/x86/include/asm/ptrace.h
--- 2/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@@ -267,7 -275,7 +275,9 @@@ static inline unsigned long regs_get_ke
   }
   
   #define arch_has_single_step()        (1)
--#ifdef CONFIG_X86_DEBUGCTLMSR
++#if defined(CONFIG_XEN)
++#define arch_has_block_step() (0)
++#elif defined(CONFIG_X86_DEBUGCTLMSR)
   #define arch_has_block_step() (1)
   #else
   #define arch_has_block_step() (boot_cpu_data.x86 >= 6)
diff --cc arch/x86/include/asm/required-features.h

index 6c7fc25,6c7fc25..b0549bf
--- 1/arch/x86/include/asm/required-features.h
--- 2/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@@ -48,7 -48,7 +48,7 @@@
   #endif
   
   #ifdef CONFIG_X86_64
--#ifdef CONFIG_PARAVIRT
++#if defined(CONFIG_PARAVIRT) || defined(CONFIG_XEN)
   /* Paravirtualized systems may not have PSE or PGE available */
   #define NEED_PSE      0
   #define NEED_PGE      0
diff --cc arch/x86/include/asm/segment.h

index cd84f72,cd84f72..855bb68
--- 1/arch/x86/include/asm/segment.h
--- 2/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@@ -188,7 -188,7 +188,9 @@@
   #define __KERNEL_DS   (GDT_ENTRY_KERNEL_DS*8)
   #define __USER_DS     (GDT_ENTRY_DEFAULT_USER_DS*8+3)
   #define __USER_CS     (GDT_ENTRY_DEFAULT_USER_CS*8+3)
--#ifndef CONFIG_PARAVIRT
++#if defined(CONFIG_X86_XEN)
++#define get_kernel_rpl()  (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
++#elif !defined(CONFIG_PARAVIRT)
   #define get_kernel_rpl()  0
   #endif
   
diff --cc arch/x86/include/asm/stacktrace.h
Simple merge
diff --cc arch/x86/include/asm/system.h
Simple merge
diff --cc arch/x86/include/asm/thread_info.h

index 1f2e61e,1f2e61e..3807be9
--- 1/arch/x86/include/asm/thread_info.h
--- 2/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@@ -95,6 -95,6 +95,9 @@@ struct thread_info 
   #define TIF_BLOCKSTEP         25      /* set when we want DEBUGCTLMSR_BTF */
   #define TIF_LAZY_MMU_UPDATES  27      /* task is updating the mmu lazily */
   #define TIF_SYSCALL_TRACEPOINT        28      /* syscall tracepoint instrumentation */
++#ifdef CONFIG_X86_XEN
++#define TIF_CSTAR             31      /* cstar-based syscall (special handling) */
++#endif
   
   #define _TIF_SYSCALL_TRACE    (1 << TIF_SYSCALL_TRACE)
   #define _TIF_NOTIFY_RESUME    (1 << TIF_NOTIFY_RESUME)
@@@ -117,6 -117,6 +120,7 @@@
   #define _TIF_BLOCKSTEP                (1 << TIF_BLOCKSTEP)
   #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
   #define _TIF_SYSCALL_TRACEPOINT       (1 << TIF_SYSCALL_TRACEPOINT)
++#define _TIF_CSTAR            (1 << TIF_CSTAR) /* NOTE(review): TIF_CSTAR is only defined under CONFIG_X86_XEN */
   
   /* work to do in syscall_trace_enter() */
   #define _TIF_WORK_SYSCALL_ENTRY       \
@@@ -144,9 -144,9 +148,13 @@@
          _TIF_USER_RETURN_NOTIFY)
   
   /* flags to check in __switch_to() */
++#ifndef CONFIG_XEN
   #define _TIF_WORK_CTXSW                                                       \
         (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
   
++#else
++#define _TIF_WORK_CTXSW (_TIF_NOTSC /*todo | _TIF_BLOCKSTEP */)
++#endif
   #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
   #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
   
diff --cc arch/x86/include/asm/time.h

index 7bdec4e,7bdec4e..c66205b
--- 1/arch/x86/include/asm/time.h
--- 2/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@@ -7,4 -7,4 +7,11 @@@ extern void hpet_time_init(void)
   
   extern void time_init(void);
   
++#ifdef CONFIG_XEN
++struct timespec;
++extern int xen_independent_wallclock(void);
++extern void xen_read_persistent_clock(struct timespec *);
++extern int xen_update_persistent_clock(void);
++#endif
++
   #endif /* _ASM_X86_TIME_H */
diff --cc arch/x86/include/asm/topology.h

index c006924,910a708..3cd4143
--- 1/arch/x86/include/asm/topology.h
--- 2/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@@ -30,7 -30,7 +30,7 @@@
   #  define ENABLE_TOPO_DEFINES
   # endif
   #else
--# ifdef CONFIG_SMP
++# if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
   #  define ENABLE_TOPO_DEFINES
   # endif
   #endif
diff --cc arch/x86/include/asm/trampoline.h

index feca311,feca311..0a55878
--- 1/arch/x86/include/asm/trampoline.h
--- 2/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@@ -1,4 -1,4 +1,4 @@@
--#ifndef _ASM_X86_TRAMPOLINE_H
++#if !defined(_ASM_X86_TRAMPOLINE_H) && !defined(CONFIG_XEN)
   #define _ASM_X86_TRAMPOLINE_H
   
   #ifndef __ASSEMBLY__
diff --cc arch/x86/include/asm/traps.h

index 0310da6,0310da6..b3f2ff2
--- 1/arch/x86/include/asm/traps.h
--- 2/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@@ -38,6 -38,6 +38,9 @@@ asmlinkage void alignment_check(void)
   asmlinkage void machine_check(void);
   #endif /* CONFIG_X86_MCE */
   asmlinkage void simd_coprocessor_error(void);
++#ifdef CONFIG_X86_XEN
++asmlinkage void fixup_4gb_segment(void);
++#endif
   
   dotraplinkage void do_divide_error(struct pt_regs *, long);
   dotraplinkage void do_debug(struct pt_regs *, long);
@@@ -66,6 -66,6 +69,9 @@@ dotraplinkage void do_machine_check(str
   dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
   #ifdef CONFIG_X86_32
   dotraplinkage void do_iret_error(struct pt_regs *, long);
++#ifdef CONFIG_XEN
++void do_fixup_4gb_segment(struct pt_regs *, long);
++#endif
   #endif
   
   static inline int get_si_code(unsigned long condition)
diff --cc arch/x86/include/asm/uv/uv_hub.h

index f26544a,4298002..d72061a
--- 1/arch/x86/include/asm/uv/uv_hub.h
--- 2/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@@ -11,7 -11,7 +11,7 @@@
   #ifndef _ASM_X86_UV_UV_HUB_H
   #define _ASM_X86_UV_UV_HUB_H
   
--#ifdef CONFIG_X86_64
++#ifdef CONFIG_X86_UV
   #include <linux/numa.h>
   #include <linux/percpu.h>
   #include <linux/timer.h>
diff --cc arch/x86/include/asm/xen/hypervisor.h

index 66d0fff,66d0fff..41ff2bd
--- 1/arch/x86/include/asm/xen/hypervisor.h
--- 2/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@@ -58,7 -58,7 +58,7 @@@ static inline uint32_t xen_cpuid_base(v
         return 0;
   }
   
--#ifdef CONFIG_XEN
++#ifdef CONFIG_PARAVIRT_XEN
   extern bool xen_hvm_need_lapic(void);
   
   static inline bool xen_x2apic_para_available(void)
diff --cc arch/x86/include/asm/xen/interface.h

index 5d4922a,5d4922a..40c95d2
--- 1/arch/x86/include/asm/xen/interface.h
--- 2/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@@ -10,17 -10,17 +10,20 @@@
   #define _ASM_X86_XEN_INTERFACE_H
   
   #ifdef __XEN__
--#define __DEFINE_GUEST_HANDLE(name, type) \
++#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
       typedef struct { type *p; } __guest_handle_ ## name
   #else
--#define __DEFINE_GUEST_HANDLE(name, type) \
++#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
       typedef type * __guest_handle_ ## name
   #endif
   
++#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
++    ___DEFINE_XEN_GUEST_HANDLE(name, type);   \
++    ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type)
   #define DEFINE_GUEST_HANDLE_STRUCT(name) \
--      __DEFINE_GUEST_HANDLE(name, struct name)
--#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
--#define GUEST_HANDLE(name)        __guest_handle_ ## name
++      __DEFINE_XEN_GUEST_HANDLE(name, struct name)
++#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
++#define XEN_GUEST_HANDLE(name)        __guest_handle_ ## name
   
   #ifdef __XEN__
   #if defined(__i386__)
@@@ -47,14 -47,14 +50,8 @@@
   #endif
   
   #ifndef __ASSEMBLY__
--/* Guest handles for primitive C types. */
--__DEFINE_GUEST_HANDLE(uchar, unsigned char);
--__DEFINE_GUEST_HANDLE(uint,  unsigned int);
--__DEFINE_GUEST_HANDLE(ulong, unsigned long);
--DEFINE_GUEST_HANDLE(char);
--DEFINE_GUEST_HANDLE(int);
--DEFINE_GUEST_HANDLE(long);
--DEFINE_GUEST_HANDLE(void);
++typedef unsigned long xen_pfn_t;
++typedef unsigned long xen_ulong_t;
   #endif
   
   #ifndef HYPERVISOR_VIRT_START
@@@ -66,7 -66,7 +63,7 @@@
   #define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
   
   /* Maximum number of virtual CPUs in multi-processor guests. */
--#define MAX_VIRT_CPUS 32
++#define XEN_LEGACY_MAX_VCPUS 32
   
   /*
    * SEGMENT DESCRIPTOR TABLES
diff --cc arch/x86/include/mach-xen/asm/agp.h

index 0000000,0000000..e2a122d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/agp.h
@@@ -1,0 -1,0 +1,58 @@@
++#ifndef _ASM_X86_AGP_H
++#define _ASM_X86_AGP_H
++
++#include <asm/pgtable.h>
++#include <asm/cacheflush.h>
++#include <asm/system.h>
++
++/*
++ * Functions to keep the agpgart mappings coherent with the MMU. The
++ * GART gives the CPU a physical alias of pages in memory. The alias
++ * region is mapped uncacheable. Make sure there are no conflicting
++ * mappings with different cacheability attributes for the same
++ * page. This avoids data corruption on some CPUs.
++ */
++
++#define map_page_into_agp(page) ( \
++      xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
++      ?: set_pages_uc(page, 1))
++#define unmap_page_from_agp(page) ( \
++      xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
++      /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
++      set_pages_wb(page, 1))
++
++#define map_pages_into_agp(pages, nr) ({ \
++      __typeof__(nr) n__; \
++      int rc__ = 0; \
++      for (n__ = 0; n__ < (nr) && !rc__; ++n__) \
++              rc__ = xen_create_contiguous_region( \
++                      (unsigned long)page_address((pages)[n__]), 0, 32); \
++      rc__ ?: set_pages_array_uc(pages, nr); \
++})
++#define unmap_pages_from_agp(pages, nr) ({ \
++      __typeof__(nr) n__; \
++      for (n__ = 0; n__ < nr; ++n__) \
++              xen_destroy_contiguous_region( \
++                      (unsigned long)page_address((pages)[n__]), 0); \
++      /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
++      set_pages_array_wb(pages, nr); \
++})
++
++/*
++ * Could use CLFLUSH here if the cpu supports it. But then it would
++ * need to be called for each cacheline of the whole page so it may
++ * not be worth it. Would need a page for it.
++ */
++#define flush_agp_cache() wbinvd()
++
++#define virt_to_gart virt_to_machine
++
++/* GATT allocation. Returns/accepts GATT kernel virtual address. */
++#define alloc_gatt_pages(order)       ({                                          \
++      char *_t; dma_addr_t _d;                                            \
++      _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL);    \
++      _t; })
++#define free_gatt_pages(table, order) \
++      dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
++
++#endif /* _ASM_X86_AGP_H */
diff --cc arch/x86/include/mach-xen/asm/desc.h

index 0000000,0000000..2bdb723

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/desc.h
@@@ -1,0 -1,0 +1,421 @@@
++#ifndef _ASM_X86_DESC_H
++#define _ASM_X86_DESC_H
++
++#include <asm/desc_defs.h>
++#include <asm/ldt.h>
++#include <asm/mmu.h>
++#include <linux/smp.h>
++
++static inline void fill_ldt(struct desc_struct *desc,
++                          const struct user_desc *info)
++{     /* Translate the struct user_desc *info into the descriptor *desc. */
++      desc->limit0 = info->limit & 0x0ffff;           /* limit bits 0-15 */
++      desc->base0 = info->base_addr & 0x0000ffff;     /* base bits 0-15 */
++
++      desc->base1 = (info->base_addr & 0x00ff0000) >> 16;     /* base bits 16-23 */
++      desc->type = (info->read_exec_only ^ 1) << 1;   /* type bit 1 is read_exec_only inverted */
++      desc->type |= info->contents << 2;              /* contents goes in from bit 2 up */
++      desc->s = 1;
++      desc->dpl = 0x3;                                /* always privilege level 3 */
++      desc->p = info->seg_not_present ^ 1;            /* present flag is seg_not_present inverted */
++      desc->limit = (info->limit & 0xf0000) >> 16;    /* limit bits 16-19 */
++      desc->avl = info->useable;
++      desc->d = info->seg_32bit;
++      desc->g = info->limit_in_pages;
++      desc->base2 = (info->base_addr & 0xff000000) >> 24;     /* base bits 24-31 */
++      /*
++       * Don't allow setting of the lm bit. It is useless anyway
++       * because 64bit system calls require __USER_CS:
++       */
++      desc->l = 0;
++}
++
++#ifndef CONFIG_X86_NO_IDT
++extern struct desc_ptr idt_descr;
++extern gate_desc idt_table[];
++#endif
++
++struct gdt_page {
++      struct desc_struct gdt[GDT_ENTRIES];
++} __attribute__((aligned(PAGE_SIZE)));
++DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
++
++static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
++{
++      return per_cpu(gdt_page, cpu).gdt;
++}
++
++#ifdef CONFIG_X86_64
++
++static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
++                           unsigned dpl, unsigned ist, unsigned seg)
++{     /* 64-bit variant: fill in *gate for the handler address func. */
++      gate->offset_low = PTR_LOW(func);
++      gate->segment = __KERNEL_CS;    /* NOTE(review): the 'seg' argument is ignored here */
++      gate->ist = ist;
++      gate->p = 1;
++      gate->dpl = dpl;
++      gate->zero0 = 0;
++      gate->zero1 = 0;
++      gate->type = type;
++      gate->offset_middle = PTR_MIDDLE(func);
++      gate->offset_high = PTR_HIGH(func);
++}
++
++#else
++static inline void pack_gate(gate_desc *gate, unsigned char type,
++                           unsigned long base, unsigned dpl, unsigned flags,
++                           unsigned short seg)
++{     /* 32-bit variant. NOTE(review): the 'flags' argument is not used. */
++      gate->a = (seg << 16) | (base & 0xffff);        /* seg in the upper half, base bits 0-15 below */
++      gate->b = (base & 0xffff0000) |                 /* base bits 16-31 stay in place */
++                (((0x80 | type | (dpl << 5)) & 0xff) << 8);   /* 0x80|type|dpl<<5 as bits 8-15 */
++}
++
++#endif
++
++static inline int desc_empty(const void *ptr)
++{
++      const u32 *desc = ptr;
++      return !(desc[0] | desc[1]);
++}
++
++#ifndef CONFIG_XEN
++#define load_TR_desc() native_load_tr_desc()
++#define load_gdt(dtr) native_load_gdt(dtr)
++#define load_idt(dtr) native_load_idt(dtr)
++#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
++#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
++
++#define store_gdt(dtr) native_store_gdt(dtr)
++#define store_idt(dtr) native_store_idt(dtr)
++#define store_tr(tr) (tr = native_store_tr())
++
++#define load_TLS(t, cpu) native_load_tls(t, cpu)
++#define set_ldt native_set_ldt
++
++#define write_ldt_entry(dt, entry, desc)      \
++      native_write_ldt_entry(dt, entry, desc)
++#define write_gdt_entry(dt, entry, desc, type)                \
++      native_write_gdt_entry(dt, entry, desc, type)
++#define write_idt_entry(dt, entry, g)         \
++      native_write_idt_entry(dt, entry, g)
++
++static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
++{
++}
++
++static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
++{
++}
++
++#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
++
++static inline void native_write_idt_entry(gate_desc *idt, int entry,
++                                        const gate_desc *gate)
++{
++      memcpy(&idt[entry], gate, sizeof(*gate));
++}
++
++static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
++                                        const void *desc)
++{
++      memcpy(&ldt[entry], desc, 8);
++}
++
++static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
++                                        const void *desc, int type)
++{
++      unsigned int size;
++      switch (type) {
++      case DESC_TSS:
++              size = sizeof(tss_desc);
++              break;
++      case DESC_LDT:
++              size = sizeof(ldt_desc);
++              break;
++      default:
++              size = sizeof(struct desc_struct);
++              break;
++      }
++      memcpy(&gdt[entry], desc, size);
++}
++#endif
++
++static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
++                                 unsigned long limit, unsigned char type,
++                                 unsigned char flags)
++{
++      desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
++      desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
++              (limit & 0x000f0000) | ((type & 0xff) << 8) |
++              ((flags & 0xf) << 20);
++      desc->p = 1;
++}
++
++
++#ifndef CONFIG_XEN
++static inline void set_tssldt_descriptor(void *d, unsigned long addr,
++                                       unsigned type, unsigned size)
++{
++#ifdef CONFIG_X86_64
++      struct ldttss_desc64 *desc = d;
++      memset(desc, 0, sizeof(*desc));
++      desc->limit0 = size & 0xFFFF;
++      desc->base0 = PTR_LOW(addr);
++      desc->base1 = PTR_MIDDLE(addr) & 0xFF;
++      desc->type = type;
++      desc->p = 1;
++      desc->limit1 = (size >> 16) & 0xF;
++      desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
++      desc->base3 = PTR_HIGH(addr);
++#else
++      pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
++#endif
++}
++
++static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
++{
++      struct desc_struct *d = get_cpu_gdt_table(cpu);
++      tss_desc tss;
++
++      /*
++       * sizeof(unsigned long) coming from an extra "long" at the end
++       * of the iobitmap. See tss_struct definition in processor.h
++       *
++       * -1? seg base+limit should be pointing to the address of the
++       * last valid byte
++       */
++      set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
++                            IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
++                            sizeof(unsigned long) - 1);
++      write_gdt_entry(d, entry, &tss, DESC_TSS);
++}
++
++#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
++
++static inline void native_set_ldt(const void *addr, unsigned int entries)
++{
++      if (likely(entries == 0))
++              asm volatile("lldt %w0"::"q" (0));
++      else {
++              unsigned cpu = smp_processor_id();
++              ldt_desc ldt;
++
++              set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
++                                    entries * LDT_ENTRY_SIZE - 1);
++              write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
++                              &ldt, DESC_LDT);
++              asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
++      }
++}
++
++static inline void native_load_tr_desc(void)
++{
++      asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
++}
++
++static inline void native_load_gdt(const struct desc_ptr *dtr)
++{
++      asm volatile("lgdt %0"::"m" (*dtr));
++}
++
++static inline void native_load_idt(const struct desc_ptr *dtr)
++{
++      asm volatile("lidt %0"::"m" (*dtr));
++}
++
++static inline void native_store_gdt(struct desc_ptr *dtr)
++{
++      asm volatile("sgdt %0":"=m" (*dtr));
++}
++
++static inline void native_store_idt(struct desc_ptr *dtr)
++{
++      asm volatile("sidt %0":"=m" (*dtr));
++}
++
++static inline unsigned long native_store_tr(void)
++{
++      unsigned long tr;
++      asm volatile("str %0":"=r" (tr));
++      return tr;
++}
++
++static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
++{
++      unsigned int i;
++      struct desc_struct *gdt = get_cpu_gdt_table(cpu);
++
++      for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
++              gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
++}
++#else
++#include <asm/pgtable.h>
++
++#define load_TLS(t, cpu) xen_load_tls(t, cpu)
++#define set_ldt xen_set_ldt
++
++extern int write_ldt_entry(struct desc_struct *ldt, int entry,
++                         const void *desc);
++extern int write_gdt_entry(struct desc_struct *gdt, int entry,
++                         const void *desc, int type);
++
++static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
++{
++      unsigned int i;
++      struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
++      /* gdt points at cpu's first TLS slot; copy each t->tls_array[] entry into it. */
++      for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
++              if (HYPERVISOR_update_descriptor(
++                              arbitrary_virt_to_machine(&gdt[i]),
++                              *(u64 *)&t->tls_array[i]))
++                      BUG();  /* a non-zero return is treated as fatal */
++}
++#endif
++
++#define _LDT_empty(info)                              \
++      ((info)->base_addr              == 0    &&      \
++       (info)->limit                  == 0    &&      \
++       (info)->contents               == 0    &&      \
++       (info)->read_exec_only         == 1    &&      \
++       (info)->seg_32bit              == 0    &&      \
++       (info)->limit_in_pages         == 0    &&      \
++       (info)->seg_not_present        == 1    &&      \
++       (info)->useable                == 0)
++
++#ifdef CONFIG_X86_64
++#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
++#else
++#define LDT_empty(info) (_LDT_empty(info))
++#endif
++
++static inline void clear_LDT(void)
++{
++      set_ldt(NULL, 0);
++}
++
++/*
++ * load one particular LDT into the current CPU
++ */
++static inline void load_LDT_nolock(mm_context_t *pc)
++{
++      set_ldt(pc->ldt, pc->size);
++}
++
++static inline void load_LDT(mm_context_t *pc)
++{
++      preempt_disable();
++      load_LDT_nolock(pc);
++      preempt_enable();
++}
++
++static inline unsigned long get_desc_base(const struct desc_struct *desc)
++{
++      return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
++}
++
++static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
++{
++      desc->base0 = base & 0xffff;
++      desc->base1 = (base >> 16) & 0xff;
++      desc->base2 = (base >> 24) & 0xff;
++}
++
++static inline unsigned long get_desc_limit(const struct desc_struct *desc)
++{
++      return desc->limit0 | (desc->limit << 16);
++}
++
++static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
++{
++      desc->limit0 = limit & 0xffff;
++      desc->limit = (limit >> 16) & 0xf;
++}
++
++#ifndef CONFIG_X86_NO_IDT
++static inline void _set_gate(int gate, unsigned type, void *addr,
++                           unsigned dpl, unsigned ist, unsigned seg)
++{
++      gate_desc s;
++      pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
++      /*
++       * does not need to be atomic because it is only done once at
++       * setup time
++       */
++      write_idt_entry(idt_table, gate, &s);
++}
++
++/*
++ * This needs to use 'idt_table' rather than 'idt', and
++ * thus use the _nonmapped_ version of the IDT, as the
++ * Pentium F0 0F bugfix can have resulted in the mapped
++ * IDT being write-protected.
++ */
++static inline void set_intr_gate(unsigned int n, void *addr)
++{
++      BUG_ON((unsigned)n > 0xFF);
++      _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
++}
++
++extern int first_system_vector;
++/* used_vectors is a bitmap for irqs not managed by the percpu vector_irq */
++extern unsigned long used_vectors[];
++
++static inline void alloc_system_vector(int vector)
++{     /* Mark vector as used in used_vectors; BUG() if it was already marked. */
++      if (!test_bit(vector, used_vectors)) {
++              set_bit(vector, used_vectors);
++              if (first_system_vector > vector)       /* keep the lowest vector seen so far */
++                      first_system_vector = vector;
++      } else
++              BUG();
++}
++
++static inline void alloc_intr_gate(unsigned int n, void *addr)
++{
++      alloc_system_vector(n);
++      set_intr_gate(n, addr);
++}
++
++/*
++ * This routine sets up an interrupt gate at descriptor privilege level 3.
++ */
++static inline void set_system_intr_gate(unsigned int n, void *addr)
++{
++      BUG_ON((unsigned)n > 0xFF);
++      _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
++}
++
++static inline void set_system_trap_gate(unsigned int n, void *addr)
++{
++      BUG_ON((unsigned)n > 0xFF);
++      _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
++}
++
++static inline void set_trap_gate(unsigned int n, void *addr)
++{
++      BUG_ON((unsigned)n > 0xFF);
++      _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
++}
++
++static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
++{
++      BUG_ON((unsigned)n > 0xFF);
++      _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
++}
++
++static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
++{
++      BUG_ON((unsigned)n > 0xFF);
++      _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
++}
++
++static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
++{
++      BUG_ON((unsigned)n > 0xFF);
++      _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
++}
++#endif
++
++#endif /* _ASM_X86_DESC_H */
diff --cc arch/x86/include/mach-xen/asm/dma-mapping.h

index 0000000,0000000..5054274

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/dma-mapping.h
@@@ -1,0 -1,0 +1,25 @@@
++#ifndef _ASM_X86_DMA_MAPPING_H_
++#define _ASM_X86_DMA_MAPPING_H_
++/* Any phys_to_dma/dma_to_phys in the next header get these throwaway names. */
++#define phys_to_dma _phys_to_dma_
++#define dma_to_phys _dma_to_phys_
++#include_next <asm/dma-mapping.h>
++
++#undef phys_to_dma
++#undef dma_to_phys
++
++static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
++{
++      return phys_to_machine(paddr);
++}
++
++static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
++{
++      return machine_to_phys(daddr);
++}
++
++void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t);
++
++extern int range_straddles_page_boundary(paddr_t p, size_t size);
++
++#endif /* _ASM_X86_DMA_MAPPING_H_ */
diff --cc arch/x86/include/mach-xen/asm/fixmap.h

index 0000000,0000000..1fcb0c2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/fixmap.h
@@@ -1,0 -1,0 +1,239 @@@
++/*
++ * fixmap.h: compile-time virtual memory allocation
++ *
++ * This file is subject to the terms and conditions of the GNU General Public
++ * License.  See the file "COPYING" in the main directory of this archive
++ * for more details.
++ *
++ * Copyright (C) 1998 Ingo Molnar
++ *
++ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
++ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
++ */
++
++#ifndef _ASM_X86_FIXMAP_H
++#define _ASM_X86_FIXMAP_H
++
++#ifndef __ASSEMBLY__
++#include <linux/kernel.h>
++#include <asm/acpi.h>
++#include <asm/page.h>
++#ifdef CONFIG_X86_32
++#include <linux/threads.h>
++#include <asm/kmap_types.h>
++#else
++#include <asm/vsyscall.h>
++#endif
++
++/*
++ * We can't declare FIXADDR_TOP as a variable for x86_64 because vsyscall
++ * uses fixmaps that rely on FIXADDR_TOP for proper address calculation.
++ * Because of this, FIXADDR_TOP x86 integration was left as later work.
++ */
++#ifdef CONFIG_X86_32
++/* used by vmalloc.c, vsyscall.lds.S.
++ *
++ * Leave one empty page between vmalloc'ed areas and
++ * the start of the fixmap.
++ */
++extern unsigned long __FIXADDR_TOP;
++#define FIXADDR_TOP   ((unsigned long)__FIXADDR_TOP)
++
++#define FIXADDR_USER_START     __fix_to_virt(FIX_VDSO)
++#define FIXADDR_USER_END       __fix_to_virt(FIX_VDSO - 1)
++#else
++#define FIXADDR_TOP   (VSYSCALL_END-PAGE_SIZE)
++
++/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
++#define FIXADDR_USER_START    ((unsigned long)VSYSCALL32_VSYSCALL)
++#define FIXADDR_USER_END      (FIXADDR_USER_START + PAGE_SIZE)
++#endif
++
++
++/*
++ * Here we define all the compile-time 'special' virtual
++ * addresses. The point is to have a constant address at
++ * compile time, but to set the physical address only
++ * in the boot process.
++ * for x86_32: We allocate these special addresses
++ * from the end of virtual memory (0xfffff000) backwards.
++ * Also this lets us do fail-safe vmalloc(), we
++ * can guarantee that these special addresses and
++ * vmalloc()-ed addresses never overlap.
++ *
++ * These 'compile-time allocated' memory buffers are
++ * fixed-size 4k pages (or larger if used with an increment
++ * higher than 1). Use set_fixmap(idx,phys) to associate
++ * physical memory with fixmap indices.
++ *
++ * TLB entries of such buffers will not be flushed across
++ * task switches.
++ */
++enum fixed_addresses {
++#ifdef CONFIG_X86_32
++      FIX_HOLE,
++      FIX_VDSO,
++#else
++      VSYSCALL_LAST_PAGE,
++      VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
++                          + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
++      VSYSCALL_HPET,
++#endif
++      FIX_DBGP_BASE,
++      FIX_EARLYCON_MEM_BASE,
++#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
++      FIX_OHCI1394_BASE,
++#endif
++      /*
++       * The APIC slots exist only when CONFIG_XEN is not set; CONFIG_XEN
++       * builds get FIX_SHARED_INFO and the FIX_ISAMAP_* range instead.
++       */
++#ifndef CONFIG_XEN
++#ifdef CONFIG_X86_LOCAL_APIC
++      FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
++#endif
++#ifdef CONFIG_X86_IO_APIC
++      FIX_IO_APIC_BASE_0,
++      FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
++#endif
++#else
++      FIX_SHARED_INFO,
++      /*
++       * NR_FIX_ISAMAPS consecutive slots; _END is the lowest index and
++       * _BEGIN the highest one of the range.
++       */
++#define NR_FIX_ISAMAPS        256
++      FIX_ISAMAP_END,
++      FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
++#endif
++#ifdef CONFIG_X86_VISWS_APIC
++      FIX_CO_CPU,     /* Cobalt timer */
++      FIX_CO_APIC,    /* Cobalt APIC Redirection Table */
++      FIX_LI_PCIA,    /* Lithium PCI Bridge A */
++      FIX_LI_PCIB,    /* Lithium PCI Bridge B */
++#endif
++#ifdef CONFIG_X86_F00F_BUG
++      FIX_F00F_IDT,   /* Virtual mapping for IDT */
++#endif
++#ifdef CONFIG_X86_CYCLONE_TIMER
++      FIX_CYCLONE_TIMER, /*cyclone timer register*/
++#endif
++#ifdef CONFIG_X86_32
++      FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
++      FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
++#ifdef CONFIG_PCI_MMCONFIG
++      FIX_PCIE_MCFG,
++#endif
++#endif
++#ifdef CONFIG_PARAVIRT
++      FIX_PARAVIRT_BOOTMAP,
++#endif
++      FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
++      FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
++#ifdef        CONFIG_X86_MRST
++      FIX_LNW_VRTC,
++#endif
++      /* Everything above is counted by FIXADDR_SIZE. */
++      __end_of_permanent_fixed_addresses,
++
++      /*
++       * 256 temporary boot-time mappings, used by early_ioremap(),
++       * before ioremap() is functional.
++       *
++       * If necessary we round it up to the next 256 pages boundary so
++       * that we can have a single pgd entry and a single pte table:
++       */
++#define NR_FIX_BTMAPS         64
++#define FIX_BTMAPS_SLOTS      4
++#define TOTAL_FIX_BTMAPS      (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
++      /*
++       * If the first and the last boot-time slot would differ in any
++       * bit at or above PTRS_PER_PTE, start the range at the next
++       * multiple of TOTAL_FIX_BTMAPS; otherwise start it right after
++       * the permanent slots.
++       */
++      FIX_BTMAP_END =
++       (__end_of_permanent_fixed_addresses ^
++        (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
++       -PTRS_PER_PTE
++       ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
++         (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
++       : __end_of_permanent_fixed_addresses,
++      FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
++#ifdef CONFIG_X86_32
++      FIX_WP_TEST,
++#endif
++#ifdef CONFIG_INTEL_TXT
++      FIX_TBOOT_BASE,
++#endif
++      /* Total slot count; used as the upper bound check in fix_to_virt(). */
++      __end_of_fixed_addresses
++};
++
++
++extern void reserve_top_address(unsigned long reserve);
++
++#define FIXADDR_SIZE  (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
++#define FIXADDR_BOOT_SIZE     (__end_of_fixed_addresses << PAGE_SHIFT)
++#define FIXADDR_START         (FIXADDR_TOP - FIXADDR_SIZE)
++#define FIXADDR_BOOT_START    (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
++
++extern int fixmaps_set;
++
++extern pte_t *kmap_pte;
++extern pgprot_t kmap_prot;
++extern pte_t *pkmap_page_table;
++
++void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t);
++
++/*
++ * Establish the fixmap entry @idx for @phys with protection @flags.
++ * In this header the work is handed to xen_set_fixmap() unchanged.
++ */
++static inline void
++__set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
++{
++      xen_set_fixmap(idx, phys, flags);
++}
++
++/* Map @phys at fixmap slot @idx with PAGE_KERNEL protection. */
++#define set_fixmap(idx, phys)                         \
++      __set_fixmap(idx, phys, PAGE_KERNEL)
++
++/*
++ * Some hardware wants to get fixmapped without caching.
++ */
++#define set_fixmap_nocache(idx, phys)                 \
++      __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
++
++/* Reset slot @idx by setting it to physical address 0 with empty protection. */
++#define clear_fixmap(idx)                     \
++      __set_fixmap(idx, 0, __pgprot(0))
++
++/*
++ * Unchecked index <-> address conversions.  Addresses fall as the index
++ * rises: slot x lives x pages below FIXADDR_TOP.
++ */
++#define __fix_to_virt(x)      (FIXADDR_TOP - ((x) << PAGE_SHIFT))
++#define __virt_to_fix(x)      ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
++
++extern void __this_fixmap_does_not_exist(void);
++
++/*
++ * 'index to address' translation. If anyone tries to use the idx
++ * directly without translation, we catch the bug with a NULL-dereference
++ * kernel oops. Illegal ranges of incoming indices are caught too.
++ *
++ * Returns __fix_to_virt(idx); an idx at or beyond
++ * __end_of_fixed_addresses references __this_fixmap_does_not_exist(),
++ * which is only declared in this header.
++ */
++static __always_inline unsigned long fix_to_virt(const unsigned int idx)
++{
++      /*
++       * this branch gets completely eliminated after inlining,
++       * except when someone tries to use fixaddr indices in an
++       * illegal way. (such as mixing up address types or using
++       * out-of-range indices).
++       *
++       * If it doesn't get removed, the linker will complain
++       * loudly with a reasonably clear error message..
++       */
++      if (idx >= __end_of_fixed_addresses)
++              __this_fixmap_does_not_exist();
++
++      return __fix_to_virt(idx);
++}
++
++/*
++ * 'address to index' translation.  @vaddr must lie inside
++ * [FIXADDR_START, FIXADDR_TOP); anything else triggers BUG_ON().
++ */
++static inline unsigned long virt_to_fix(const unsigned long vaddr)
++{
++      BUG_ON(!(vaddr >= FIXADDR_START && vaddr < FIXADDR_TOP));
++
++      return __virt_to_fix(vaddr);
++}
++
++/*
++ * Map @phys at fixmap slot @idx with protection @flags and return the
++ * slot's virtual address plus the offset of @phys within its page.
++ */
++static __always_inline unsigned long
++__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
++{
++      unsigned long vaddr;
++
++      __set_fixmap(idx, phys, flags);
++
++      vaddr = fix_to_virt(idx);
++      vaddr += phys & (PAGE_SIZE - 1);
++      return vaddr;
++}
++
++/* __set_fixmap_offset() with PAGE_KERNEL protection. */
++#define set_fixmap_offset(idx, phys)                  \
++      __set_fixmap_offset(idx, phys, PAGE_KERNEL)
++
++/* __set_fixmap_offset() with PAGE_KERNEL_NOCACHE protection. */
++#define set_fixmap_offset_nocache(idx, phys)                  \
++      __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
++
++#endif /* !__ASSEMBLY__ */
++#endif /* _ASM_X86_FIXMAP_H */
diff --cc arch/x86/include/mach-xen/asm/gnttab_dma.h

index 0000000,0000000..fd7197c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/gnttab_dma.h
@@@ -1,0 -1,0 +1,41 @@@
++/*
++ * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
++ * Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp>
++ *                    VA Linux Systems Japan K.K.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++ */
++
++#ifndef _ASM_I386_GNTTAB_DMA_H
++#define _ASM_I386_GNTTAB_DMA_H
++
++/*
++ * Translate @page's PFN to an MFN, map that back with
++ * mfn_to_local_pfn(), and return whether pfn_valid() accepts the result.
++ */
++static inline int gnttab_dma_local_pfn(struct page *page)
++{
++      /* Has it become a local MFN? */
++      return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page))));
++}
++
++/*
++ * Call __gnttab_dma_map_page() for @page, then return the page's MFN
++ * shifted left by PAGE_SHIFT as a maddr_t.
++ */
++static inline maddr_t gnttab_dma_map_page(struct page *page)
++{
++      __gnttab_dma_map_page(page);
++      /* Widen to maddr_t before shifting. */
++      return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT);
++}
++
++/*
++ * Convert @maddr back to its struct page via bus_to_virt() and
++ * virt_to_page(), and hand that page to __gnttab_dma_unmap_page().
++ */
++static inline void gnttab_dma_unmap_page(maddr_t maddr)
++{
++      __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr)));
++}
++
++#endif /* _ASM_I386_GNTTAB_DMA_H */
diff --cc arch/x86/include/mach-xen/asm/highmem.h

index 0000000,0000000..1243d04

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/highmem.h
@@@ -1,0 -1,0 +1,98 @@@
++/*
++ * highmem.h: virtual kernel memory mappings for high memory
++ *
++ * Used in CONFIG_HIGHMEM systems for memory pages which
++ * are not addressable by direct kernel virtual addresses.
++ *
++ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
++ *                  Gerhard.Wichert@pdb.siemens.de
++ *
++ *
++ * Redesigned the x86 32-bit VM architecture to deal with
++ * up to 16 Terabyte physical memory. With current x86 CPUs
++ * we now support up to 64 Gigabytes physical RAM.
++ *
++ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
++ */
++
++#ifndef _ASM_X86_HIGHMEM_H
++#define _ASM_X86_HIGHMEM_H
++
++#ifdef __KERNEL__
++
++#include <linux/interrupt.h>
++#include <linux/threads.h>
++#include <asm/kmap_types.h>
++#include <asm/tlbflush.h>
++#include <asm/fixmap.h>
++
++/* declarations for highmem.c */
++extern unsigned long highstart_pfn, highend_pfn;
++
++/*
++ * Right now we initialize only a single pte table. It can be extended
++ * easily, subsequent pte tables have to be allocated in one physical
++ * chunk of RAM.
++ */
++/*
++ * Ordering is:
++ *
++ * FIXADDR_TOP
++ *                    fixed_addresses
++ * FIXADDR_START
++ *                    temp fixed addresses
++ * FIXADDR_BOOT_START
++ *                    Persistent kmap area
++ * PKMAP_BASE
++ * VMALLOC_END
++ *                    Vmalloc area
++ * VMALLOC_START
++ * high_memory
++ */
++/* Mask for wrapping a pkmap slot number; LAST_PKMAP is defined elsewhere. */
++#define LAST_PKMAP_MASK (LAST_PKMAP-1)
++/*
++ * Slot number of a pkmap virtual address.  The argument is parenthesized
++ * so that an expression argument (e.g. a conditional) is evaluated as a
++ * whole before PKMAP_BASE is subtracted.
++ */
++#define PKMAP_NR(virt)  (((virt) - PKMAP_BASE) >> PAGE_SHIFT)
++/* Virtual address of pkmap slot @nr. */
++#define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
++
++extern void *kmap_high(struct page *page);
++extern void kunmap_high(struct page *page);
++
++void *kmap(struct page *page);
++void kunmap(struct page *page);
++
++void *kmap_atomic_prot(struct page *page, pgprot_t prot);
++void *__kmap_atomic(struct page *page);
++void __kunmap_atomic(void *kvaddr);
++void *kmap_atomic_pfn(unsigned long pfn);
++void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
++struct page *kmap_atomic_to_page(void *ptr);
++
++/*
++ * Atomically map @page via kmap_atomic_prot(), using PAGE_KERNEL_RO when
++ * PagePinned(page) is true and kmap_prot otherwise.  Note that @page is
++ * evaluated twice.
++ */
++#define kmap_atomic_pte(page) \
++      kmap_atomic_prot(page, \
++                       PagePinned(page) ? PAGE_KERNEL_RO : kmap_prot)
++
++#define flush_cache_kmaps()   do { } while (0)
++
++extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
++                                      unsigned long end_pfn);
++
++void clear_highpage(struct page *);
++/* Clear @page through clear_highpage(); @vaddr is accepted but not used. */
++static inline void
++clear_user_highpage(struct page *page, unsigned long vaddr)
++{
++      clear_highpage(page);
++}
++#define __HAVE_ARCH_CLEAR_HIGHPAGE
++#define clear_user_highpage clear_user_highpage
++#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE
++
++void copy_highpage(struct page *to, struct page *from);
++/*
++ * Copy page @from into page @to through copy_highpage(); @vaddr and
++ * @vma are accepted but not used.
++ */
++static inline void
++copy_user_highpage(struct page *to, struct page *from,
++                 unsigned long vaddr, struct vm_area_struct *vma)
++{
++      copy_highpage(to, from);
++}
++#define __HAVE_ARCH_COPY_HIGHPAGE
++#define __HAVE_ARCH_COPY_USER_HIGHPAGE
++
++#endif /* __KERNEL__ */
++
++#endif /* _ASM_X86_HIGHMEM_H */
diff --cc arch/x86/include/mach-xen/asm/hypercall.h

index 0000000,0000000..25c8a45

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypercall.h
@@@ -1,0 -1,0 +1,430 @@@
++/******************************************************************************
++ * hypercall.h
++ *
++ * Linux-specific hypervisor handling.
++ *
++ * Copyright (c) 2002-2004, K A Fraser
++ *
++ * 64-bit updates:
++ *   Benjamin Liu <benjamin.liu@intel.com>
++ *   Jun Nakajima <jun.nakajima@intel.com>
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __HYPERCALL_H__
++#define __HYPERCALL_H__
++
++#ifndef __HYPERVISOR_H__
++# error "please don't include this file directly"
++#endif
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++# include <linux/string.h> /* memcpy() */
++#endif
++
++/*
++ * How the _hypercallN() macros reach a hypercall entry point.
++ *
++ * HYPERCALL_LOCATION(op) is the entry for hypercall number op: the base
++ * (hypercall_page with CONFIG_XEN, hypercall_stubs otherwise) advanced
++ * by op * 32.  With CONFIG_XEN the location is passed as an "i"
++ * (immediate) operand and printed with %c, giving a direct call;
++ * otherwise it is a "g" operand used as "*%<n>", giving an indirect call.
++ */
++#ifdef CONFIG_XEN
++#define HYPERCALL_ASM_OPERAND "%c"
++#define HYPERCALL_LOCATION(op) (hypercall_page + (op) * 32)
++#define HYPERCALL_C_OPERAND(name) "i" (HYPERCALL_LOCATION(__HYPERVISOR_##name))
++#else
++#define HYPERCALL_ASM_OPERAND "*%"
++#define HYPERCALL_LOCATION(op) (hypercall_stubs + (op) * 32)
++#define HYPERCALL_C_OPERAND(name) "g" (HYPERCALL_LOCATION(__HYPERVISOR_##name))
++#endif
++
++/*
++ * Declare the local register variable __arg<n>, bound to the register
++ * named by HYPERCALL_arg<n> and initialised from @arg.  Its type is that
++ * of the expression (arg)+0.
++ */
++#define HYPERCALL_ARG(arg, n) \
++      register typeof((arg)+0) __arg##n asm(HYPERCALL_arg##n) = (arg)
++
++/*
++ * Issue hypercall @name without arguments.  The value left in the "a"
++ * register is returned as @type.  Operand 1 is the hypercall location;
++ * the asm is volatile and clobbers "memory".
++ */
++#define _hypercall0(type, name)                                       \
++({                                                            \
++      type __res;                                             \
++      asm volatile (                                          \
++              "call " HYPERCALL_ASM_OPERAND "1"               \
++              : "=a" (__res)                                  \
++              : HYPERCALL_C_OPERAND(name)                     \
++              : "memory" );                                   \
++      __res;                                                  \
++})
++
++/*
++ * Issue hypercall @name with one argument and return the "a" register
++ * as @type.  The argument register is an in/out ("+r") operand, so the
++ * compiler does not assume it survives the call.  Operand 2 is the
++ * hypercall location (operands 0 and 1 are __res and __arg1).
++ */
++#define _hypercall1(type, name, arg)                          \
++({                                                            \
++      type __res;                                             \
++      HYPERCALL_ARG(arg, 1);                                  \
++      asm volatile (                                          \
++              "call " HYPERCALL_ASM_OPERAND "2"               \
++              : "=a" (__res), "+r" (__arg1)                   \
++              : HYPERCALL_C_OPERAND(name)                     \
++              : "memory" );                                   \
++      __res;                                                  \
++})
++
++/*
++ * Issue hypercall @name with two arguments and return the "a" register
++ * as @type.  Both argument registers are in/out operands; operand 3 is
++ * the hypercall location.
++ */
++#define _hypercall2(type, name, a1, a2)                               \
++({                                                            \
++      type __res;                                             \
++      HYPERCALL_ARG(a1, 1);                                   \
++      HYPERCALL_ARG(a2, 2);                                   \
++      asm volatile (                                          \
++              "call " HYPERCALL_ASM_OPERAND "3"               \
++              : "=a" (__res), "+r" (__arg1), "+r" (__arg2)    \
++              : HYPERCALL_C_OPERAND(name)                     \
++              : "memory" );                                   \
++      __res;                                                  \
++})
++
++/*
++ * Issue hypercall @name with three arguments and return the "a"
++ * register as @type.  All argument registers are in/out operands;
++ * operand 4 is the hypercall location.
++ */
++#define _hypercall3(type, name, a1, a2, a3)                   \
++({                                                            \
++      type __res;                                             \
++      HYPERCALL_ARG(a1, 1);                                   \
++      HYPERCALL_ARG(a2, 2);                                   \
++      HYPERCALL_ARG(a3, 3);                                   \
++      asm volatile (                                          \
++              "call " HYPERCALL_ASM_OPERAND "4"               \
++              : "=a" (__res), "+r" (__arg1),                  \
++                "+r" (__arg2), "+r" (__arg3)                  \
++              : HYPERCALL_C_OPERAND(name)                     \
++              : "memory" );                                   \
++      __res;                                                  \
++})
++
++/*
++ * Issue hypercall @name with four arguments and return the "a" register
++ * as @type.  All argument registers are in/out operands; operand 5 is
++ * the hypercall location.
++ */
++#define _hypercall4(type, name, a1, a2, a3, a4)                       \
++({                                                            \
++      type __res;                                             \
++      HYPERCALL_ARG(a1, 1);                                   \
++      HYPERCALL_ARG(a2, 2);                                   \
++      HYPERCALL_ARG(a3, 3);                                   \
++      HYPERCALL_ARG(a4, 4);                                   \
++      asm volatile (                                          \
++              "call " HYPERCALL_ASM_OPERAND "5"               \
++              : "=a" (__res), "+r" (__arg1), "+r" (__arg2),   \
++                "+r" (__arg3), "+r" (__arg4)                  \
++              : HYPERCALL_C_OPERAND(name)                     \
++              : "memory" );                                   \
++      __res;                                                  \
++})
++
++/*
++ * Issue hypercall @name with five arguments and return the "a" register
++ * as @type.  All argument registers are in/out operands; operand 6 is
++ * the hypercall location.
++ */
++#define _hypercall5(type, name, a1, a2, a3, a4, a5)           \
++({                                                            \
++      type __res;                                             \
++      HYPERCALL_ARG(a1, 1);                                   \
++      HYPERCALL_ARG(a2, 2);                                   \
++      HYPERCALL_ARG(a3, 3);                                   \
++      HYPERCALL_ARG(a4, 4);                                   \
++      HYPERCALL_ARG(a5, 5);                                   \
++      asm volatile (                                          \
++              "call " HYPERCALL_ASM_OPERAND "6"               \
++              : "=a" (__res), "+r" (__arg1), "+r" (__arg2),   \
++                "+r" (__arg3), "+r" (__arg4), "+r" (__arg5)   \
++              : HYPERCALL_C_OPERAND(name)                     \
++              : "memory" );                                   \
++      __res;                                                  \
++})
++
++/*
++ * Issue the hypercall whose number is the expression @op, always with
++ * five arguments, and return the "a" register as @type.  Unlike the
++ * _hypercallN() macros the location is always a "g" operand reached
++ * through an indirect call ("call *%6"), in both configurations.
++ */
++#define _hypercall(type, op, a1, a2, a3, a4, a5)              \
++({                                                            \
++      type __res;                                             \
++      HYPERCALL_ARG(a1, 1);                                   \
++      HYPERCALL_ARG(a2, 2);                                   \
++      HYPERCALL_ARG(a3, 3);                                   \
++      HYPERCALL_ARG(a4, 4);                                   \
++      HYPERCALL_ARG(a5, 5);                                   \
++      asm volatile (                                          \
++              "call *%6"                                      \
++              : "=a" (__res), "+r" (__arg1), "+r" (__arg2),   \
++                "+r" (__arg3), "+r" (__arg4), "+r" (__arg5)   \
++              : "g" (HYPERCALL_LOCATION(op))                  \
++              : "memory" );                                   \
++      __res;                                                  \
++})
++
++#ifdef CONFIG_X86_32
++# include "hypercall_32.h"
++#else
++# include "hypercall_64.h"
++#endif
++
++/* Issue the set_trap_table hypercall with @table; returns its int result. */
++static inline int __must_check
++HYPERVISOR_set_trap_table(
++      const trap_info_t *table)
++{
++      return _hypercall1(int, set_trap_table, table);
++}
++
++/*
++ * Issue the mmu_update hypercall.  While arch_use_lazy_mmu_mode() is
++ * true the request is handed to xen_multi_mmu_update() instead of being
++ * issued directly.
++ */
++static inline int __must_check
++HYPERVISOR_mmu_update(
++      mmu_update_t *req, unsigned int count, unsigned int *success_count,
++      domid_t domid)
++{
++      if (arch_use_lazy_mmu_mode())
++              return xen_multi_mmu_update(req, count, success_count, domid);
++      return _hypercall4(int, mmu_update, req, count, success_count, domid);
++}
++
++/*
++ * Issue the mmuext_op hypercall.  While arch_use_lazy_mmu_mode() is
++ * true the request is handed to xen_multi_mmuext_op() instead of being
++ * issued directly.
++ */
++static inline int __must_check
++HYPERVISOR_mmuext_op(
++      struct mmuext_op *op, unsigned int count, unsigned int *success_count,
++      domid_t domid)
++{
++      if (arch_use_lazy_mmu_mode())
++              return xen_multi_mmuext_op(op, count, success_count, domid);
++      return _hypercall4(int, mmuext_op, op, count, success_count, domid);
++}
++
++/* Issue the set_gdt hypercall with @frame_list and @entries. */
++static inline int __must_check
++HYPERVISOR_set_gdt(
++      unsigned long *frame_list, unsigned int entries)
++{
++      return _hypercall2(int, set_gdt, frame_list, entries);
++}
++
++/* Issue the stack_switch hypercall with @ss and @esp. */
++static inline int __must_check
++HYPERVISOR_stack_switch(
++      unsigned long ss, unsigned long esp)
++{
++      return _hypercall2(int, stack_switch, ss, esp);
++}
++
++/*
++ * Issue the fpu_taskswitch hypercall with @set.  Unlike most wrappers
++ * in this file it is not marked __must_check.
++ */
++static inline int
++HYPERVISOR_fpu_taskswitch(
++      int set)
++{
++      return _hypercall1(int, fpu_taskswitch, set);
++}
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++/* Issue the sched_op_compat hypercall with @cmd and @arg. */
++static inline int __must_check
++HYPERVISOR_sched_op_compat(
++      int cmd, unsigned long arg)
++{
++      return _hypercall2(int, sched_op_compat, cmd, arg);
++}
++#endif
++
++/* Issue the sched_op hypercall with @cmd and the argument pointer @arg. */
++static inline int __must_check
++HYPERVISOR_sched_op(
++      int cmd, void *arg)
++{
++      return _hypercall2(int, sched_op, cmd, arg);
++}
++
++/*
++ * Issue the platform_op hypercall.  The caller's structure is modified:
++ * its interface_version field is set to XENPF_INTERFACE_VERSION first.
++ */
++static inline int __must_check
++HYPERVISOR_platform_op(
++      struct xen_platform_op *platform_op)
++{
++      platform_op->interface_version = XENPF_INTERFACE_VERSION;
++      return _hypercall1(int, platform_op, platform_op);
++}
++
++struct xen_mc;
++/*
++ * Issue the mca hypercall.  The caller's structure is modified: its
++ * interface_version field is set to XEN_MCA_INTERFACE_VERSION first.
++ */
++static inline int __must_check
++HYPERVISOR_mca(
++      struct xen_mc *mc_op)
++{
++      mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
++      return _hypercall1(int, mca, mc_op);
++}
++
++/* Issue the set_debugreg hypercall with @reg and @value. */
++static inline int __must_check
++HYPERVISOR_set_debugreg(
++      unsigned int reg, unsigned long value)
++{
++      return _hypercall2(int, set_debugreg, reg, value);
++}
++
++/* Issue the get_debugreg hypercall for @reg; the result is an unsigned long. */
++static inline unsigned long __must_check
++HYPERVISOR_get_debugreg(
++      unsigned int reg)
++{
++      return _hypercall1(unsigned long, get_debugreg, reg);
++}
++
++/*
++ * Issue the memory_op hypercall.  While arch_use_lazy_mmu_mode() is
++ * true, xen_multicall_flush() is called before the hypercall is made.
++ */
++static inline int __must_check
++HYPERVISOR_memory_op(
++      unsigned int cmd, void *arg)
++{
++      if (arch_use_lazy_mmu_mode())
++              xen_multicall_flush();
++      return _hypercall2(int, memory_op, cmd, arg);
++}
++
++/* Issue the multicall hypercall with @call_list and @nr_calls. */
++static inline int __must_check
++HYPERVISOR_multicall(
++      multicall_entry_t *call_list, unsigned int nr_calls)
++{
++      return _hypercall2(int, multicall, call_list, nr_calls);
++}
++
++/*
++ * Issue the event_channel_op hypercall with @cmd and @arg.
++ *
++ * When CONFIG_XEN_COMPAT <= 0x030002, a -ENOSYS result is retried
++ * through event_channel_op_compat: @arg is copied into a struct
++ * evtchn_op for the call and copied back out afterwards.
++ */
++static inline int __must_check
++HYPERVISOR_event_channel_op(
++      int cmd, void *arg)
++{
++#if CONFIG_XEN_COMPAT <= 0x030002
++      struct evtchn_op compat;
++#endif
++      int rc;
++
++      rc = _hypercall2(int, event_channel_op, cmd, arg);
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++      /* Any result other than -ENOSYS is final. */
++      if (!unlikely(rc == -ENOSYS))
++              return rc;
++
++      compat.cmd = cmd;
++      memcpy(&compat.u, arg, sizeof(compat.u));
++      rc = _hypercall1(int, event_channel_op_compat, &compat);
++      memcpy(arg, &compat.u, sizeof(compat.u));
++#endif
++
++      return rc;
++}
++
++/* Issue the xen_version hypercall with @cmd and @arg. */
++static inline int __must_check
++HYPERVISOR_xen_version(
++      int cmd, void *arg)
++{
++      return _hypercall2(int, xen_version, cmd, arg);
++}
++
++/* Issue the console_io hypercall with @cmd, @count and @str. */
++static inline int __must_check
++HYPERVISOR_console_io(
++      int cmd, unsigned int count, char *str)
++{
++      return _hypercall3(int, console_io, cmd, count, str);
++}
++
++/*
++ * Issue the physdev_op hypercall with @cmd and @arg.
++ *
++ * When CONFIG_XEN_COMPAT <= 0x030002, a -ENOSYS result is retried
++ * through physdev_op_compat: @arg is copied into a struct physdev_op
++ * for the call and copied back out afterwards.
++ */
++static inline int __must_check
++HYPERVISOR_physdev_op(
++      int cmd, void *arg)
++{
++#if CONFIG_XEN_COMPAT <= 0x030002
++      struct physdev_op compat;
++#endif
++      int rc;
++
++      rc = _hypercall2(int, physdev_op, cmd, arg);
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++      /* Any result other than -ENOSYS is final. */
++      if (!unlikely(rc == -ENOSYS))
++              return rc;
++
++      compat.cmd = cmd;
++      memcpy(&compat.u, arg, sizeof(compat.u));
++      rc = _hypercall1(int, physdev_op_compat, &compat);
++      memcpy(arg, &compat.u, sizeof(compat.u));
++#endif
++
++      return rc;
++}
++
++/*
++ * Issue the grant_table_op hypercall with @cmd, @uop and @count.
++ *
++ * While arch_use_lazy_mmu_mode() is true, xen_multicall_flush() runs
++ * first.  gnttab_pre_map_adjust() may request a fixup, in which case a
++ * successful (zero) hypercall result is replaced by the result of
++ * gnttab_post_map_adjust().
++ */
++static inline int __must_check
++HYPERVISOR_grant_table_op(
++      unsigned int cmd, void *uop, unsigned int count)
++{
++      bool fixup = false;
++      int rc;
++
++      if (arch_use_lazy_mmu_mode())
++              xen_multicall_flush();
++      /*
++       * The "if" below only exists when GNTTABOP_map_grant_ref is a
++       * preprocessor macro; without it the pre-map adjustment is
++       * called for every cmd.
++       */
++#ifdef GNTTABOP_map_grant_ref
++      if (cmd == GNTTABOP_map_grant_ref)
++#endif
++              fixup = gnttab_pre_map_adjust(cmd, uop, count);
++      rc = _hypercall3(int, grant_table_op, cmd, uop, count);
++      if (rc == 0 && fixup)
++              rc = gnttab_post_map_adjust(uop, count);
++      return rc;
++}
++
++/* Issue the vm_assist hypercall with @cmd and @type. */
++static inline int __must_check
++HYPERVISOR_vm_assist(
++      unsigned int cmd, unsigned int type)
++{
++      return _hypercall2(int, vm_assist, cmd, type);
++}
++
++/* Issue the vcpu_op hypercall with @cmd, @vcpuid and @extra_args. */
++static inline int __must_check
++HYPERVISOR_vcpu_op(
++      int cmd, unsigned int vcpuid, void *extra_args)
++{
++      return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
++}
++
++/*
++ * Request a suspend: issues sched_op with SCHEDOP_shutdown and a
++ * sched_shutdown whose reason is SHUTDOWN_suspend, passing @srec as a
++ * third hypercall argument.  When CONFIG_XEN_COMPAT <= 0x030002, a
++ * -ENOSYS result is retried through sched_op_compat.
++ */
++static inline int __must_check
++HYPERVISOR_suspend(
++      unsigned long srec)
++{
++      struct sched_shutdown sched_shutdown = {
++              .reason = SHUTDOWN_suspend
++      };
++
++      int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
++                           &sched_shutdown, srec);
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++      /* The compat call takes the reason by value, not via a structure. */
++      if (rc == -ENOSYS)
++              rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
++                               SHUTDOWN_suspend, srec);
++#endif
++
++      return rc;
++}
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++/* Issue the nmi_op hypercall with @op and @arg; not marked __must_check. */
++static inline int
++HYPERVISOR_nmi_op(
++      unsigned long op, void *arg)
++{
++      return _hypercall2(int, nmi_op, op, arg);
++}
++#endif
++
++#ifndef CONFIG_XEN
++/* Issue the hvm_op hypercall with @op and @arg; the result is an unsigned long. */
++static inline unsigned long __must_check
++HYPERVISOR_hvm_op(
++    int op, void *arg)
++{
++    return _hypercall2(unsigned long, hvm_op, op, arg);
++}
++#endif
++
++/* Issue the callback_op hypercall with @cmd and the read-only @arg. */
++static inline int __must_check
++HYPERVISOR_callback_op(
++      int cmd, const void *arg)
++{
++      return _hypercall2(int, callback_op, cmd, arg);
++}
++
++/* Issue the xenoprof_op hypercall with @op and @arg. */
++static inline int __must_check
++HYPERVISOR_xenoprof_op(
++      int op, void *arg)
++{
++      return _hypercall2(int, xenoprof_op, op, arg);
++}
++
++/* Issue the kexec_op hypercall with @op and @args. */
++static inline int __must_check
++HYPERVISOR_kexec_op(
++      unsigned long op, void *args)
++{
++      return _hypercall2(int, kexec_op, op, args);
++}
++
++/* Issue the tmem_op hypercall with @op. */
++static inline int __must_check
++HYPERVISOR_tmem_op(
++      struct tmem_op *op)
++{
++      return _hypercall1(int, tmem_op, op);
++}
++
++#endif /* __HYPERCALL_H__ */
diff --cc arch/x86/include/mach-xen/asm/hypercall_32.h

index 0000000,0000000..3987b2e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypercall_32.h
@@@ -1,0 -1,0 +1,62 @@@
++/* Registers bound to hypercall arguments 1-5 by HYPERCALL_ARG() (32-bit). */
++#define HYPERCALL_arg1 "ebx"
++#define HYPERCALL_arg2 "ecx"
++#define HYPERCALL_arg3 "edx"
++#define HYPERCALL_arg4 "esi"
++#define HYPERCALL_arg5 "edi"
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++/*
++ * Issue the set_callbacks hypercall (32-bit form): a selector/address
++ * pair for the event callback and one for the failsafe callback.
++ */
++static inline int __must_check
++HYPERVISOR_set_callbacks(
++      unsigned long event_selector, unsigned long event_address,
++      unsigned long failsafe_selector, unsigned long failsafe_address)
++{
++      return _hypercall4(int, set_callbacks,
++                         event_selector, event_address,
++                         failsafe_selector, failsafe_address);
++}
++#endif
++
++/*
++ * Issue the set_timer_op hypercall (32-bit form).  The 64-bit @timeout
++ * is passed as two arguments: low 32 bits first, then high 32 bits.
++ */
++static inline long __must_check
++HYPERVISOR_set_timer_op(
++      u64 timeout)
++{
++      return _hypercall2(long, set_timer_op,
++                         (unsigned long)timeout,
++                         (unsigned long)(timeout>>32));
++}
++
++/*
++ * Issue the update_descriptor hypercall (32-bit form).  @ma and @desc
++ * are each passed as a low-half/high-half pair of arguments.
++ */
++static inline int __must_check
++HYPERVISOR_update_descriptor(
++      u64 ma, u64 desc)
++{
++      return _hypercall4(int, update_descriptor,
++                         (unsigned long)ma, (unsigned long)(ma>>32),
++                         (unsigned long)desc, (unsigned long)(desc>>32));
++}
++
++/*
++ * Issue the update_va_mapping hypercall (32-bit form).  While
++ * arch_use_lazy_mmu_mode() is true the request is handed to
++ * xen_multi_update_va_mapping() instead.  The PTE is passed as
++ * pte_low plus a high word that is 0 unless CONFIG_X86_PAE is set.
++ */
++static inline int __must_check
++HYPERVISOR_update_va_mapping(
++      unsigned long va, pte_t new_val, unsigned long flags)
++{
++      unsigned long pte_hi = 0;
++
++      if (arch_use_lazy_mmu_mode())
++              return xen_multi_update_va_mapping(va, new_val, flags);
++#ifdef CONFIG_X86_PAE
++      pte_hi = new_val.pte_high;
++#endif
++      return _hypercall4(int, update_va_mapping, va,
++                         new_val.pte_low, pte_hi, flags);
++}
++
++/*
++ * Issue the update_va_mapping_otherdomain hypercall (32-bit form) for
++ * @domid.  The PTE is passed as pte_low plus a high word that is 0
++ * unless CONFIG_X86_PAE is set.  Unlike HYPERVISOR_update_va_mapping()
++ * there is no lazy-MMU path here.
++ */
++static inline int __must_check
++HYPERVISOR_update_va_mapping_otherdomain(
++      unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
++{
++      unsigned long pte_hi = 0;
++#ifdef CONFIG_X86_PAE
++      pte_hi = new_val.pte_high;
++#endif
++      return _hypercall5(int, update_va_mapping_otherdomain, va,
++                         new_val.pte_low, pte_hi, flags, domid);
++}
diff --cc arch/x86/include/mach-xen/asm/hypercall_64.h

index 0000000,0000000..97d9445

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypercall_64.h
@@@ -1,0 -1,0 +1,54 @@@
++/* Registers bound to hypercall arguments 1-5 by HYPERCALL_ARG() (64-bit). */
++#define HYPERCALL_arg1 "rdi"
++#define HYPERCALL_arg2 "rsi"
++#define HYPERCALL_arg3 "rdx"
++#define HYPERCALL_arg4 "r10"
++#define HYPERCALL_arg5 "r8"
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++/*
++ * Issue the set_callbacks hypercall (64-bit form): three callback
++ * addresses, for event, failsafe and syscall, and no selectors.
++ */
++static inline int __must_check
++HYPERVISOR_set_callbacks(
++      unsigned long event_address,
++      unsigned long failsafe_address,
++      unsigned long syscall_address)
++{
++      return _hypercall3(int, set_callbacks,
++                         event_address,
++                         failsafe_address,
++                         syscall_address);
++}
++#endif
++
++/* Issue the set_timer_op hypercall (64-bit form); @timeout is one argument. */
++static inline long __must_check
++HYPERVISOR_set_timer_op(
++      u64 timeout)
++{
++      return _hypercall1(long, set_timer_op, timeout);
++}
++
++/* Issue the update_descriptor hypercall (64-bit form) with @ma and @word. */
++static inline int __must_check
++HYPERVISOR_update_descriptor(
++      unsigned long ma, unsigned long word)
++{
++      return _hypercall2(int, update_descriptor, ma, word);
++}
++
++/*
++ * Issue the update_va_mapping hypercall (64-bit form), passing
++ * new_val.pte as a single argument.  While arch_use_lazy_mmu_mode() is
++ * true the request is handed to xen_multi_update_va_mapping() instead.
++ */
++static inline int __must_check
++HYPERVISOR_update_va_mapping(
++      unsigned long va, pte_t new_val, unsigned long flags)
++{
++      if (arch_use_lazy_mmu_mode())
++              return xen_multi_update_va_mapping(va, new_val, flags);
++      return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
++}
++
++/*
++ * Issue the update_va_mapping_otherdomain hypercall (64-bit form) for
++ * @domid, passing new_val.pte as a single argument.  There is no
++ * lazy-MMU path here.
++ */
++static inline int __must_check
++HYPERVISOR_update_va_mapping_otherdomain(
++      unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
++{
++      return _hypercall4(int, update_va_mapping_otherdomain, va,
++                         new_val.pte, flags, domid);
++}
++
++/* Issue the set_segment_base hypercall with @reg and @value. */
++static inline int __must_check
++HYPERVISOR_set_segment_base(
++      int reg, unsigned long value)
++{
++      return _hypercall2(int, set_segment_base, reg, value);
++}
diff --cc arch/x86/include/mach-xen/asm/hypervisor.h

index 0000000,0000000..322b96e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypervisor.h
@@@ -1,0 -1,0 +1,394 @@@
++/******************************************************************************
++ * hypervisor.h
++ * 
++ * Linux-specific hypervisor handling.
++ * 
++ * Copyright (c) 2002-2004, K A Fraser
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __HYPERVISOR_H__
++#define __HYPERVISOR_H__
++
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/platform.h>
++#include <xen/interface/event_channel.h>
++#include <xen/interface/physdev.h>
++#include <xen/interface/sched.h>
++#include <xen/interface/nmi.h>
++#include <xen/interface/tmem.h>
++#include <xen/interface/vcpu.h>
++#include <xen/interface/arch-x86/xen-mca.h>
++#include <asm/percpu.h>
++#include <asm/ptrace.h>
++#include <asm/pgtable_types.h>
++
++extern shared_info_t *HYPERVISOR_shared_info;
++
++#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT /* vcpu_info is a per-CPU variable */
++DECLARE_PER_CPU(struct vcpu_info, vcpu_info);
++#define vcpu_info(cpu) (&per_cpu(vcpu_info, cpu))
++#define current_vcpu_info() (&__get_cpu_var(vcpu_info))
++#define vcpu_info_read(fld) percpu_read(vcpu_info.fld)
++#define vcpu_info_write(fld, val) percpu_write(vcpu_info.fld, val)
++#define vcpu_info_xchg(fld, val) percpu_xchg(vcpu_info.fld, val)
++void setup_vcpu_info(unsigned int cpu);
++void adjust_boot_vcpu_info(void);
++#else /* !CONFIG_XEN_VCPU_INFO_PLACEMENT: index the shared_info array */
++#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu))
++#ifdef CONFIG_SMP
++#define current_vcpu_info() vcpu_info(smp_processor_id())
++#else /* !CONFIG_SMP: always entry 0 */
++#define current_vcpu_info() vcpu_info(0)
++#endif /* CONFIG_SMP */
++#define vcpu_info_read(fld) (current_vcpu_info()->fld)
++#define vcpu_info_write(fld, val) (current_vcpu_info()->fld = (val))
++static inline void setup_vcpu_info(unsigned int cpu) {} /* empty stub */
++#endif /* CONFIG_XEN_VCPU_INFO_PLACEMENT */
++
++#ifdef CONFIG_X86_32
++extern unsigned long hypervisor_virt_start;
++#endif
++
++/* arch/xen/i386/kernel/setup.c */
++extern start_info_t *xen_start_info;
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
++#else
++#define is_initial_xendomain() 0
++#endif
++
++#define init_hypervisor(c) ((void)(c))
++#define init_hypervisor_platform() init_hypervisor(&boot_cpu_data)
++
++DECLARE_PER_CPU(struct vcpu_runstate_info, runstate);
++#define vcpu_running(cpu) (per_cpu(runstate.state, cpu) == RUNSTATE_running)
++
++/* arch/xen/kernel/evtchn.c */
++/* Force a proper event-channel callback from Xen. */
++void force_evtchn_callback(void);
++
++/* arch/xen/kernel/process.c */
++void xen_cpu_idle (void);
++
++/* arch/xen/i386/kernel/hypervisor.c */
++void do_hypervisor_callback(struct pt_regs *regs);
++
++/* arch/xen/i386/mm/hypervisor.c */
++/*
++ * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
++ * be MACHINE addresses.
++ */
++
++void xen_pt_switch(pgd_t *);
++void xen_new_user_pt(pgd_t *); /* x86_64 only */
++void xen_load_gs(unsigned int selector); /* x86_64 only */
++void xen_tlb_flush(void);
++void xen_invlpg(unsigned long ptr);
++
++void xen_l1_entry_update(pte_t *ptr, pte_t val);
++void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
++void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
++void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
++void xen_pgd_pin(pgd_t *);
++void xen_pgd_unpin(pgd_t *);
++
++void xen_init_pgd_pin(void);
++
++void xen_set_ldt(const void *ptr, unsigned int ents);
++
++#ifdef CONFIG_SMP
++#include <linux/cpumask.h>
++void xen_tlb_flush_all(void);
++void xen_invlpg_all(unsigned long ptr);
++void xen_tlb_flush_mask(const cpumask_t *mask);
++void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr);
++#else /* !CONFIG_SMP: _all variants alias the local ops; no _mask variants */
++#define xen_tlb_flush_all xen_tlb_flush
++#define xen_invlpg_all xen_invlpg
++#endif /* CONFIG_SMP */
++
++/* Returns zero on success else negative errno. */
++int xen_create_contiguous_region(
++    unsigned long vstart, unsigned int order, unsigned int address_bits);
++void xen_destroy_contiguous_region(
++    unsigned long vstart, unsigned int order);
++int early_create_contiguous_region(unsigned long pfn, unsigned int order,
++                                 unsigned int address_bits);
++
++struct page;
++
++int xen_limit_pages_to_max_mfn(
++      struct page *pages, unsigned int order, unsigned int address_bits);
++
++bool __cold hypervisor_oom(void);
++
++/* Turn jiffies into Xen system time. */
++u64 jiffies_to_st(unsigned long jiffies);
++
++#ifdef CONFIG_XEN_SCRUB_PAGES
++void scrub_pages(void *, unsigned int);
++#else
++#define scrub_pages(_p,_n) ((void)0)
++#endif
++
++#if defined(CONFIG_XEN) && !defined(MODULE)
++
++DECLARE_PER_CPU(bool, xen_lazy_mmu);
++
++void xen_multicall_flush(void);
++
++int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t,
++                                           unsigned long flags);
++int __must_check xen_multi_mmu_update(mmu_update_t *, unsigned int count,
++                                    unsigned int *success_count, domid_t);
++int __must_check xen_multi_mmuext_op(struct mmuext_op *, unsigned int count,
++                                   unsigned int *success_count, domid_t);
++
++#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
++static inline void arch_enter_lazy_mmu_mode(void)
++{
++      percpu_write(xen_lazy_mmu, true);
++}
++
++static inline void arch_leave_lazy_mmu_mode(void)
++{
++      percpu_write(xen_lazy_mmu, false);
++      xen_multicall_flush();
++}
++
++#define arch_use_lazy_mmu_mode() unlikely(percpu_read(xen_lazy_mmu))
++
++#if 0 /* All uses are in places potentially called asynchronously, but
++       * asynchronous code should rather not make use of lazy mode at all.
++       * Therefore, all uses of this function get commented out, proper
++       * detection of asynchronous invocations is added wherever needed,
++       * and this function is disabled to catch any new (improper) uses.
++       */
++static inline void arch_flush_lazy_mmu_mode(void)
++{
++      if (arch_use_lazy_mmu_mode())
++              xen_multicall_flush();
++}
++#endif
++
++#else /* !CONFIG_XEN || MODULE */
++
++static inline void xen_multicall_flush(void) {}
++#define arch_use_lazy_mmu_mode() false
++#define xen_multi_update_va_mapping(...) ({ BUG(); -ENOSYS; })
++#define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; })
++#define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; })
++
++#endif /* CONFIG_XEN && !MODULE */
++
++#ifdef CONFIG_XEN
++
++struct gnttab_map_grant_ref;
++bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
++                         unsigned int count);
++#if CONFIG_XEN_COMPAT < 0x030400
++int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
++#else
++static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
++                                       unsigned int count)
++{
++      BUG();
++      return -ENOSYS;
++}
++#endif
++
++#else /* !CONFIG_XEN */
++
++#define gnttab_pre_map_adjust(...) false
++#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
++
++#endif /* CONFIG_XEN */
++
++#if defined(CONFIG_X86_64)
++#define MULTI_UVMFLAGS_INDEX 2
++#define MULTI_UVMDOMID_INDEX 3
++#else
++#define MULTI_UVMFLAGS_INDEX 3
++#define MULTI_UVMDOMID_INDEX 4
++#endif
++
++#ifdef CONFIG_XEN
++#define is_running_on_xen() 1
++extern char hypercall_page[PAGE_SIZE];
++#else
++extern char *hypercall_stubs;
++#define is_running_on_xen() (!!hypercall_stubs)
++#endif
++
++#include <xen/hypercall.h>
++
++/* SCHEDOP_yield, with an -ENOSYS fallback to sched_op_compat if built in. */
++static inline int
++HYPERVISOR_yield(void)
++{
++      int ret = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (ret == -ENOSYS)
++              ret = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
++#endif
++
++      return ret;
++}
++
++/* SCHEDOP_block, with an -ENOSYS fallback to sched_op_compat if built in. */
++static inline int
++HYPERVISOR_block(void)
++{
++      int ret = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (ret == -ENOSYS)
++              ret = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
++#endif
++
++      return ret;
++}
++
++/* Request a shutdown for @reason.  Should the hypercalls return, BUG unless
++ * @reason is SHUTDOWN_crash, then spin forever. */
++static inline void __noreturn
++HYPERVISOR_shutdown(unsigned int reason)
++{
++      struct sched_shutdown arg = { .reason = reason };
++
++      VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &arg));
++#if CONFIG_XEN_COMPAT <= 0x030002
++      VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason));
++#endif
++      /* Don't recurse needlessly. */
++      BUG_ON(reason != SHUTDOWN_crash);
++      for (;;)
++              ;
++}
++
++static inline int __must_check
++HYPERVISOR_poll(
++      evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
++{
++      int rc;
++      struct sched_poll sched_poll = {
++              .nr_ports = nr_ports,
++              .timeout = jiffies_to_st(timeout)
++      };
++      set_xen_guest_handle(sched_poll.ports, ports);
++
++      rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (rc == -ENOSYS)
++              rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
++#endif
++
++      return rc;
++}
++
++/* SCHEDOP_poll on @ports with the timeout field left zero-initialised.  On
++ * -ENOSYS the compat path (if built in) issues SCHEDOP_yield instead. */
++static inline int __must_check
++HYPERVISOR_poll_no_timeout(evtchn_port_t *ports, unsigned int nr_ports)
++{
++      struct sched_poll op = { .nr_ports = nr_ports };
++      int ret;
++
++      set_xen_guest_handle(op.ports, ports);
++
++      ret = HYPERVISOR_sched_op(SCHEDOP_poll, &op);
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (ret == -ENOSYS)
++              ret = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
++#endif
++
++      return ret;
++}
++
++#ifdef CONFIG_XEN
++
++/* Fill @mcl for update_va_mapping; the PTE takes 1 arg slot (64-bit) or 2. */
++static inline void
++MULTI_update_va_mapping(multicall_entry_t *mcl, unsigned long va,
++                      pte_t new_val, unsigned long flags)
++{
++    mcl->op = __HYPERVISOR_update_va_mapping;
++    mcl->args[0] = va;
++#ifdef CONFIG_X86_64
++    mcl->args[1] = new_val.pte;
++#elif defined(CONFIG_X86_PAE)
++    mcl->args[1] = new_val.pte_low;
++    mcl->args[2] = new_val.pte_high;
++#else
++    mcl->args[1] = new_val.pte_low;
++    mcl->args[2] = 0;
++#endif
++    mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
++}
++
++/* Fill @mcl with an mmu_update request: (req, count, success_count, domid). */
++static inline void
++MULTI_mmu_update(multicall_entry_t *mcl, mmu_update_t *req, unsigned int count,
++               unsigned int *success_count, domid_t domid)
++{
++    mcl->op = __HYPERVISOR_mmu_update;
++    mcl->args[0] = (unsigned long)req;
++    mcl->args[1] = count;
++    mcl->args[2] = (unsigned long)success_count;
++    mcl->args[3] = domid;
++}
++
++static inline void
++MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
++                   void *uop, unsigned int count)
++{
++    mcl->op = __HYPERVISOR_grant_table_op;
++    mcl->args[0] = cmd;
++    mcl->args[1] = (unsigned long)uop;
++    mcl->args[2] = count;
++}
++
++#else /* !defined(CONFIG_XEN) */
++
++/* Multicalls not supported for HVM guests. */
++#define MULTI_update_va_mapping(a,b,c,d) ((void)0)
++#define MULTI_grant_table_op(a,b,c,d) ((void)0)
++
++#endif
++
++#define uvm_multi(cpumask) ((unsigned long)cpumask_bits(cpumask) | UVMF_MULTI)
++
++#ifdef LINUX
++/* drivers/staging/ use Windows-style types, including VOID */
++#undef VOID
++#endif
++
++#endif /* __HYPERVISOR_H__ */
diff --cc arch/x86/include/mach-xen/asm/io.h

index 0000000,0000000..36de970

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/io.h
@@@ -1,0 -1,0 +1,363 @@@
++#ifndef _ASM_X86_IO_H
++#define _ASM_X86_IO_H
++
++/*
++ * This file contains the definitions for the x86 IO instructions
++ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
++ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
++ * versions of the single-IO instructions (inb_p/inw_p/..).
++ *
++ * This file is not meant to be obfuscating: it's just complicated
++ * to (a) handle it all in a way that makes gcc able to optimize it
++ * as well as possible and (b) trying to avoid writing the same thing
++ * over and over again with slight variations and possibly making a
++ * mistake somewhere.
++ */
++
++/*
++ * Thanks to James van Artsdalen for a better timing-fix than
++ * the two short jumps: using outb's to a nonexistent port seems
++ * to guarantee better timings even on fast machines.
++ *
++ * On the other hand, I'd like to be sure of a non-existent port:
++ * I feel a bit unsafe about using 0x80 (should be safe, though)
++ *
++ *            Linus
++ */
++
++ /*
++  *  Bit simplified and optimized by Jan Hubicka
++  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
++  *
++  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
++  *  isa_read[wl] and isa_write[wl] fixed
++  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
++  */
++
++#define ARCH_HAS_IOREMAP_WC
++
++#include <linux/string.h>
++#include <linux/compiler.h>
++#include <asm-generic/int-ll64.h>
++#include <asm/page.h>
++#ifdef __KERNEL__
++#include <asm/fixmap.h>
++#endif
++
++#define build_mmio_read(name, size, type, reg, barrier) \
++static inline type name(const volatile void __iomem *addr) \
++{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
++:"m" (*(volatile type __force *)addr) barrier); return ret; }
++
++#define build_mmio_write(name, size, type, reg, barrier) \
++static inline void name(type val, volatile void __iomem *addr) \
++{ asm volatile("mov" size " %0,%1": :reg (val), \
++"m" (*(volatile type __force *)addr) barrier); }
++
++build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
++build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
++build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
++
++build_mmio_read(__readb, "b", unsigned char, "=q", )
++build_mmio_read(__readw, "w", unsigned short, "=r", )
++build_mmio_read(__readl, "l", unsigned int, "=r", )
++
++build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
++build_mmio_write(writew, "w", unsigned short, "r", :"memory")
++build_mmio_write(writel, "l", unsigned int, "r", :"memory")
++
++build_mmio_write(__writeb, "b", unsigned char, "q", )
++build_mmio_write(__writew, "w", unsigned short, "r", )
++build_mmio_write(__writel, "l", unsigned int, "r", )
++
++#define readb_relaxed(a) __readb(a)
++#define readw_relaxed(a) __readw(a)
++#define readl_relaxed(a) __readl(a)
++#define __raw_readb __readb
++#define __raw_readw __readw
++#define __raw_readl __readl
++
++#define __raw_writeb __writeb
++#define __raw_writew __writew
++#define __raw_writel __writel
++
++#define mmiowb() barrier()
++
++#ifdef CONFIG_X86_64
++
++build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
++build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
++
++#else
++
++/* Non-64-bit variant: two separate readl()s, low word first, combined
++ * into one u64 (so this is not a single 64-bit access). */
++static inline __u64 readq(const volatile void __iomem *addr)
++{
++      const volatile u32 __iomem *word = addr;
++      u32 lo = readl(word);
++      u32 hi = readl(word + 1);
++
++      return ((u64)hi << 32) + lo;
++}
++
++static inline void writeq(__u64 val, volatile void __iomem *addr)
++{
++      writel(val, addr);
++      writel(val >> 32, addr+4);
++}
++
++#endif
++
++#define readq_relaxed(a)      readq(a)
++
++#define __raw_readq(a)                readq(a)
++#define __raw_writeq(val, addr)       writeq(val, addr)
++
++/* Let people know that we have them */
++#define readq                 readq
++#define writeq                        writeq
++
++/**
++ *    virt_to_phys    -       map virtual addresses to physical
++ *    @address: address to remap
++ *
++ *    The returned physical address is the physical (CPU) mapping for
++ *    the memory address given. It is only valid to use this function on
++ *    addresses directly mapped or allocated via kmalloc.
++ *
++ *    This function does not give bus mappings for DMA transfers. In
++ *    almost all conceivable cases a device driver should not be using
++ *    this function
++ */
++
++static inline phys_addr_t virt_to_phys(volatile void *address)
++{
++      return __pa(address);
++}
++
++/**
++ *    phys_to_virt    -       map physical address to virtual
++ *    @address: address to remap
++ *
++ *    The returned virtual address is a current CPU mapping for
++ *    the memory address given. It is only valid to use this function on
++ *    addresses that have a kernel mapping
++ *
++ *    This function does not handle bus mappings for DMA transfers. In
++ *    almost all conceivable cases a device driver should not be using
++ *    this function
++ */
++
++static inline void *phys_to_virt(phys_addr_t address)
++{
++      return __va(address);
++}
++
++/*
++ * Change "struct page" to physical address.
++ */
++#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
++#undef page_to_phys
++#define page_to_phys(page)     (phys_to_machine(page_to_pseudophys(page)))
++#define page_to_bus(page)      (phys_to_machine(page_to_pseudophys(page)))
++
++/*
++ * ISA I/O bus memory addresses are 1:1 with the physical address.
++ * However, we truncate the address to unsigned int to avoid undesirable
++ * promotions in legacy drivers.
++ */
++#define isa_virt_to_bus(_x) ({ \
++      unsigned long _va_ = (unsigned long)(_x); \
++      _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
++      ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
++      : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
++#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
++
++/*
++ * However PCI ones are not necessarily 1:1 and therefore these interfaces
++ * are forbidden in portable PCI drivers.
++ *
++ * Allow them on x86 for legacy drivers, though.
++ */
++#define virt_to_bus(_x) phys_to_machine(__pa(_x))
++#define bus_to_virt(_x) __va(machine_to_phys(_x))
++
++/**
++ * ioremap     -   map bus memory into CPU space
++ * @offset:    bus address of the memory
++ * @size:      size of the resource to map
++ *
++ * ioremap performs a platform specific sequence of operations to
++ * make bus memory CPU accessible via the readb/readw/readl/writeb/
++ * writew/writel functions and the other mmio helpers. The returned
++ * address is not guaranteed to be usable directly as a virtual
++ * address.
++ *
++ * If the area you are trying to map is a PCI BAR you should have a
++ * look at pci_iomap().
++ */
++extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
++extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
++extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
++                              unsigned long prot_val);
++
++/*
++ * The default ioremap() behavior is non-cached:
++ */
++static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
++{
++      return ioremap_nocache(offset, size);
++}
++
++extern void iounmap(volatile void __iomem *addr);
++
++extern void set_iounmap_nonlazy(void);
++
++#ifdef __KERNEL__
++
++#include <asm-generic/iomap.h>
++
++#include <linux/vmalloc.h>
++
++/*
++ * Convert a virtual cached pointer to an uncached pointer
++ */
++#define xlate_dev_kmem_ptr(p) p
++
++static inline void
++memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
++{
++      memset((void __force *)addr, val, count);
++}
++
++static inline void
++memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
++{
++      memcpy(dst, (const void __force *)src, count);
++}
++
++static inline void
++memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
++{
++      memcpy((void __force *)dst, src, count);
++}
++
++/*
++ *    Cache management
++ *
++ *    This is needed for two cases
++ *    1. Out of order aware processors
++ *    2. Accidentally out of order processors (PPro errata #51)
++ */
++
++static inline void flush_write_buffers(void)
++{
++#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
++      asm volatile("lock; addl $0,0(%%esp)": : :"memory");
++#endif
++}
++
++#endif /* __KERNEL__ */
++
++extern void native_io_delay(void);
++
++extern int io_delay_type;
++extern void io_delay_init(void);
++
++static inline void slow_down_io(void)
++{
++      native_io_delay();
++#ifdef REALLY_SLOW_IO
++      native_io_delay();
++      native_io_delay();
++      native_io_delay();
++#endif
++}
++
++#define BUILDIO(bwl, bw, type)                                                \
++static inline void out##bwl(unsigned type value, int port)            \
++{                                                                     \
++      asm volatile("out" #bwl " %" #bw "0, %w1"                       \
++                   : : "a"(value), "Nd"(port));                       \
++}                                                                     \
++                                                                      \
++static inline unsigned type in##bwl(int port)                         \
++{                                                                     \
++      unsigned type value;                                            \
++      asm volatile("in" #bwl " %w1, %" #bw "0"                        \
++                   : "=a"(value) : "Nd"(port));                       \
++      return value;                                                   \
++}                                                                     \
++                                                                      \
++static inline void out##bwl##_p(unsigned type value, int port)                \
++{                                                                     \
++      out##bwl(value, port);                                          \
++      slow_down_io();                                                 \
++}                                                                     \
++                                                                      \
++static inline unsigned type in##bwl##_p(int port)                     \
++{                                                                     \
++      unsigned type value = in##bwl(port);                            \
++      slow_down_io();                                                 \
++      return value;                                                   \
++}                                                                     \
++                                                                      \
++static inline void outs##bwl(int port, const void *addr, unsigned long count) \
++{                                                                     \
++      asm volatile("rep; outs" #bwl                                   \
++                   : "+S"(addr), "+c"(count) : "d"(port));            \
++}                                                                     \
++                                                                      \
++static inline void ins##bwl(int port, void *addr, unsigned long count)        \
++{                                                                     \
++      asm volatile("rep; ins" #bwl                                    \
++                   : "+D"(addr), "+c"(count) : "d"(port));            \
++}
++
++BUILDIO(b, b, char)
++BUILDIO(w, w, short)
++BUILDIO(l, , int)
++
++#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
++
++/* We will be supplying our own /dev/mem implementation */
++#define ARCH_HAS_DEV_MEM
++
++#define bvec_to_pseudophys(bv)         (page_to_pseudophys((bv)->bv_page) + \
++                                (unsigned long)(bv)->bv_offset)
++
++#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
++      (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
++       && bvec_to_pseudophys(vec1) + (vec1)->bv_len \
++          == bvec_to_pseudophys(vec2))
++
++#endif
++
++extern void *xlate_dev_mem_ptr(unsigned long phys);
++extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
++
++extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
++                                   unsigned long prot_val);
++extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
++
++/*
++ * early_ioremap() and early_iounmap() are for temporary early boot-time
++ * mappings, before the real ioremap() is functional.
++ * A boot-time mapping is currently limited to at most 16 pages.
++ */
++extern void early_ioremap_init(void);
++extern void early_ioremap_reset(void);
++extern void __iomem *early_ioremap(resource_size_t phys_addr,
++                                 unsigned long size);
++extern void __iomem *early_memremap(resource_size_t phys_addr,
++                                  unsigned long size);
++extern void __iomem *early_memremap_ro(resource_size_t phys_addr,
++                                     unsigned long size);
++extern void early_iounmap(void __iomem *addr, unsigned long size);
++extern void fixup_early_ioremap(void);
++extern bool is_early_ioremap_ptep(pte_t *ptep);
++
++#define IO_SPACE_LIMIT 0xffff
++
++#endif /* _ASM_X86_IO_H */
diff --cc arch/x86/include/mach-xen/asm/ipi.h

index 0000000,0000000..4bdda1d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/ipi.h
@@@ -1,0 -1,0 +1,13 @@@
++#ifndef _ASM_X86_IPI_H
++#define _ASM_X86_IPI_H
++
++#include <asm/hw_irq.h>
++#include <asm/smp.h>
++
++void xen_send_IPI_mask(const struct cpumask *, int vector);
++void xen_send_IPI_mask_allbutself(const struct cpumask *, int vector);
++void xen_send_IPI_allbutself(int vector);
++void xen_send_IPI_all(int vector);
++void xen_send_IPI_self(int vector);
++
++#endif /* _ASM_X86_IPI_H */
diff --cc arch/x86/include/mach-xen/asm/irq_vectors.h

index 0000000,0000000..c1a4484

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/irq_vectors.h
@@@ -1,0 -1,0 +1,107 @@@
++#ifndef _ASM_X86_IRQ_VECTORS_H
++#define _ASM_X86_IRQ_VECTORS_H
++
++#define MCE_VECTOR                    0x12
++
++#define IA32_SYSCALL_VECTOR           0x80
++#ifdef CONFIG_X86_32
++# define SYSCALL_VECTOR                       0x80
++#endif
++
++#define RESCHEDULE_VECTOR             0
++#define CALL_FUNCTION_VECTOR          1
++#define NMI_VECTOR                    0x02
++#define CALL_FUNC_SINGLE_VECTOR               3
++#define REBOOT_VECTOR                 4
++#ifdef CONFIG_IRQ_WORK
++#define IRQ_WORK_VECTOR                       5
++#define NR_IPIS                               6
++#else
++#define NR_IPIS                               5
++#endif
++
++/*
++ * The maximum number of vectors supported by i386 processors
++ * is limited to 256. For processors other than i386, NR_VECTORS
++ * should be changed accordingly.
++ */
++#define NR_VECTORS                     256
++
++#define       FIRST_VM86_IRQ                     3
++#define LAST_VM86_IRQ                   15
++
++#ifndef __ASSEMBLY__
++static inline int invalid_vm86_irq(int irq)
++{
++      return !(irq >= FIRST_VM86_IRQ && irq <= LAST_VM86_IRQ); /* out of range */
++}
++#endif /* !__ASSEMBLY__ */
++
++/*
++ * Size the maximum number of interrupts.
++ *
++ * If the irq_desc[] array has a sparse layout, we can size things
++ * generously - it scales up linearly with the maximum number of CPUs,
++ * and the maximum number of IO-APICs, whichever is higher.
++ *
++ * In other cases we size more conservatively, to not create too large
++ * static arrays.
++ */
++
++#define NR_IRQS_LEGACY                          16
++
++/*
++ * The flat IRQ space is divided into two regions:
++ *  1. A one-to-one mapping of real physical IRQs. This space is only used
++ *     if we have physical device-access privilege. This region is at the
++ *     start of the IRQ space so that existing device drivers do not need
++ *     to be modified to translate physical IRQ numbers into our IRQ space.
++ *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
++ *     are bound using the provided bind/unbind functions.
++ */
++#define PIRQ_BASE                     0
++/* PHYSDEVOP_pirq_eoi_gmfn restriction: */
++#define PIRQ_MAX(n) ((n) < (1 << (PAGE_SHIFT + 3)) - NR_VECTORS \
++                 ? (n) : (1 << (PAGE_SHIFT + 3)) - NR_VECTORS)
++
++#define IO_APIC_VECTOR_LIMIT          PIRQ_MAX(32 * MAX_IO_APICS)
++
++#ifdef CONFIG_SPARSE_IRQ
++# define CPU_VECTOR_LIMIT             PIRQ_MAX(64 * NR_CPUS)
++#else
++# define CPU_VECTOR_LIMIT             PIRQ_MAX(32 * NR_CPUS)
++#endif
++
++#if defined(CONFIG_X86_IO_APIC)
++# ifdef CONFIG_SPARSE_IRQ
++#  define NR_PIRQS                    (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
++# else
++#  define NR_PIRQS                                    \
++      (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ?      \
++              (NR_VECTORS + CPU_VECTOR_LIMIT)  :      \
++              (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
++# endif
++#elif defined(CONFIG_XEN_PCIDEV_FRONTEND)
++# define NR_PIRQS                     (NR_VECTORS + CPU_VECTOR_LIMIT)
++#else /* !CONFIG_X86_IO_APIC: */
++# define NR_PIRQS                     NR_IRQS_LEGACY
++#endif
++
++#ifndef __ASSEMBLY__
++#ifdef CONFIG_SPARSE_IRQ
++extern int nr_pirqs;
++#else
++# define nr_pirqs                     NR_PIRQS
++#endif
++#endif
++
++#define DYNIRQ_BASE                   (PIRQ_BASE + nr_pirqs)
++#ifdef CONFIG_SPARSE_IRQ
++#define NR_DYNIRQS                    (CPU_VECTOR_LIMIT + CONFIG_XEN_NR_GUEST_DEVICES)
++#else
++#define NR_DYNIRQS                    (64 + CONFIG_XEN_NR_GUEST_DEVICES)
++#endif
++
++#define NR_IRQS                               (NR_PIRQS + NR_DYNIRQS)
++
++#endif /* _ASM_X86_IRQ_VECTORS_H */
diff --cc arch/x86/include/mach-xen/asm/irqflags.h

index 0000000,0000000..95d336f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/irqflags.h
@@@ -1,0 -1,0 +1,212 @@@
++#ifndef _X86_IRQFLAGS_H_
++#define _X86_IRQFLAGS_H_
++
++#include <asm/smp-processor-id.h>
++
++#ifndef __ASSEMBLY__
++#include <linux/types.h>
++#include <xen/interface/vcpu.h>
++/*
++ * The use of 'barrier' in the following reflects their use as local-lock
++ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
++ * critical operations are executed. All critical operations must complete
++ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
++ * includes these barriers, for example.
++ */
++
++#define xen_save_fl(void) vcpu_info_read(evtchn_upcall_mask)
++
++/*
++ * Store (f) into the current vCPU's evtchn_upcall_mask. When the stored
++ * value is zero, evtchn_upcall_pending is re-checked after a compiler
++ * barrier and force_evtchn_callback() is invoked if it is set.
++ */
++#define xen_restore_fl(f)                                     \
++do {                                                          \
++      vcpu_info_t *_vcpu;                                     \
++      barrier();                                              \
++      _vcpu = current_vcpu_info();                            \
++      if ((_vcpu->evtchn_upcall_mask = (f)) == 0) {           \
++              barrier(); /* unmask then check (avoid races) */\
++              if (unlikely(_vcpu->evtchn_upcall_pending))     \
++                      force_evtchn_callback();                \
++      }                                                       \
++} while (0)
++
++/*
++ * Set evtchn_upcall_mask to 1 via vcpu_info_write(), then issue a compiler
++ * barrier so that the following critical section is not moved ahead of
++ * the store.
++ */
++#define xen_irq_disable()                                     \
++do {                                                          \
++      vcpu_info_write(evtchn_upcall_mask, 1);                 \
++      barrier();                                              \
++} while (0)
++
++/*
++ * Clear the current vCPU's evtchn_upcall_mask and then, after a compiler
++ * barrier, call force_evtchn_callback() if evtchn_upcall_pending is set.
++ * The leading barrier keeps the preceding critical section ahead of the
++ * unmask.
++ */
++#define xen_irq_enable()                                      \
++do {                                                          \
++      vcpu_info_t *_vcpu;                                     \
++      barrier();                                              \
++      _vcpu = current_vcpu_info();                            \
++      _vcpu->evtchn_upcall_mask = 0;                          \
++      barrier(); /* unmask then check (avoid races) */        \
++      if (unlikely(_vcpu->evtchn_upcall_pending))             \
++              force_evtchn_callback();                        \
++} while (0)
++
++#define arch_local_save_flags() xen_save_fl()
++
++#define arch_local_irq_restore(flags) xen_restore_fl(flags)
++
++#define arch_local_irq_disable()      xen_irq_disable()
++
++#define arch_local_irq_enable() xen_irq_enable()
++
++/*
++ * Used in the idle loop; sti takes one instruction cycle
++ * to complete:
++ */
++#define arch_safe_halt HYPERVISOR_block
++
++/*
++ * Used when interrupts are already enabled or to
++ * shutdown the processor:
++ */
++#define halt() VOID(irqs_disabled()                                   \
++                  ? HYPERVISOR_vcpu_op(VCPUOP_down,                   \
++                                       smp_processor_id(), NULL)      \
++                  : 0)
++
++/*
++ * For spinlocks, etc:
++ */
++#define arch_local_irq_save()                                         \
++({                                                                    \
++      unsigned long flags = arch_local_save_flags();                  \
++                                                                      \
++      arch_local_irq_disable();                                       \
++                                                                      \
++      flags;                                                          \
++})
++#else
++
++/* Offsets into shared_info_t. */
++#define evtchn_upcall_pending         /* 0 */
++#define evtchn_upcall_mask            1
++
++#ifdef CONFIG_X86_64
++# define __REG_si %rsi
++# define __CPU_num PER_CPU_VAR(cpu_number)
++#else
++# define __REG_si %esi
++# define __CPU_num TI_cpu(%ebp)
++#endif
++
++#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
++
++#define GET_VCPU_INFO         PER_CPU(vcpu_info, __REG_si)
++#define __DISABLE_INTERRUPTS  movb $1,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask)
++#define __ENABLE_INTERRUPTS   movb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask)
++#define __TEST_PENDING                cmpb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_pending+0)
++#define DISABLE_INTERRUPTS(clb)       __DISABLE_INTERRUPTS
++#define ENABLE_INTERRUPTS(clb)        __ENABLE_INTERRUPTS
++
++#define __SIZEOF_DISABLE_INTERRUPTS 8
++#define __SIZEOF_TEST_PENDING 8
++
++#else /* CONFIG_XEN_VCPU_INFO_PLACEMENT */
++
++#define sizeof_vcpu_shift     6
++
++#ifdef CONFIG_SMP
++#define GET_VCPU_INFO         movl __CPU_num,%esi                     ; \
++                              shl $sizeof_vcpu_shift,%esi             ; \
++                              add HYPERVISOR_shared_info,__REG_si
++#else
++#define GET_VCPU_INFO         mov HYPERVISOR_shared_info,__REG_si
++#endif
++
++#define __DISABLE_INTERRUPTS  movb $1,evtchn_upcall_mask(__REG_si)
++#define __ENABLE_INTERRUPTS   movb $0,evtchn_upcall_mask(__REG_si)
++#define __TEST_PENDING                testb $0xFF,evtchn_upcall_pending(__REG_si)
++#define DISABLE_INTERRUPTS(clb)       GET_VCPU_INFO                           ; \
++                              __DISABLE_INTERRUPTS
++#define ENABLE_INTERRUPTS(clb)        GET_VCPU_INFO                           ; \
++                              __ENABLE_INTERRUPTS
++
++#define __SIZEOF_DISABLE_INTERRUPTS 4
++#define __SIZEOF_TEST_PENDING 3
++
++#endif /* CONFIG_XEN_VCPU_INFO_PLACEMENT */
++
++#ifndef CONFIG_X86_64
++#define INTERRUPT_RETURN              iret
++#define ENABLE_INTERRUPTS_SYSEXIT                                       \
++      movb $0,evtchn_upcall_mask(%esi) /* __ENABLE_INTERRUPTS */      ; \
++sysexit_scrit:        /**** START OF SYSEXIT CRITICAL REGION ****/            ; \
++      cmpb $0,evtchn_upcall_pending(%esi) /* __TEST_PENDING */        ; \
++      jnz  14f        /* process more events if necessary... */       ; \
++      movl PT_ESI(%esp), %esi                                         ; \
++      sysexit                                                         ; \
++14:   movb $1,evtchn_upcall_mask(%esi) /* __DISABLE_INTERRUPTS */     ; \
++      TRACE_IRQS_OFF                                                  ; \
++sysexit_ecrit:        /**** END OF SYSEXIT CRITICAL REGION ****/              ; \
++      mov  $__KERNEL_PERCPU, %ecx                                     ; \
++      push %esp                                                       ; \
++      mov  %ecx, %fs                                                  ; \
++      SET_KERNEL_GS %ecx                                              ; \
++      call evtchn_do_upcall                                           ; \
++      add  $4,%esp                                                    ; \
++      jmp  ret_from_intr
++#endif
++
++
++#endif /* __ASSEMBLY__ */
++
++#ifndef __ASSEMBLY__
++/*
++ * Report whether a saved flags value means "interrupts disabled":
++ * any non-zero value does. Returns 1 or 0.
++ */
++static inline int arch_irqs_disabled_flags(unsigned long flags)
++{
++      return flags ? 1 : 0;
++}
++
++#define arch_irqs_disabled()                                          \
++({                                                                    \
++      unsigned long flags = arch_local_save_flags();                  \
++                                                                      \
++      arch_irqs_disabled_flags(flags);                                \
++})
++
++#else
++
++#ifdef CONFIG_X86_64
++#define ARCH_LOCKDEP_SYS_EXIT         call lockdep_sys_exit_thunk
++#define ARCH_LOCKDEP_SYS_EXIT_IRQ     \
++      TRACE_IRQS_ON; \
++      ENABLE_INTERRUPTS(CLBR_NONE); \
++      SAVE_REST; \
++      LOCKDEP_SYS_EXIT; \
++      RESTORE_REST; \
++      __DISABLE_INTERRUPTS; \
++      TRACE_IRQS_OFF;
++
++#else
++#define ARCH_LOCKDEP_SYS_EXIT                 \
++      pushl %eax;                             \
++      pushl %ecx;                             \
++      pushl %edx;                             \
++      call lockdep_sys_exit;                  \
++      popl %edx;                              \
++      popl %ecx;                              \
++      popl %eax;
++
++#define ARCH_LOCKDEP_SYS_EXIT_IRQ
++#endif
++
++#ifdef CONFIG_TRACE_IRQFLAGS
++#  define TRACE_IRQS_ON               call trace_hardirqs_on_thunk;
++#  define TRACE_IRQS_OFF      call trace_hardirqs_off_thunk;
++#else
++#  define TRACE_IRQS_ON
++#  define TRACE_IRQS_OFF
++#endif
++#ifdef CONFIG_DEBUG_LOCK_ALLOC
++#  define LOCKDEP_SYS_EXIT    ARCH_LOCKDEP_SYS_EXIT
++#  define LOCKDEP_SYS_EXIT_IRQ        ARCH_LOCKDEP_SYS_EXIT_IRQ
++# else
++#  define LOCKDEP_SYS_EXIT
++#  define LOCKDEP_SYS_EXIT_IRQ
++# endif
++
++#endif /* __ASSEMBLY__ */
++#endif
diff --cc arch/x86/include/mach-xen/asm/mach_traps.h

index 0000000,0000000..94f750b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/mach_traps.h
@@@ -1,0 -1,0 +1,37 @@@
++/*
++ *  include/asm-xen/asm-i386/mach-xen/mach_traps.h
++ *
++ *  Machine specific NMI handling for Xen
++ */
++#ifndef _MACH_TRAPS_H
++#define _MACH_TRAPS_H
++
++#include <linux/bitops.h>
++#include <xen/interface/nmi.h>
++
++#define NMI_REASON_SERR               0x80
++#define NMI_REASON_IOCHK      0x40
++#define NMI_REASON_MASK               (NMI_REASON_SERR | NMI_REASON_IOCHK)
++
++static inline void clear_serr_error(unsigned char reason) {}
++static inline void clear_io_check_error(unsigned char reason) {}
++
++/*
++ * Build an NMI reason byte that looks like a read of port 0x61, derived
++ * from the arch.nmi_reason bits reachable through HYPERVISOR_shared_info.
++ */
++static inline unsigned char get_nmi_reason(void)
++{
++      shared_info_t *shinfo = HYPERVISOR_shared_info;
++      unsigned char port61 = 0;
++
++      /* Map each Xen reason bit onto its port 0x61 counterpart. */
++      if (test_bit(_XEN_NMIREASON_io_error, &shinfo->arch.nmi_reason))
++              port61 |= NMI_REASON_IOCHK;
++      if (test_bit(_XEN_NMIREASON_parity_error, &shinfo->arch.nmi_reason))
++              port61 |= NMI_REASON_SERR;
++
++      return port61;
++}
++
++static inline void reassert_nmi(void) {}
++
++#endif /* !_MACH_TRAPS_H */
diff --cc arch/x86/include/mach-xen/asm/maddr.h

index 0000000,0000000..03db2a0

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/maddr.h
@@@ -1,0 -1,0 +1,155 @@@
++#ifndef _X86_MADDR_H
++#define _X86_MADDR_H
++
++#include <asm/asm.h>
++#include <asm/bug.h>
++#include <xen/features.h>
++#include <xen/interface/xen.h>
++
++/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
++#define INVALID_P2M_ENTRY     (~0UL)
++#define FOREIGN_FRAME_BIT     (1UL << (BITS_PER_LONG - 1))
++#define FOREIGN_FRAME(m)      ((m) | FOREIGN_FRAME_BIT)
++
++/* Definitions for machine and pseudophysical addresses. */
++#ifdef CONFIG_X86_PAE
++typedef unsigned long long paddr_t;
++typedef unsigned long long maddr_t;
++#else
++typedef unsigned long paddr_t;
++typedef unsigned long maddr_t;
++#endif
++
++#ifdef CONFIG_XEN
++
++extern unsigned long *phys_to_machine_mapping;
++extern unsigned long  max_mapnr;
++
++#undef machine_to_phys_mapping
++extern unsigned long *machine_to_phys_mapping;
++extern unsigned int   machine_to_phys_order;
++
++/*
++ * Translate a pseudo-physical frame number into a machine frame number.
++ * With an auto-translated physmap the pfn is returned unchanged; otherwise
++ * the p2m entry is returned with FOREIGN_FRAME_BIT masked off.
++ */
++static inline unsigned long pfn_to_mfn(unsigned long pfn)
++{
++      if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
++              return pfn;
++      /* The range check is only applied once max_mapnr is non-zero. */
++      if (likely(max_mapnr))
++              BUG_ON(pfn >= max_mapnr);
++      return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
++}
++
++/*
++ * Return non-zero when pfn has a usable p2m entry, i.e. the entry is not
++ * INVALID_P2M_ENTRY. Always 1 with an auto-translated physmap.
++ */
++static inline int phys_to_machine_mapping_valid(unsigned long pfn)
++{
++      if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
++              return 1;
++      /* The range check is only applied once max_mapnr is non-zero. */
++      if (likely(max_mapnr))
++              BUG_ON(pfn >= max_mapnr);
++      return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
++}
++
++/*
++ * Translate a machine frame number into a pseudo-physical frame number.
++ *
++ * Returns mfn unchanged with an auto-translated physmap. Returns max_mapnr
++ * when mfn has bits set at or above machine_to_phys_order, or when the
++ * load from machine_to_phys_mapping[] faults (handled through the
++ * exception-table fixup below).
++ */
++static inline unsigned long mfn_to_pfn(unsigned long mfn)
++{
++      unsigned long pfn;
++
++      if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
++              return mfn;
++
++      if (unlikely((mfn >> machine_to_phys_order) != 0))
++              return max_mapnr;
++
++      /* The array access can fail (e.g., device space beyond end of RAM). */
++      /*
++       * Label 1 performs the m2p load; if it faults, the fixup at label 3
++       * loads max_mapnr instead and resumes at label 2.
++       */
++      asm (
++              "1:     "_ASM_MOV" %1,%0\n"
++              "2:\n"
++              ".section .fixup,\"ax\"\n"
++              "3:     "_ASM_MOV" %2,%0\n"
++              "       jmp  2b\n"
++              ".previous\n"
++              _ASM_EXTABLE(1b,3b)
++              : "=r" (pfn)
++              : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
++
++      return pfn;
++}
++
++/*
++ * We detect special mappings in one of two ways:
++ *  1. If the MFN is an I/O page then Xen will set the m2p entry
++ *     to be outside our maximum possible pseudophys range.
++ *  2. If the MFN belongs to a different domain then we will certainly
++ *     not have MFN in our p2m table. Conversely, if the page is ours,
++ *     then we'll have p2m(m2p(MFN))==MFN.
++ * If we detect a special mapping then it doesn't have a 'struct page'.
++ * We force !pfn_valid() by returning an out-of-range pointer.
++ *
++ * NB. These checks require that, for any MFN that is not in our reservation,
++ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
++ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
++ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
++ *
++ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
++ *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
++ *      require. In all the cases we care about, the FOREIGN_FRAME bit is
++ *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
++ */
++static inline unsigned long mfn_to_local_pfn(phys_addr_t mfn)
++{
++      unsigned long pfn = mfn_to_pfn(mfn);
++      /*
++       * Only an in-range pfn is looked up in the p2m table (the range test
++       * comes first and short-circuits). A frame of ours must round-trip,
++       * i.e. p2m(m2p(mfn)) == mfn; anything else is a special mapping.
++       */
++      if (likely(pfn < max_mapnr)
++          && likely(!xen_feature(XENFEAT_auto_translated_physmap))
++          && unlikely(phys_to_machine_mapping[pfn] != mfn))
++              return max_mapnr; /* force !pfn_valid() */
++      return pfn;
++}
++
++/*
++ * Record mfn as the machine frame backing pfn in the p2m table.
++ *
++ * With an auto-translated physmap nothing is stored; the only accepted
++ * arguments are then pfn == mfn or mfn == INVALID_P2M_ENTRY (anything else
++ * trips BUG_ON).
++ */
++static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
++{
++      /* The range check is only applied once max_mapnr is non-zero. */
++      if (likely(max_mapnr))
++              BUG_ON(pfn >= max_mapnr);
++      if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
++              BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
++              return;
++      }
++      phys_to_machine_mapping[pfn] = mfn;
++}
++
++/*
++ * Convert a pseudo-physical address to a machine address: the frame number
++ * is translated with pfn_to_mfn(), the offset within the page is kept.
++ */
++static inline maddr_t phys_to_machine(paddr_t phys)
++{
++      maddr_t mfn = pfn_to_mfn(phys >> PAGE_SHIFT);
++
++      return (mfn << PAGE_SHIFT) | (phys & ~PAGE_MASK);
++}
++
++/*
++ * Convert a machine address to a pseudo-physical address: the frame number
++ * is translated with mfn_to_pfn(), the offset within the page is kept.
++ */
++static inline paddr_t machine_to_phys(maddr_t machine)
++{
++      paddr_t pfn = mfn_to_pfn(machine >> PAGE_SHIFT);
++
++      return (pfn << PAGE_SHIFT) | (machine & ~PAGE_MASK);
++}
++
++#ifdef CONFIG_X86_32
++# include "maddr_32.h"
++#else
++# include "maddr_64.h"
++#endif
++
++#else /* !CONFIG_XEN */
++
++#define pfn_to_mfn(pfn) (pfn)
++#define mfn_to_pfn(mfn) (mfn)
++#define mfn_to_local_pfn(mfn) (mfn)
++#define set_phys_to_machine(pfn, mfn) ((void)0)
++#define phys_to_machine_mapping_valid(pfn) 1
++#define phys_to_machine(phys) ((maddr_t)(phys))
++#define machine_to_phys(mach) ((paddr_t)(mach))
++#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
++#define __pte_ma(x) __pte(x)
++
++#endif /* !CONFIG_XEN */
++
++/* VIRT <-> MACHINE conversion */
++#define virt_to_machine(v)    phys_to_machine(__pa(v))
++#define virt_to_mfn(v)                pfn_to_mfn(__pa(v) >> PAGE_SHIFT)
++#define mfn_to_virt(m)                __va(mfn_to_pfn(m) << PAGE_SHIFT)
++
++#endif /* _X86_MADDR_H */
diff --cc arch/x86/include/mach-xen/asm/maddr_32.h

index 0000000,0000000..de34d87

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/maddr_32.h
@@@ -1,0 -1,0 +1,35 @@@
++#ifndef _I386_MADDR_H
++#define _I386_MADDR_H
++
++#ifdef CONFIG_X86_PAE
++/*
++ * Convert the address part of a PTE value from pseudo-physical to machine,
++ * carrying over every bit outside PHYSICAL_PAGE_MASK unchanged.
++ */
++static inline paddr_t pte_phys_to_machine(paddr_t phys)
++{
++      /*
++       * The NX bit must not reach pfn_to_mfn() in PAE mode. x86_64 has
++       * to mask it off explicitly; on i386 it is clipped off when the
++       * argument is converted to unsigned long.
++       */
++      maddr_t mfn = pfn_to_mfn(phys >> PAGE_SHIFT);
++
++      return (mfn << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
++}
++
++/*
++ * Convert the address part of a PTE value from machine to pseudo-physical,
++ * carrying over every bit outside PHYSICAL_PAGE_MASK unchanged.
++ */
++static inline paddr_t pte_machine_to_phys(maddr_t machine)
++{
++      /*
++       * The NX bit must not reach mfn_to_pfn() in PAE mode. x86_64 has
++       * to mask it off explicitly; on i386 it is clipped off when the
++       * argument is converted to unsigned long.
++       */
++      paddr_t pfn = mfn_to_pfn(machine >> PAGE_SHIFT);
++
++      return (pfn << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
++}
++#else
++#define pte_phys_to_machine phys_to_machine
++#define pte_machine_to_phys machine_to_phys
++#endif
++
++#endif /* _I386_MADDR_H */
diff --cc arch/x86/include/mach-xen/asm/maddr_64.h

index 0000000,0000000..e2c271e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/maddr_64.h
@@@ -1,0 -1,0 +1,21 @@@
++#ifndef _X86_64_MADDR_H
++#define _X86_64_MADDR_H
++
++/*
++ * Convert the address part of a PTE value from pseudo-physical to machine.
++ * Only the bits inside PHYSICAL_PAGE_MASK are translated; all other bits
++ * are carried over unchanged.
++ */
++static inline paddr_t pte_phys_to_machine(paddr_t phys)
++{
++      maddr_t mfn = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
++
++      return (mfn << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK);
++}
++
++/*
++ * Convert the address part of a PTE value from machine to pseudo-physical.
++ * Only the bits inside PHYSICAL_PAGE_MASK are translated; all other bits
++ * are carried over unchanged.
++ */
++static inline paddr_t pte_machine_to_phys(maddr_t machine)
++{
++      paddr_t pfn = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
++
++      return (pfn << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK);
++}
++
++#endif /* _X86_64_MADDR_H */
++
diff --cc arch/x86/include/mach-xen/asm/mmu_context.h

index 0000000,0000000..d13945a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/mmu_context.h
@@@ -1,0 -1,0 +1,165 @@@
++#ifndef _ASM_X86_MMU_CONTEXT_H
++#define _ASM_X86_MMU_CONTEXT_H
++
++#include <asm/desc.h>
++#include <asm/atomic.h>
++#include <asm/pgalloc.h>
++#include <asm/tlbflush.h>
++
++void arch_exit_mmap(struct mm_struct *mm);
++void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
++
++void mm_pin(struct mm_struct *mm);
++void mm_unpin(struct mm_struct *mm);
++void mm_pin_all(void);
++
++/*
++ * Pin the page tables of 'next' unless the page holding its pgd is already
++ * flagged as pinned. 'prev' is not used.
++ */
++static inline void xen_activate_mm(struct mm_struct *prev,
++                                 struct mm_struct *next)
++{
++      if (PagePinned(virt_to_page(next->pgd)))
++              return;
++
++      mm_pin(next);
++}
++
++/*
++ * Used for LDT copy/destruction.
++ */
++int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
++void destroy_context(struct mm_struct *mm);
++
++
++/*
++ * Lazy-TLB entry hook. The body is compiled only for SMP builds without
++ * CONFIG_XEN, where it moves the per-cpu TLB state from TLBSTATE_OK to
++ * TLBSTATE_LAZY; in every other configuration this is an empty function.
++ */
++static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
++{
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
++      if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
++              percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
++#endif
++}
++
++#define prepare_arch_switch(next)     __prepare_arch_switch()
++
++static inline void __prepare_arch_switch(void)
++{
++#ifdef CONFIG_X86_32
++      /*
++       * Save away %gs. No need to save %fs, as it was saved on the
++       * stack on entry.  No need to save %es and %ds, as those are
++       * always kernel segments while inside the kernel.
++       */
++      lazy_save_gs(current->thread.gs);
++      lazy_load_gs(__KERNEL_STACK_CANARY);
++#else
++      /*
++       * Save away %es, %ds, %fs and %gs. Must happen before reload
++       * of cr3/ldt (i.e., not in __switch_to).
++       */
++      __asm__ __volatile__ (
++              "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
++              : "=m" (current->thread.es),
++                "=m" (current->thread.ds),
++                "=m" (current->thread.fsindex),
++                "=m" (current->thread.gsindex) );
++
++      if (current->thread.ds)
++              __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
++
++      if (current->thread.es)
++              __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
++
++      if (current->thread.fsindex) {
++              __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
++              current->thread.fs = 0;
++      }
++
++      if (current->thread.gsindex) {
++              load_gs_index(0);
++              current->thread.gs = 0;
++      }
++#endif
++}
++
++/*
++ * Switch this CPU from address space 'prev' to 'next'.
++ *
++ * When the two differ, the required MMU-extension operations are queued in
++ * a local array and submitted with a single HYPERVISOR_mmuext_op() call:
++ * a new base pointer, on 64-bit additionally a new user base pointer, and
++ * an LDT reload if the LDT differs. A failing hypercall is a BUG.
++ * The CPU is set in mm_cpumask(next) before the hypercall and cleared from
++ * mm_cpumask(prev) after it. 'tsk' is not used.
++ */
++static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
++                           struct task_struct *tsk)
++{
++      unsigned cpu = smp_processor_id();
++      /* Room for base pointer + LDT, plus the user base pointer on 64-bit. */
++      struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
++#ifdef CONFIG_X86_64
++      pgd_t *upgd;
++#endif
++
++      if (likely(prev != next)) {
++              /* Unless page tables are writable, 'next' must already be pinned. */
++              BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
++                     !PagePinned(virt_to_page(next->pgd)));
++
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
++              percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
++              percpu_write(cpu_tlbstate.active_mm, next);
++#endif
++              cpumask_set_cpu(cpu, mm_cpumask(next));
++
++              /* Re-load page tables: load_cr3(next->pgd) */
++              op->cmd = MMUEXT_NEW_BASEPTR;
++              op->arg1.mfn = virt_to_mfn(next->pgd);
++              op++;
++
++              /* xen_new_user_pt(next->pgd) */
++#ifdef CONFIG_X86_64
++              op->cmd = MMUEXT_NEW_USER_BASEPTR;
++              upgd = __user_pgd(next->pgd);
++              /* An mfn of 0 is passed when there is no user pgd. */
++              op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0;
++              op++;
++#endif
++
++              /*
++               * load the LDT, if the LDT is different:
++               */
++              if (unlikely(prev->context.ldt != next->context.ldt)) {
++                      /* load_LDT_nolock(&next->context) */
++                      op->cmd = MMUEXT_SET_LDT;
++                      op->arg1.linear_addr = (unsigned long)next->context.ldt;
++                      op->arg2.nr_ents     = next->context.size;
++                      op++;
++              }
++
++              /* op - _op is the number of operations queued above. */
++              BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
++
++              /* stop TLB flushes for the previous mm */
++              cpumask_clear_cpu(cpu, mm_cpumask(prev));
++      }
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
++      else {
++              percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
++              BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
++
++              if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
++                      /* We were in lazy tlb mode and leave_mm disabled
++                       * tlb flush IPI delivery. We must reload CR3
++                       * to make sure to use no freed page tables.
++                       */
++                      load_cr3(next->pgd);
++                      xen_new_user_pt(next->pgd);
++                      load_LDT_nolock(&next->context);
++              }
++      }
++#endif
++}
++
++#define activate_mm(prev, next)                       \
++do {                                          \
++      xen_activate_mm(prev, next);            \
++      switch_mm((prev), (next), NULL);        \
++} while (0);
++
++#ifdef CONFIG_X86_32
++#define deactivate_mm(tsk, mm)                        \
++do {                                          \
++      lazy_load_gs(0);                        \
++} while (0)
++#else
++#define deactivate_mm(tsk, mm)                        \
++do {                                          \
++      load_gs_index(0);                       \
++      loadsegment(fs, 0);                     \
++} while (0)
++#endif
++
++#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --cc arch/x86/include/mach-xen/asm/mutex.h

index 0000000,0000000..ee9126e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/mutex.h
@@@ -1,0 -1,0 +1,3 @@@
++#define arch_cpu_is_running(cpu) vcpu_running(cpu)
++
++#include_next <asm/mutex.h>
diff --cc arch/x86/include/mach-xen/asm/pci.h

index 0000000,0000000..d98d811

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pci.h
@@@ -1,0 -1,0 +1,173 @@@
++#ifndef _ASM_X86_PCI_H
++#define _ASM_X86_PCI_H
++
++#include <linux/mm.h> /* for struct page */
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/string.h>
++#include <asm/scatterlist.h>
++#include <asm/io.h>
++#include <asm/x86_init.h>
++
++#ifdef __KERNEL__
++
++struct pci_sysdata {
++      int             domain;         /* PCI domain */
++      int             node;           /* NUMA node */
++#ifdef CONFIG_X86_64
++      void            *iommu;         /* IOMMU private data */
++#endif
++#ifdef CONFIG_XEN_PCIDEV_FRONTEND
++      struct pcifront_device *pdev;
++#endif
++};
++
++extern int pci_routeirq;
++extern int noioapicquirk;
++extern int noioapicreroute;
++
++/* scan a bus after allocating a pci_sysdata for it */
++extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
++                                          int node);
++extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
++
++#ifdef CONFIG_PCI
++
++#ifdef CONFIG_PCI_DOMAINS
++/* Return the PCI domain stored in the bus's pci_sysdata. */
++static inline int pci_domain_nr(struct pci_bus *bus)
++{
++      return ((struct pci_sysdata *)bus->sysdata)->domain;
++}
++
++static inline int pci_proc_domain(struct pci_bus *bus)
++{
++      return pci_domain_nr(bus);
++}
++#endif
++
++/* Can be used to override the logic in pci_scan_bus for skipping
++   already-configured bus numbers - to be used for buggy BIOSes
++   or architectures with incomplete PCI setup by the loader */
++
++extern unsigned int pcibios_assign_all_busses(void);
++extern int pci_legacy_init(void);
++# ifdef CONFIG_ACPI
++#  define x86_default_pci_init pci_acpi_init
++# else
++#  define x86_default_pci_init pci_legacy_init
++# endif
++#else
++# define pcibios_assign_all_busses()  0
++# define x86_default_pci_init         NULL
++#endif
++
++#include <asm/hypervisor.h>
++#define pcibios_scan_all_fns(a, b)    (!is_initial_xendomain())
++
++extern unsigned long pci_mem_start;
++#define PCIBIOS_MIN_IO                0x1000
++#define PCIBIOS_MIN_MEM               (pci_mem_start)
++
++#define PCIBIOS_MIN_CARDBUS_IO        0x4000
++
++extern int pcibios_enabled;
++void pcibios_config_init(void);
++struct pci_bus *pcibios_scan_root(int bus);
++
++void pcibios_set_master(struct pci_dev *dev);
++void pcibios_penalize_isa_irq(int irq, int active);
++struct irq_routing_table *pcibios_get_irq_routing_table(void);
++int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
++
++
++#define HAVE_PCI_MMAP
++extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
++                             enum pci_mmap_state mmap_state,
++                             int write_combine);
++
++
++#ifdef CONFIG_PCI
++extern void early_quirks(void);
++static inline void pci_dma_burst_advice(struct pci_dev *pdev,
++                                      enum pci_dma_burst_strategy *strat,
++                                      unsigned long *strategy_parameter)
++{
++      *strat = PCI_DMA_BURST_INFINITY;
++      *strategy_parameter = ~0UL;
++}
++#else
++static inline void early_quirks(void) { }
++#endif
++
++extern void pci_iommu_alloc(void);
++
++#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN)
++/* MSI arch specific hooks */
++static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++      return x86_msi.setup_msi_irqs(dev, nvec, type);
++}
++
++static inline void x86_teardown_msi_irqs(struct pci_dev *dev)
++{
++      x86_msi.teardown_msi_irqs(dev);
++}
++
++static inline void x86_teardown_msi_irq(unsigned int irq)
++{
++      x86_msi.teardown_msi_irq(irq);
++}
++#define arch_setup_msi_irqs x86_setup_msi_irqs
++#define arch_teardown_msi_irqs x86_teardown_msi_irqs
++#define arch_teardown_msi_irq x86_teardown_msi_irq
++/* implemented in arch/x86/kernel/apic/io_apic.c */
++int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
++void native_teardown_msi_irq(unsigned int irq);
++/* default to the implementation in drivers/pci/msi.c */
++#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
++void default_teardown_msi_irqs(struct pci_dev *dev);
++#else
++#define native_setup_msi_irqs         NULL
++#define native_teardown_msi_irq               NULL
++#define default_teardown_msi_irqs     NULL
++#endif
++
++#define PCI_DMA_BUS_IS_PHYS 0
++
++#endif  /* __KERNEL__ */
++
++#ifdef CONFIG_X86_64
++#include "../../asm/pci_64.h"
++#endif
++
++void dma32_reserve_bootmem(void);
++
++/* implement the pci_ DMA API in terms of the generic device dma_ one */
++#include <asm-generic/pci-dma-compat.h>
++
++/* generic pci stuff */
++#include <asm-generic/pci.h>
++#define PCIBIOS_MAX_MEM_32 0xffffffff
++
++#ifdef CONFIG_NUMA
++/* Returns the node based on pci bus */
++/* Return the NUMA node stored in the bus's pci_sysdata. */
++static inline int __pcibus_to_node(const struct pci_bus *bus)
++{
++      return ((const struct pci_sysdata *)bus->sysdata)->node;
++}
++
++/*
++ * Return the CPU mask associated with a PCI bus: the mask of the bus's
++ * node, or cpu_online_mask when the bus reports node -1.
++ */
++static inline const struct cpumask *
++cpumask_of_pcibus(const struct pci_bus *bus)
++{
++      int nid = __pcibus_to_node(bus);
++
++      if (nid == -1)
++              return cpu_online_mask;
++
++      return cpumask_of_node(nid);
++}
++#endif
++
++#endif /* _ASM_X86_PCI_H */
diff --cc arch/x86/include/mach-xen/asm/perf_event.h

index 0000000,0000000..f7486d2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/perf_event.h
@@@ -1,0 -1,0 +1,37 @@@
++#ifndef _ASM_X86_PERF_EVENT_H
++#define _ASM_X86_PERF_EVENT_H
++
++#ifdef CONFIG_PERF_EVENTS
++
++/*
++ * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
++ * This flag is otherwise unused and ABI specified to be 0, so nobody should
++ * care what we do with it.
++ */
++#define PERF_EFLAGS_EXACT     (1UL << 3)
++
++#define perf_instruction_pointer(regs) instruction_pointer(regs)
++
++#define perf_misc_flags(regs) ({ \
++      struct pt_regs *_r_ = (regs); \
++      unsigned long _f_ = user_mode(_r_) ? PERF_RECORD_MISC_USER \
++                                         : PERF_RECORD_MISC_KERNEL; \
++      _r_->flags & PERF_EFLAGS_EXACT ? _f_ | PERF_RECORD_MISC_EXACT_IP : _f_; \
++})
++
++#include <asm/stacktrace.h>
++
++/*
++ * Fill in the caller's register state in *(regs): the instruction pointer
++ * is taken from __ip, the frame pointer from caller_frame_pointer(), and
++ * the code segment is set to __KERNEL_CS.
++ *
++ * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
++ * and the comment with PERF_EFLAGS_EXACT; flags is cleared here.
++ *
++ * Every use of the 'regs' argument is parenthesised so that any pointer
++ * expression can be passed.
++ */
++#define perf_arch_fetch_caller_regs(regs, __ip)               {       \
++      (regs)->ip = (__ip);                                    \
++      (regs)->bp = caller_frame_pointer();                    \
++      (regs)->cs = __KERNEL_CS;                               \
++      (regs)->flags = 0;                                      \
++}
++
++#endif
++
++#endif /* _ASM_X86_PERF_EVENT_H */
diff --cc arch/x86/include/mach-xen/asm/pgalloc.h

index 0000000,0000000..3879075

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgalloc.h
@@@ -1,0 -1,0 +1,159 @@@
++#ifndef _ASM_X86_PGALLOC_H
++#define _ASM_X86_PGALLOC_H
++
++#include <linux/threads.h>
++#include <linux/mm.h>         /* for struct page */
++#include <linux/pagemap.h>
++
++#include <asm/io.h>           /* for phys_to_virt and page_to_pseudophys */
++
++static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
++static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
++
++static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)        {}
++static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)        {}
++static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
++                                          unsigned long start, unsigned long count) {}
++static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)        {}
++static inline void paravirt_release_pte(unsigned long pfn) {}
++static inline void paravirt_release_pmd(unsigned long pfn) {}
++static inline void paravirt_release_pud(unsigned long pfn) {}
++
++#ifdef CONFIG_X86_64
++void early_make_page_readonly(void *va, unsigned int feature);
++pmd_t *early_get_pmd(unsigned long va);
++#define make_lowmem_page_readonly make_page_readonly
++#define make_lowmem_page_writable make_page_writable
++#endif
++
++/*
++ * Flags to use when allocating a user page table page.
++ */
++extern gfp_t __userpte_alloc_gfp;
++
++/*
++ * Allocate and free page tables.
++ */
++extern pgd_t *pgd_alloc(struct mm_struct *);
++extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
++
++extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
++extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
++
++/* Should really implement gc for free page table pages. This could be
++   done with a reference count in struct page. */
++
++static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
++{
++      BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
++      make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
++      free_page((unsigned long)pte);
++}
++
++extern void __pte_free(pgtable_t);
++static inline void pte_free(struct mm_struct *mm, struct page *pte)
++{
++      __pte_free(pte);
++}
++
++extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
++
++static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
++                                unsigned long address)
++{
++      ___pte_free_tlb(tlb, pte);
++}
++
++static inline void pmd_populate_kernel(struct mm_struct *mm,
++                                     pmd_t *pmd, pte_t *pte)
++{
++      paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
++      set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
++}
++
++static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
++                              struct page *pte)
++{
++      unsigned long pfn = page_to_pfn(pte);
++      pmd_t ent = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
++
++      paravirt_alloc_pte(mm, pfn);
++      if (PagePinned(virt_to_page(pmd))) {
++#ifndef CONFIG_HIGHPTE
++              BUG_ON(PageHighMem(pte));
++#endif
++              set_pmd(pmd, ent);
++      } else
++              *pmd = ent;
++}
++
++#define pmd_pgtable(pmd) pmd_page(pmd)
++
++#if PAGETABLE_LEVELS > 2
++extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
++extern void __pmd_free(pgtable_t);
++
++static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
++{
++      BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
++      __pmd_free(virt_to_page(pmd));
++}
++
++extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
++
++static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
++                                unsigned long address)
++{
++      ___pmd_free_tlb(tlb, pmd);
++}
++
++#ifdef CONFIG_X86_PAE
++extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
++#else /* !CONFIG_X86_PAE */
++static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
++{
++      pud_t ent = __pud(_PAGE_TABLE | __pa(pmd));
++
++      paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
++      if (PagePinned(virt_to_page(pud)))
++              set_pud(pud, ent);
++      else
++              *pud = ent;
++}
++#endif        /* CONFIG_X86_PAE */
++
++#if PAGETABLE_LEVELS > 3
++static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
++{
++      pgd_t ent = __pgd(_PAGE_TABLE | __pa(pud));
++
++      paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
++      if (unlikely(PagePinned(virt_to_page(pgd))))
++              xen_l4_entry_update(pgd, ent);
++      else
++              *__user_pgd(pgd) = *pgd = ent;
++}
++
++static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
++{
++      return (pud_t *)pmd_alloc_one(mm, addr);
++}
++
++static inline void pud_free(struct mm_struct *mm, pud_t *pud)
++{
++      BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
++      __pmd_free(virt_to_page(pud));
++}
++
++extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
++
++static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
++                                unsigned long address)
++{
++      ___pud_free_tlb(tlb, pud);
++}
++
++#endif        /* PAGETABLE_LEVELS > 3 */
++#endif        /* PAGETABLE_LEVELS > 2 */
++
++#endif /* _ASM_X86_PGALLOC_H */
diff --cc arch/x86/include/mach-xen/asm/pgtable-3level.h

index 0000000,0000000..71e906c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable-3level.h
@@@ -1,0 -1,0 +1,152 @@@
++#ifndef _ASM_X86_PGTABLE_3LEVEL_H
++#define _ASM_X86_PGTABLE_3LEVEL_H
++
++/*
++ * Intel Physical Address Extension (PAE) Mode - three-level page
++ * tables on PPro+ CPUs.
++ *
++ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
++ */
++
++#define pte_ERROR(e)                                                  \
++      printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n",                \
++              __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
++#define pmd_ERROR(e)                                                  \
++      printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n",                \
++             __FILE__, __LINE__, &(e), __pmd_val(e),                  \
++             (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
++#define pgd_ERROR(e)                                                  \
++      printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n",                \
++             __FILE__, __LINE__, &(e), __pgd_val(e),                  \
++             (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
++
++/* Rules for using set_pte: the pte being assigned *must* be
++ * either not present or in a state where the hardware will
++ * not attempt to update the pte.  In places where this is
++ * not possible, use pte_get_and_clear to obtain the old pte
++ * value and then use set_pte to update it.  -ben
++ */
++
++static inline void xen_set_pte(pte_t *ptep, pte_t pte)
++{
++      ptep->pte_high = pte.pte_high;
++      smp_wmb();
++      ptep->pte_low = pte.pte_low;
++}
++
++static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
++{
++      xen_l2_entry_update(pmdp, pmd);
++}
++
++static inline void xen_set_pud(pud_t *pudp, pud_t pud)
++{
++      xen_l3_entry_update(pudp, pud);
++}
++
++/*
++ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
++ * entry, so clear the bottom half first and enforce ordering with a compiler
++ * barrier.
++ */
++static inline void __xen_pte_clear(pte_t *ptep)
++{
++      ptep->pte_low = 0;
++      smp_wmb();
++      ptep->pte_high = 0;
++}
++
++#define xen_pmd_clear(pmd)                    \
++({                                            \
++      pmd_t *__pmdp = (pmd);                  \
++      PagePinned(virt_to_page(__pmdp))        \
++      ? set_pmd(__pmdp, __pmd(0))             \
++      : (void)(*__pmdp = __pmd(0));           \
++})
++
++static inline void __xen_pud_clear(pud_t *pudp)
++{
++      set_pud(pudp, __pud(0));
++
++      /*
++       * According to Intel App note "TLBs, Paging-Structure Caches,
++       * and Their Invalidation", April 2007, document 317080-001,
++       * section 8.1: in PAE mode we explicitly have to flush the
++       * TLB via cr3 if the top-level pgd is changed...
++       *
++       * Currently all places where pud_clear() is called either have
++       * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or
++       * pud_clear_bad()), so we don't need TLB flush here.
++       */
++}
++
++#define xen_pud_clear(pudp)                   \
++({                                            \
++      pud_t *__pudp = (pudp);                 \
++      PagePinned(virt_to_page(__pudp))        \
++      ? __xen_pud_clear(__pudp)               \
++      : (void)(*__pudp = __pud(0));           \
++})
++
++#ifdef CONFIG_SMP
++static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
++{
++      uint64_t val = __pte_val(res);
++      if (__cmpxchg64(&ptep->pte, val, 0) != val) {
++              /* xchg acts as a barrier before the setting of the high bits */
++              res.pte_low = xchg(&ptep->pte_low, 0);
++              res.pte_high = ptep->pte_high;
++              ptep->pte_high = 0;
++      }
++      return res;
++}
++#else
++#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
++#endif
++
++#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
++                       ((_pte).pte_high << (32-PAGE_SHIFT)))
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++#ifdef CONFIG_SMP
++union split_pmd {
++      struct {
++              u32 pmd_low;
++              u32 pmd_high;
++      };
++      pmd_t pmd;
++};
++static inline pmd_t xen_pmdp_get_and_clear(pmd_t *pmdp)
++{
++      union split_pmd res, *orig = (union split_pmd *)pmdp;
++
++      /* xchg acts as a barrier before setting of the high bits */
++      res.pmd_low = xchg(&orig->pmd_low, 0);
++      res.pmd_high = orig->pmd_high;
++      orig->pmd_high = 0;
++
++      return res.pmd;
++}
++#else
++#define xen_pmdp_get_and_clear(xp) xen_local_pmdp_get_and_clear(xp)
++#endif
++#endif
++
++/*
++ * Bits 0, 6 and 7 are taken in the low part of the pte,
++ * put the 32 bits of offset into the high part.
++ */
++#define pte_to_pgoff(pte) ((pte).pte_high)
++#define pgoff_to_pte(off)                                             \
++      ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
++#define PTE_FILE_MAX_BITS       32
++
++/* Encode and de-code a swap entry */
++#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
++#define __swp_type(x)                 (((x).val) & 0x1f)
++#define __swp_offset(x)                       ((x).val >> 5)
++#define __swp_entry(type, offset)     ((swp_entry_t){(type) | (offset) << 5})
++#define __pte_to_swp_entry(pte)               ((swp_entry_t){ (pte).pte_high })
++#define __swp_entry_to_pte(x)         ((pte_t){ { .pte_high = (x).val } })
++
++#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --cc arch/x86/include/mach-xen/asm/pgtable-3level_types.h

index 0000000,0000000..36d6f2b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable-3level_types.h
@@@ -1,0 -1,0 +1,44 @@@
++#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
++#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
++
++#ifndef __ASSEMBLY__
++#include <linux/types.h>
++
++typedef u64   pteval_t;
++typedef u64   pmdval_t;
++typedef u64   pudval_t;
++typedef u64   pgdval_t;
++typedef u64   pgprotval_t;
++
++typedef union {
++      struct {
++              unsigned long pte_low, pte_high;
++      };
++      pteval_t pte;
++} pte_t;
++#endif        /* !__ASSEMBLY__ */
++
++#define SHARED_KERNEL_PMD     0
++
++#define PAGETABLE_LEVELS      3
++
++/*
++ * PGDIR_SHIFT determines what a top-level page table entry can map
++ */
++#define PGDIR_SHIFT   30
++#define PTRS_PER_PGD  4
++
++/*
++ * PMD_SHIFT determines the size of the area a middle-level
++ * page table can map
++ */
++#define PMD_SHIFT     21
++#define PTRS_PER_PMD  512
++
++/*
++ * entries per page directory level
++ */
++#define PTRS_PER_PTE  512
++
++
++#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --cc arch/x86/include/mach-xen/asm/pgtable.h

index 0000000,0000000..c6af25d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable.h
@@@ -1,0 -1,0 +1,885 @@@
++#ifndef _ASM_X86_PGTABLE_H
++#define _ASM_X86_PGTABLE_H
++
++#include <asm/page.h>
++#include <asm/e820.h>
++
++#include <asm/pgtable_types.h>
++
++/*
++ * Macro to mark a page protection value as UC-
++ */
++#define pgprot_noncached(prot)                                        \
++      ((boot_cpu_data.x86 > 3)                                \
++       ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS))  \
++       : (prot))
++
++#ifndef __ASSEMBLY__
++
++#include <asm/x86_init.h>
++
++/*
++ * ZERO_PAGE is a global shared page that is always zero: used
++ * for zero-mapped memory areas etc..
++ */
++extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
++#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
++
++extern spinlock_t pgd_lock;
++extern struct list_head pgd_list;
++
++extern struct mm_struct *pgd_page_get_mm(struct page *page);
++
++#define set_pte(ptep, pte)            xen_set_pte(ptep, pte)
++#define set_pte_at(mm, addr, ptep, pte)       xen_set_pte_at(mm, addr, ptep, pte)
++#define set_pmd_at(mm, addr, pmdp, pmd)       xen_set_pmd_at(mm, addr, pmdp, pmd)
++
++#define set_pmd(pmdp, pmd)            xen_set_pmd(pmdp, pmd)
++
++#ifndef __PAGETABLE_PUD_FOLDED
++#define set_pgd(pgdp, pgd)            xen_set_pgd(pgdp, pgd)
++#define pgd_clear(pgd)                        xen_pgd_clear(pgd)
++#endif
++
++#ifndef set_pud
++# define set_pud(pudp, pud)           xen_set_pud(pudp, pud)
++#endif
++
++#ifndef __PAGETABLE_PMD_FOLDED
++#define pud_clear(pud)                        xen_pud_clear(pud)
++#endif
++
++#define pte_clear(mm, addr, ptep)     xen_pte_clear(mm, addr, ptep)
++#define pmd_clear(pmd)                        xen_pmd_clear(pmd)
++
++#define pte_update(mm, addr, ptep)              do { } while (0)
++#define pte_update_defer(mm, addr, ptep)        do { } while (0)
++#define pmd_update(mm, addr, ptep)              do { } while (0)
++#define pmd_update_defer(mm, addr, ptep)        do { } while (0)
++
++#define pgd_val(x)    xen_pgd_val(x)
++#define __pgd(x)      xen_make_pgd(x)
++
++#ifndef __PAGETABLE_PUD_FOLDED
++#define pud_val(x)    xen_pud_val(x)
++#define __pud(x)      xen_make_pud(x)
++#endif
++
++#ifndef __PAGETABLE_PMD_FOLDED
++#define pmd_val(x)    xen_pmd_val(x)
++#define __pmd(x)      xen_make_pmd(x)
++#endif
++
++#define pte_val(x)    xen_pte_val(x)
++#define __pte(x)      xen_make_pte(x)
++
++#define arch_end_context_switch(prev) do {} while(0)
++
++/*
++ * The following only work if pte_present() is true.
++ * Undefined behaviour if not..
++ */
++static inline int pte_dirty(pte_t pte)
++{
++      return pte_flags(pte) & _PAGE_DIRTY;
++}
++
++static inline int pte_young(pte_t pte)
++{
++      return pte_flags(pte) & _PAGE_ACCESSED;
++}
++
++static inline int pmd_young(pmd_t pmd)
++{
++      return pmd_flags(pmd) & _PAGE_ACCESSED;
++}
++
++static inline int pte_write(pte_t pte)
++{
++      return pte_flags(pte) & _PAGE_RW;
++}
++
++static inline int pte_file(pte_t pte)
++{
++      return pte_flags(pte) & _PAGE_FILE;
++}
++
++static inline int pte_huge(pte_t pte)
++{
++      return pte_flags(pte) & _PAGE_PSE;
++}
++
++static inline int pte_global(pte_t pte)
++{
++      return 0;
++}
++
++static inline int pte_exec(pte_t pte)
++{
++      return !(pte_flags(pte) & _PAGE_NX);
++}
++
++static inline int pte_special(pte_t pte)
++{
++      return pte_flags(pte) & _PAGE_SPECIAL;
++}
++
++#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
++      __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
++#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IOMAP ? max_mapnr : \
++                     (_pte).pte_low & _PAGE_PRESENT ?           \
++                     mfn_to_local_pfn(__pte_mfn(_pte)) :        \
++                     __pte_mfn(_pte))
++
++#define pte_page(pte) pfn_to_page(pte_pfn(pte))
++
++static inline unsigned long pmd_pfn(pmd_t pmd)
++{
++      return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
++}
++
++static inline int pmd_large(pmd_t pte)
++{
++      return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
++              (_PAGE_PSE | _PAGE_PRESENT);
++}
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++static inline int pmd_trans_splitting(pmd_t pmd)
++{
++      return pmd_val(pmd) & _PAGE_SPLITTING;
++}
++
++static inline int pmd_trans_huge(pmd_t pmd)
++{
++      return pmd_val(pmd) & _PAGE_PSE;
++}
++
++static inline int has_transparent_hugepage(void)
++{
++      return cpu_has_pse;
++}
++#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
++
++static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
++{
++      pteval_t v = __pte_val(pte);
++
++      return __pte_ma(v | set);
++}
++
++static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
++{
++      pteval_t v = __pte_val(pte);
++
++      return __pte_ma(v & ~clear);
++}
++
++static inline pte_t pte_mkclean(pte_t pte)
++{
++      return pte_clear_flags(pte, _PAGE_DIRTY);
++}
++
++static inline pte_t pte_mkold(pte_t pte)
++{
++      return pte_clear_flags(pte, _PAGE_ACCESSED);
++}
++
++static inline pte_t pte_wrprotect(pte_t pte)
++{
++      return pte_clear_flags(pte, _PAGE_RW);
++}
++
++static inline pte_t pte_mkexec(pte_t pte)
++{
++      return pte_clear_flags(pte, _PAGE_NX);
++}
++
++static inline pte_t pte_mkdirty(pte_t pte)
++{
++      return pte_set_flags(pte, _PAGE_DIRTY);
++}
++
++static inline pte_t pte_mkyoung(pte_t pte)
++{
++      return pte_set_flags(pte, _PAGE_ACCESSED);
++}
++
++static inline pte_t pte_mkwrite(pte_t pte)
++{
++      return pte_set_flags(pte, _PAGE_RW);
++}
++
++static inline pte_t pte_mkhuge(pte_t pte)
++{
++      return pte_set_flags(pte, _PAGE_PSE);
++}
++
++static inline pte_t pte_clrhuge(pte_t pte)
++{
++      return pte_clear_flags(pte, _PAGE_PSE);
++}
++
++static inline pte_t pte_mkglobal(pte_t pte)
++{
++      return pte;
++}
++
++static inline pte_t pte_clrglobal(pte_t pte)
++{
++      return pte;
++}
++
++static inline pte_t pte_mkspecial(pte_t pte)
++{
++      return pte_set_flags(pte, _PAGE_SPECIAL);
++}
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
++{
++      pmdval_t v = native_pmd_val(pmd);
++
++      return __pmd(v | set);
++}
++
++static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
++{
++      pmdval_t v = native_pmd_val(pmd);
++
++      return __pmd(v & ~clear);
++}
++
++static inline pmd_t pmd_mkold(pmd_t pmd)
++{
++      return pmd_clear_flags(pmd, _PAGE_ACCESSED);
++}
++
++static inline pmd_t pmd_wrprotect(pmd_t pmd)
++{
++      return pmd_clear_flags(pmd, _PAGE_RW);
++}
++
++static inline pmd_t pmd_mkdirty(pmd_t pmd)
++{
++      return pmd_set_flags(pmd, _PAGE_DIRTY);
++}
++
++static inline pmd_t pmd_mkhuge(pmd_t pmd)
++{
++      return pmd_set_flags(pmd, _PAGE_PSE);
++}
++
++static inline pmd_t pmd_mkyoung(pmd_t pmd)
++{
++      return pmd_set_flags(pmd, _PAGE_ACCESSED);
++}
++
++static inline pmd_t pmd_mkwrite(pmd_t pmd)
++{
++      return pmd_set_flags(pmd, _PAGE_RW);
++}
++
++static inline pmd_t pmd_mknotpresent(pmd_t pmd)
++{
++      return pmd_clear_flags(pmd, _PAGE_PRESENT);
++}
++#endif
++
++/*
++ * Mask out unsupported bits in a present pgprot.  Non-present pgprots
++ * can use those bits for other purposes, so leave them be.
++ */
++static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
++{
++      pgprotval_t protval = pgprot_val(pgprot);
++
++      if (protval & _PAGE_PRESENT)
++              protval &= __supported_pte_mask;
++
++      return protval;
++}
++
++static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
++{
++      return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
++                   massage_pgprot(pgprot));
++}
++
++static inline pte_t pfn_pte_ma(phys_addr_t page_nr, pgprot_t pgprot)
++{
++      return __pte_ma((page_nr << PAGE_SHIFT) | massage_pgprot(pgprot));
++}
++
++static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
++{
++      return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
++                   massage_pgprot(pgprot));
++}
++
++static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
++{
++      pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
++
++      val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK;
++
++      return __pte(val);
++}
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
++{
++      pmdval_t val = pmd_val(pmd);
++
++      val &= _HPAGE_CHG_MASK;
++      val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
++
++      return __pmd(val);
++}
++#endif
++
++/* mprotect needs to preserve PAT bits when updating vm_page_prot */
++#define pgprot_modify pgprot_modify
++static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
++{
++      pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
++      pgprotval_t addbits = pgprot_val(newprot);
++      return __pgprot(preservebits | addbits);
++}
++
++#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
++
++#define canon_pgprot(p) __pgprot(massage_pgprot(p))
++
++static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
++                                       unsigned long flags,
++                                       unsigned long new_flags)
++{
++      /*
++       * PAT type is always WB for untracked ranges, so no need to check.
++       */
++      if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
++              return 1;
++
++      /*
++       * Certain new memtypes are not allowed with certain
++       * requested memtype:
++       * - request is uncached, return cannot be write-back
++       * - request is write-combine, return cannot be write-back
++       */
++      if ((flags == _PAGE_CACHE_UC_MINUS &&
++           new_flags == _PAGE_CACHE_WB) ||
++          (flags == _PAGE_CACHE_WC &&
++           new_flags == _PAGE_CACHE_WB)) {
++              return 0;
++      }
++
++      return 1;
++}
++
++pmd_t *populate_extra_pmd(unsigned long vaddr);
++pte_t *populate_extra_pte(unsigned long vaddr);
++#endif        /* __ASSEMBLY__ */
++
++#ifdef CONFIG_X86_32
++# include "pgtable_32.h"
++#else
++# include "pgtable_64.h"
++#endif
++
++#ifndef __ASSEMBLY__
++#include <linux/mm_types.h>
++
++static inline int pte_none(pte_t pte)
++{
++      return !pte.pte;
++}
++
++#define __HAVE_ARCH_PTE_SAME
++static inline int pte_same(pte_t a, pte_t b)
++{
++      return a.pte == b.pte;
++}
++
++static inline int pte_present(pte_t a)
++{
++      return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
++}
++
++static inline int pte_hidden(pte_t pte)
++{
++      return pte_flags(pte) & _PAGE_HIDDEN;
++}
++
++static inline int pmd_present(pmd_t pmd)
++{
++#if CONFIG_XEN_COMPAT <= 0x030002
++/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
++   can temporarily clear it. */
++      return __pmd_val(pmd) != 0;
++#else
++      return pmd_flags(pmd) & _PAGE_PRESENT;
++#endif
++}
++
++static inline int pmd_none(pmd_t pmd)
++{
++      /* Only check low word on 32-bit platforms, since it might be
++         out of sync with upper half. */
++      return (unsigned long)__pmd_val(pmd) == 0;
++}
++
++static inline unsigned long pmd_page_vaddr(pmd_t pmd)
++{
++      return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
++}
++
++/*
++ * Currently stuck as a macro due to indirect forward reference to
++ * linux/mmzone.h's __section_mem_map_addr() definition:
++ */
++#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
++
++/*
++ * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
++ *
++ * this macro returns the index of the entry in the pmd page which would
++ * control the given virtual address
++ */
++static inline unsigned long pmd_index(unsigned long address)
++{
++      return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
++}
++
++/*
++ * Conversion functions: convert a page and protection to a page entry,
++ * and a page entry and page directory to the page they refer to.
++ *
++ * (Currently stuck as a macro because of indirect forward reference
++ * to linux/mm.h:page_to_nid())
++ */
++#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
++
++/*
++ * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
++ *
++ * this function returns the index of the entry in the pte page which would
++ * control the given virtual address
++ */
++static inline unsigned long pte_index(unsigned long address)
++{
++      return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
++}
++
++static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
++{
++      return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
++}
++
++static inline int pmd_bad(pmd_t pmd)
++{
++#if CONFIG_XEN_COMPAT <= 0x030002
++      return (pmd_flags(pmd) & ~_PAGE_USER & ~_PAGE_PRESENT)
++             != (_KERNPG_TABLE & ~_PAGE_PRESENT);
++#else
++      return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
++#endif
++}
++
++static inline unsigned long pages_to_mb(unsigned long npg)
++{
++      return npg >> (20 - PAGE_SHIFT);
++}
++
++#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)       \
++      direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
++
++#if PAGETABLE_LEVELS > 2
++static inline int pud_none(pud_t pud)
++{
++      return __pud_val(pud) == 0;
++}
++
++static inline int pud_present(pud_t pud)
++{
++      return pud_flags(pud) & _PAGE_PRESENT;
++}
++
++static inline unsigned long pud_page_vaddr(pud_t pud)
++{
++      return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
++}
++
++/*
++ * Macro (not inline) due to an indirect forward reference to linux/mmzone.h's
++ * __section_mem_map_addr(); flag bits are masked off as in pmd_page().
++ */
++#define pud_page(pud) pfn_to_page((pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT)
++
++/* Find an entry in the second-level page table.. */
++static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
++{
++      return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
++}
++
++static inline int pud_large(pud_t pud)
++{
++      return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
++              (_PAGE_PSE | _PAGE_PRESENT);
++}
++
++static inline int pud_bad(pud_t pud)
++{
++      return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
++}
++#else
++static inline int pud_large(pud_t pud)
++{
++      return 0;
++}
++#endif        /* PAGETABLE_LEVELS > 2 */
++
++#if PAGETABLE_LEVELS > 3
++static inline int pgd_present(pgd_t pgd)
++{
++      return pgd_flags(pgd) & _PAGE_PRESENT;
++}
++
++static inline unsigned long pgd_page_vaddr(pgd_t pgd)
++{
++      return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
++}
++
++/*
++ * Macro (not inline) due to an indirect forward reference to linux/mmzone.h's
++ * __section_mem_map_addr(); flag bits are masked off as in pmd_page().
++ */
++#define pgd_page(pgd) pfn_to_page((pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT)
++
++/* to find an entry in a page-table-directory. */
++static inline unsigned long pud_index(unsigned long address)
++{
++      return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
++}
++
++static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
++{
++      return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
++}
++
++static inline int pgd_bad(pgd_t pgd)
++{
++      return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
++}
++
++static inline int pgd_none(pgd_t pgd)
++{
++      return !__pgd_val(pgd);
++}
++#endif        /* PAGETABLE_LEVELS > 3 */
++
++#endif        /* __ASSEMBLY__ */
++
++/*
++ * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
++ *
++ * this macro returns the index of the entry in the pgd page which would
++ * control the given virtual address
++ */
++#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
++
++/*
++ * pgd_offset() returns a (pgd_t *)
++ * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
++ */
++#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
++/*
++ * a shortcut which implies the use of the kernel's pgd, instead
++ * of a process's
++ */
++#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
++
++
++#define KERNEL_PGD_BOUNDARY   pgd_index(PAGE_OFFSET)
++#define KERNEL_PGD_PTRS               (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
++
++#ifndef __ASSEMBLY__
++
++#define direct_gbpages 0
++
++/* local pte updates need not use xchg for locking */
++static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
++{
++      xen_set_pte(ptep, __pte(0));
++      return res;
++}
++
++static inline pmd_t xen_local_pmdp_get_and_clear(pmd_t *pmdp)
++{
++      pmd_t res = *pmdp;
++
++      xen_set_pmd(pmdp, __pmd(0));
++      return res;
++}
++
++static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
++                                pte_t *ptep , pte_t pte)
++{
++      if ((mm != current->mm && mm != &init_mm) ||
++          HYPERVISOR_update_va_mapping(addr, pte, 0))
++              xen_set_pte(ptep, pte);
++}
++
++static inline void xen_set_pmd_at(struct mm_struct *mm, unsigned long addr,
++                                pmd_t *pmdp , pmd_t pmd)
++{
++      xen_set_pmd(pmdp, pmd);
++}
++
++static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
++                               pte_t *ptep)
++{
++      if ((mm != current->mm && mm != &init_mm)
++          || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
++              __xen_pte_clear(ptep);
++}
++
++#ifndef CONFIG_PARAVIRT
++/*
++ * Rules for using pte_update - it must be called after any PTE update which
++ * has not been done using the set_pte / clear_pte interfaces.  It is used by
++ * shadow mode hypervisors to resynchronize the shadow page tables.  Kernel PTE
++ * updates should either be sets, clears, or set_pte_atomic for P->P
++ * transitions, which means this hook should only be called for user PTEs.
++ * This hook implies a P->P protection or access change has taken place, which
++ * requires a subsequent TLB flush.  The notification can optionally be delayed
++ * until the TLB flush event by using the pte_update_defer form of the
++ * interface, but care must be taken to assure that the flush happens while
++ * still holding the same page table lock so that the shadow and primary pages
++ * do not become out of sync on SMP.
++ */
++#define pte_update(mm, addr, ptep)            do { } while (0)
++#define pte_update_defer(mm, addr, ptep)      do { } while (0)
++#endif
++
++/*
++ * We only update the dirty/accessed state if we set
++ * the dirty bit by hand in the kernel, since the hardware
++ * will do the accessed bit for us, and we don't want to
++ * race with other CPUs that might be updating the dirty
++ * bit at the same time.
++ */
++struct vm_area_struct;
++
++#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
++extern int ptep_set_access_flags(struct vm_area_struct *vma,
++                               unsigned long address, pte_t *ptep,
++                               pte_t entry, int dirty);
++
++#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
++extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
++                                   unsigned long addr, pte_t *ptep);
++
++#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
++extern int ptep_clear_flush_young(struct vm_area_struct *vma,
++                                unsigned long address, pte_t *ptep);
++
++#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
++/*
++ * Clear the PTE at @ptep and flush the TLB entry for @addr; the value of
++ * the expression is the PTE as it was read before clearing.
++ *
++ * An entry that is already none is left alone.  When @vma's mm is the
++ * current mm, the clear is attempted with one update_va_mapping hypercall
++ * (UVMF_INVLPG, combined with uvm_multi() over the mm's cpumask).  For any
++ * other mm, or when that hypercall returns non-zero, the entry is cleared
++ * with __xen_pte_clear() and flush_tlb_page() is called instead.
++ *
++ * Every macro argument is evaluated exactly once.
++ */
++#define ptep_clear_flush(vma, addr, ptep)                     \
++({                                                            \
++      pte_t *__ptep = (ptep);                                 \
++      struct vm_area_struct *__vma = (vma);                   \
++      unsigned long __addr = (addr);                          \
++      pte_t __res = *__ptep;                                  \
++      if (!pte_none(__res) &&                                 \
++          (__vma->vm_mm != current->mm ||                     \
++           HYPERVISOR_update_va_mapping(__addr, __pte(0),     \
++                      uvm_multi(mm_cpumask(__vma->vm_mm)) |   \
++                              UVMF_INVLPG))) {                \
++              __xen_pte_clear(__ptep);                        \
++              flush_tlb_page(__vma, __addr);                  \
++      }                                                       \
++      __res;                                                  \
++})
++
++#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
++static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
++                                     pte_t *ptep)
++{
++      pte_t old = *ptep;
++
++      /* Nothing to clear. */
++      if (pte_none(old))
++              return old;
++
++      /*
++       * For init_mm the entry is zapped through the hypervisor; a zero
++       * return means that worked and the value read above is handed back.
++       */
++      if (mm == &init_mm && !HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
++              return old;
++
++      /* Any other mm, or a failed hypercall: clear the entry directly. */
++      old = xen_ptep_get_and_clear(ptep, old);
++      pte_update(mm, addr, ptep);
++      return old;
++}
++
++#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
++#define ptep_get_and_clear_full(mm, addr, ptep, full)         \
++      ((full) ? ({                                            \
++              pte_t *__ptep = (ptep);                         \
++              pte_t __res = *__ptep;                          \
++              if (!PagePinned(virt_to_page((mm)->pgd)))       \
++                      __xen_pte_clear(__ptep);                \
++              else if (!pte_none(__res))                      \
++                      xen_l1_entry_update(__ptep, __pte(0));  \
++              __res;                                          \
++       }) :                                                   \
++       ptep_get_and_clear(mm, addr, ptep))
++
++pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
++
++#define __HAVE_ARCH_PTEP_SET_WRPROTECT
++static inline void ptep_set_wrprotect(struct mm_struct *mm,
++                                    unsigned long addr, pte_t *ptep)
++{
++      const pte_t cur = *ptep;
++
++      /* Entry is not writable: leave it untouched. */
++      if (!pte_write(cur))
++              return;
++
++      set_pte_at(mm, addr, ptep, pte_wrprotect(cur));
++}
++
++/*
++ * No TLB flush is issued when fixing up a spurious fault.  The macro
++ * expands to an empty statement (rather than to nothing) so it can safely
++ * be the sole body of an if/else, matching the other no-op hooks here.
++ */
++#define flush_tlb_fix_spurious_fault(vma, address)    do { } while (0)
++
++#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
++
++#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
++extern int pmdp_set_access_flags(struct vm_area_struct *vma,
++                               unsigned long address, pmd_t *pmdp,
++                               pmd_t entry, int dirty);
++
++#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
++extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
++                                   unsigned long addr, pmd_t *pmdp);
++
++#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
++extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
++                                unsigned long address, pmd_t *pmdp);
++
++
++#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
++extern void pmdp_splitting_flush(struct vm_area_struct *vma,
++                               unsigned long addr, pmd_t *pmdp);
++
++#define __HAVE_ARCH_PMD_WRITE
++static inline int pmd_write(pmd_t pmd)
++{
++      return pmd_flags(pmd) & _PAGE_RW;       /* non-zero iff _PAGE_RW is set in the pmd's flags */
++}
++
++#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
++                                     pmd_t *pmdp)
++{
++      pmd_t pmd = xen_pmdp_get_and_clear(pmdp);       /* previous entry, as returned by the Xen helper */
++      pmd_update(mm, addr, pmdp);     /* notification hook, run after the entry was changed */
++      return pmd;
++}
++#endif
++
++#define __HAVE_ARCH_PMDP_SET_WRPROTECT
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++static inline void pmdp_set_wrprotect(struct mm_struct *mm,
++                                    unsigned long addr, pmd_t *pmdp)
++{
++      clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); /* drop the RW bit in place in the entry */
++      pmd_update(mm, addr, pmdp);     /* notification hook, run after the entry was changed */
++}
++#endif
++
++/*
++ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
++ *
++ *  dst - pointer to pgd range anywhere on a pgd page
++ *  src - ""
++ *  count - the number of pgds to copy.
++ *
++ * dst and src can be on the same page, but the range must not overlap,
++ * and must not cross a page boundary.
++ */
++static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
++{
++      /* Straight copy of @count entries; the ranges must not overlap. */
++      memcpy(dst, src, sizeof(*dst) * count);
++}
++
++#define arbitrary_virt_to_mfn(va)                                     \
++({                                                                    \
++      unsigned int __lvl;     /* level reported by lookup_address() */ \
++      pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl);    \
++      BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
++      pte_mfn(*__ptep);       /* value of the statement expression */ \
++})
++
++#define arbitrary_virt_to_machine(va)                                 \
++      (((maddr_t)arbitrary_virt_to_mfn(va) << PAGE_SHIFT)             \
++       | ((unsigned long)(va) & (PAGE_SIZE - 1)))
++
++#ifdef CONFIG_HIGHPTE
++#include <asm/io.h>
++struct page *kmap_atomic_to_page(void *);
++#define ptep_to_machine(ptep)                                         \
++({                                                                    \
++      pte_t *__ptep = (ptep);                                         \
++      page_to_phys(kmap_atomic_to_page(__ptep))                       \
++              | ((unsigned long)__ptep & (PAGE_SIZE - 1));            \
++})
++#else
++#define ptep_to_machine(ptep) virt_to_machine(ptep)
++#endif
++
++#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
++static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
++                                         pte_t *ptep)
++{
++#if CONFIG_XEN_COMPAT < 0x030300
++      if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
++              return ptep_get_and_clear(mm, addr, ptep);      /* feature missing: clear the entry and return the old value */
++#endif
++      return *ptep;   /* entry stays in place; ptep_modify_prot_commit() uses MMU_PT_UPDATE_PRESERVE_AD */
++}
++
++static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
++                                         pte_t *ptep, pte_t pte)
++{
++      mmu_update_t u;
++
++#if CONFIG_XEN_COMPAT < 0x030300
++      if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
++              set_pte_at(mm, addr, ptep, pte);        /* same feature test as ptep_modify_prot_start(), which cleared the entry */
++              return;
++      }
++#endif
++      u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;      /* machine address of the PTE, tagged with the preserve-AD command */
++      u.val = __pte_val(pte); /* raw new entry value */
++      if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
++              BUG();  /* a non-zero return from the hypercall is treated as fatal */
++}
++
++#include <asm-generic/pgtable.h>
++
++#include <xen/features.h>
++void make_page_readonly(void *va, unsigned int feature);
++void make_page_writable(void *va, unsigned int feature);
++void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
++void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
++
++struct vm_area_struct;
++
++int direct_remap_pfn_range(struct vm_area_struct *vma,
++                           unsigned long address,
++                           phys_addr_t mfn,
++                           unsigned long size,
++                           pgprot_t prot,
++                           domid_t  domid);
++int direct_kernel_remap_pfn_range(unsigned long address,
++                                unsigned long mfn,
++                                unsigned long size,
++                                pgprot_t prot,
++                                domid_t  domid);
++int create_lookup_pte_addr(struct mm_struct *mm,
++                           unsigned long address,
++                           uint64_t *ptep);
++
++#endif        /* __ASSEMBLY__ */
++
++#endif /* _ASM_X86_PGTABLE_H */
diff --cc arch/x86/include/mach-xen/asm/pgtable_32.h

index 0000000,0000000..7d89873

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_32.h
@@@ -1,0 -1,0 +1,89 @@@
++#ifndef _ASM_X86_PGTABLE_32_H
++#define _ASM_X86_PGTABLE_32_H
++
++#include <asm/pgtable_32_types.h>
++
++/*
++ * The Linux memory management assumes a three-level page table setup. On
++ * the i386, we use that, but "fold" the mid level into the top-level page
++ * table, so that we physically have the same two-level page table as the
++ * i386 mmu expects.
++ *
++ * This file contains the functions and defines necessary to modify and use
++ * the i386 page table tree.
++ */
++#ifndef __ASSEMBLY__
++#include <asm/processor.h>
++#include <asm/fixmap.h>
++#include <linux/threads.h>
++
++#include <linux/bitops.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/sched.h>
++
++struct vm_area_struct;
++
++extern pgd_t *swapper_pg_dir;
++extern pgd_t initial_page_table[1024];
++
++static inline void pgtable_cache_init(void) { }
++static inline void check_pgt_cache(void) { }
++void paging_init(void);
++
++extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
++
++
++/*
++ * Define this if things work differently on an i386 and an i486:
++ * it will (on an i486) warn about kernel memory accesses that are
++ * done without a 'access_ok(VERIFY_WRITE,..)'
++ */
++#undef TEST_ACCESS_OK
++
++#ifdef CONFIG_X86_PAE
++# include <asm/pgtable-3level.h>
++#else
++# include <asm/pgtable-2level.h>
++#endif
++
++#if defined(CONFIG_HIGHPTE)
++#define pte_offset_map(dir, address)                                  \
++      ((pte_t *)kmap_atomic_pte(pmd_page(*(dir))) +           \
++       pte_index((address)))
++#define pte_unmap(pte) kunmap_atomic((pte))
++#else
++#define pte_offset_map(dir, address)                                  \
++      ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
++#define pte_unmap(pte) do { } while (0)
++#endif
++
++/*
++ * Clear a kernel PTE and flush it from the TLB.  Both steps are requested
++ * with a single update_va_mapping hypercall on @vaddr (new value 0, flag
++ * UVMF_INVLPG); @ptep is accepted but not referenced.  A non-zero return
++ * from the hypercall is fatal (BUG).
++ */
++#define kpte_clear_flush(ptep, vaddr)                                 \
++do {                                                                  \
++      if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
++              BUG(); \
++} while (0)
++
++/*
++ * The i386 doesn't have any external MMU info: the kernel page
++ * tables contain all the necessary information.
++ */
++#define update_mmu_cache(vma, address, ptep) do { } while (0)
++
++void make_lowmem_page_readonly(void *va, unsigned int feature);
++void make_lowmem_page_writable(void *va, unsigned int feature);
++
++#endif /* !__ASSEMBLY__ */
++
++/*
++ * kern_addr_valid() is (1) for FLATMEM and (0) for
++ * SPARSEMEM and DISCONTIGMEM
++ */
++#ifdef CONFIG_FLATMEM
++#define kern_addr_valid(addr) (1)
++#else
++#define kern_addr_valid(kaddr)        (0)
++#endif
++
++#endif /* _ASM_X86_PGTABLE_32_H */
diff --cc arch/x86/include/mach-xen/asm/pgtable_64.h

index 0000000,0000000..f58b2ef

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_64.h
@@@ -1,0 -1,0 +1,203 @@@
++#ifndef _ASM_X86_PGTABLE_64_H
++#define _ASM_X86_PGTABLE_64_H
++
++#include <linux/const.h>
++#include <asm/pgtable_64_types.h>
++
++#ifndef __ASSEMBLY__
++
++/*
++ * This file contains the functions and defines necessary to modify and use
++ * the x86-64 page table tree.
++ */
++#include <asm/processor.h>
++#include <linux/bitops.h>
++#include <linux/threads.h>
++#include <linux/sched.h>
++
++#ifdef CONFIG_XEN
++extern pud_t level3_user_pgt[512];
++
++extern void xen_init_pt(void);
++extern void xen_switch_pt(void);
++#endif
++
++extern pud_t level3_kernel_pgt[512];
++extern pud_t level3_ident_pgt[512];
++extern pmd_t level2_kernel_pgt[512];
++extern pmd_t level2_fixmap_pgt[512];
++extern pmd_t level2_ident_pgt[512];
++extern pgd_t init_level4_pgt[];
++
++#define swapper_pg_dir init_level4_pgt
++
++extern void paging_init(void);
++
++#define pte_ERROR(e)                                                  \
++      printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n",               \
++             __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
++#define pmd_ERROR(e)                                                  \
++      printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n",               \
++             __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
++#define pud_ERROR(e)                                                  \
++      printk("%s:%d: bad pud %p(%016lx pfn %010Lx).\n",               \
++             __FILE__, __LINE__, &(e), __pud_val(e),                  \
++             (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
++#define pgd_ERROR(e)                                                  \
++      printk("%s:%d: bad pgd %p(%016lx pfn %010Lx).\n",               \
++             __FILE__, __LINE__, &(e), __pgd_val(e),                  \
++             (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
++
++struct mm_struct;
++
++void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
++
++
++#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
++
++static inline void xen_set_pte(pte_t *ptep, pte_t pte)
++{
++      *ptep = pte;    /* plain store; unlike xen_set_pmd/pud/pgd no xen_l*_entry_update() call is made */
++}
++
++static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
++{
++      xen_l2_entry_update(pmdp, pmd);
++}
++
++#define xen_pmd_clear(pmd)                    \
++({                                            \
++      pmd_t *__pmdp = (pmd);                  \
++      PagePinned(virt_to_page(__pmdp))        \
++      ? set_pmd(__pmdp, xen_make_pmd(0))      \
++      : (void)(*__pmdp = xen_make_pmd(0));    \
++})
++
++#ifdef CONFIG_SMP
++static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
++{
++      return __pte_ma(xchg(&xp->pte, 0));     /* swap in 0 and wrap the old raw value; @ret is unused in this variant */
++}
++#else
++#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
++#endif
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++#ifdef CONFIG_SMP
++static inline pmd_t xen_pmdp_get_and_clear(pmd_t *xp)
++{
++      return xen_make_pmd(xchg(&xp->pmd, 0));
++}
++#else
++#define xen_pmdp_get_and_clear(xp) xen_local_pmdp_get_and_clear(xp)
++#endif
++#endif
++
++static inline void xen_set_pud(pud_t *pudp, pud_t pud)
++{
++      xen_l3_entry_update(pudp, pud);
++}
++
++#define xen_pud_clear(pud)                    \
++({                                            \
++      pud_t *__pudp = (pud);                  \
++      PagePinned(virt_to_page(__pudp))        \
++      ? set_pud(__pudp, xen_make_pud(0))      \
++      : (void)(*__pudp = xen_make_pud(0));    \
++})
++
++static inline pgd_t *__user_pgd(pgd_t *pgd)
++{
++      const unsigned long va = (unsigned long)pgd;
++
++      /* Entries located in init_level4_pgt yield no result. */
++      if (unlikely((va & PAGE_MASK) == (unsigned long)init_level4_pgt))
++              return NULL;
++
++      /*
++       * Same byte offset as @pgd, applied to the address stored in the
++       * ->private field of the struct page backing @pgd.
++       */
++      return (pgd_t *)(virt_to_page(pgd)->private + (va & ~PAGE_MASK));
++}
++
++static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++      xen_l4_entry_update(pgdp, pgd);
++}
++
++#define xen_pgd_clear(pgd)                    /* zero the pgd entry at @pgd */ \
++({                                            \
++      pgd_t *__pgdp = (pgd);                  \
++      PagePinned(virt_to_page(__pgdp))        /* pinned page: go through xen_l4_entry_update() */ \
++      ? xen_l4_entry_update(__pgdp, xen_make_pgd(0)) \
++      : (void)(*__user_pgd(__pgdp) = *__pgdp = xen_make_pgd(0)); /* NOTE(review): __user_pgd() returns NULL for init_level4_pgt; confirm that page is always pinned */ \
++})
++
++#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
++
++extern unsigned long early_arbitrary_virt_to_mfn(void *va);
++
++extern void sync_global_pgds(unsigned long start, unsigned long end);
++
++/*
++ * Conversion functions: convert a page and protection to a page entry,
++ * and a page entry and page directory to the page they refer to.
++ */
++
++/*
++ * Level 4 access.
++ */
++static inline int pgd_large(pgd_t pgd) { return 0; }
++#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
++
++/* PUD - Level3 access */
++
++/* PMD  - Level 2 access */
++#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
++#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) |   \
++                                          _PAGE_FILE })
++#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
++
++/* PTE - Level 1 access. */
++
++/* x86-64 always has all page tables mapped. */
++#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
++#define pte_unmap(pte) ((void)(pte))/* NOP */
++
++#define update_mmu_cache(vma, address, ptep) do { } while (0)
++
++/* Encode and de-code a swap entry */
++#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
++#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
++#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
++#else
++#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
++#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
++#endif
++
++#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
++
++#define __swp_type(x)                 (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
++                                       & ((1U << SWP_TYPE_BITS) - 1))
++#define __swp_offset(x)                       ((x).val >> SWP_OFFSET_SHIFT)
++#define __swp_entry(type, offset)     ((swp_entry_t) { \
++                                       ((type) << (_PAGE_BIT_PRESENT + 1)) \
++                                       | ((offset) << SWP_OFFSET_SHIFT) })
++#define __pte_to_swp_entry(pte)               ((swp_entry_t) { __pte_val(pte) })
++#define __swp_entry_to_pte(x)         ((pte_t) { .pte = (x).val })
++
++extern int kern_addr_valid(unsigned long addr);
++
++#define HAVE_ARCH_UNMAPPED_AREA
++#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
++
++#define pgtable_cache_init()   do { } while (0)
++#define check_pgt_cache()      do { } while (0)
++
++#define PAGE_AGP    PAGE_KERNEL_NOCACHE
++#define HAVE_PAGE_AGP 1
++
++/* fs/proc/kcore.c */
++#define       kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
++#define       kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
++
++#define __HAVE_ARCH_PTE_SAME
++
++#endif /* !__ASSEMBLY__ */
++
++#endif /* _ASM_X86_PGTABLE_64_H */
diff --cc arch/x86/include/mach-xen/asm/pgtable_64_types.h

index 0000000,0000000..c4c4665

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_64_types.h
@@@ -1,0 -1,0 +1,64 @@@
++#ifndef _ASM_X86_PGTABLE_64_DEFS_H
++#define _ASM_X86_PGTABLE_64_DEFS_H
++
++#ifndef __ASSEMBLY__
++#include <linux/types.h>
++
++/*
++ * These are used to make use of C type-checking..
++ */
++typedef unsigned long pteval_t;
++typedef unsigned long pmdval_t;
++typedef unsigned long pudval_t;
++typedef unsigned long pgdval_t;
++typedef unsigned long pgprotval_t;
++
++typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
++
++#endif        /* !__ASSEMBLY__ */
++
++#define SHARED_KERNEL_PMD     0
++#define PAGETABLE_LEVELS      4
++
++/*
++ * PGDIR_SHIFT determines what a top-level page table entry can map
++ */
++#define PGDIR_SHIFT   39
++#define PTRS_PER_PGD  512
++
++/*
++ * 3rd level page
++ */
++#define PUD_SHIFT     30
++#define PTRS_PER_PUD  512
++
++/*
++ * PMD_SHIFT determines the size of the area a middle-level
++ * page table can map
++ */
++#define PMD_SHIFT     21
++#define PTRS_PER_PMD  512
++
++/*
++ * entries per page directory level
++ */
++#define PTRS_PER_PTE  512
++
++#define PMD_SIZE      (_AC(1, UL) << PMD_SHIFT)
++#define PMD_MASK      (~(PMD_SIZE - 1))
++#define PUD_SIZE      (_AC(1, UL) << PUD_SHIFT)
++#define PUD_MASK      (~(PUD_SIZE - 1))
++#define PGDIR_SIZE    (_AC(1, UL) << PGDIR_SHIFT)
++#define PGDIR_MASK    (~(PGDIR_SIZE - 1))
++
++/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
++#define MAX_PHYSMEM_BITS 43
++#define MAXMEM                 _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
++#define VMALLOC_START    _AC(0xffffc90000000000, UL)
++#define VMALLOC_END      _AC(0xffffe8ffffffffff, UL)
++#define VMEMMAP_START  _AC(0xffffea0000000000, UL)
++#define MODULES_VADDR    _AC(0xffffffffa0000000, UL)
++#define MODULES_END      _AC(0xffffffffff000000, UL)
++#define MODULES_LEN   (MODULES_END - MODULES_VADDR)
++
++#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --cc arch/x86/include/mach-xen/asm/pgtable_types.h

index 0000000,0000000..6bee8e5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_types.h
@@@ -1,0 -1,0 +1,390 @@@
++#ifndef _ASM_X86_PGTABLE_DEFS_H
++#define _ASM_X86_PGTABLE_DEFS_H
++
++#include <linux/const.h>
++#include <asm/page_types.h>
++
++#define FIRST_USER_ADDRESS    0
++
++#define _PAGE_BIT_PRESENT     0       /* is present */
++#define _PAGE_BIT_RW          1       /* writeable */
++#define _PAGE_BIT_USER                2       /* userspace addressable */
++#define _PAGE_BIT_PWT         3       /* page write through */
++#define _PAGE_BIT_PCD         4       /* page cache disabled */
++#define _PAGE_BIT_ACCESSED    5       /* was accessed (raised by CPU) */
++#define _PAGE_BIT_DIRTY               6       /* was written to (raised by CPU) */
++#define _PAGE_BIT_PSE         7       /* 4 MB (or 2MB) page */
++#define _PAGE_BIT_PAT         7       /* on 4KB pages */
++#define _PAGE_BIT_GLOBAL      8       /* Global TLB entry PPro+ */
++#define _PAGE_BIT_UNUSED1     9       /* available for programmer */
++#define _PAGE_BIT_IOMAP               10      /* flag used to indicate IO mapping */
++#define _PAGE_BIT_HIDDEN      11      /* hidden by kmemcheck */
++#define _PAGE_BIT_PAT_LARGE   12      /* On 2MB or 1GB pages */
++#define _PAGE_BIT_SPECIAL     _PAGE_BIT_UNUSED1
++#define _PAGE_BIT_CPA_TEST    _PAGE_BIT_UNUSED1
++#define _PAGE_BIT_SPLITTING   _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
++#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
++
++/* If _PAGE_BIT_PRESENT is clear, we use these: */
++/* - if the user mapped it with PROT_NONE; pte_present gives true */
++#define _PAGE_BIT_PROTNONE    _PAGE_BIT_GLOBAL
++/* - set: nonlinear file mapping, saved PTE; unset:swap */
++#define _PAGE_BIT_FILE                _PAGE_BIT_DIRTY
++
++#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
++#define _PAGE_RW      (_AT(pteval_t, 1) << _PAGE_BIT_RW)
++#define _PAGE_USER    (_AT(pteval_t, 1) << _PAGE_BIT_USER)
++#define _PAGE_PWT     (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
++#define _PAGE_PCD     (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
++#define _PAGE_ACCESSED        (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
++#define _PAGE_DIRTY   (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
++#define _PAGE_PSE     (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
++#define _PAGE_GLOBAL  (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
++#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
++#define _PAGE_IOMAP   (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
++#define _PAGE_PAT     (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
++#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
++#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
++#define _PAGE_CPA_TEST        (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
++#define _PAGE_SPLITTING       (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
++#define __HAVE_ARCH_PTE_SPECIAL
++
++#ifdef CONFIG_KMEMCHECK
++#define _PAGE_HIDDEN  (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
++#else
++#define _PAGE_HIDDEN  (_AT(pteval_t, 0))
++#endif
++
++#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
++#define _PAGE_NX      (_AT(pteval_t, 1) << _PAGE_BIT_NX)
++#else
++#define _PAGE_NX      (_AT(pteval_t, 0))
++#endif
++
++#define _PAGE_FILE    (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
++#define _PAGE_PROTNONE        (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
++
++#ifndef __ASSEMBLY__
++#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
++extern unsigned int __kernel_page_user;
++#else
++#define __kernel_page_user 0
++#endif
++#endif
++
++#define _PAGE_TABLE   (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
++                       _PAGE_ACCESSED | _PAGE_DIRTY)
++#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |    \
++                       _PAGE_DIRTY | __kernel_page_user)
++
++/* Set of bits not changed in pte_modify */
++#define _PAGE_CHG_MASK        (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \
++                       _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
++#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
++
++/*
++ * PAT settings are part of the hypervisor interface, which sets the
++ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
++ */
++#define _PAGE_CACHE_MASK      (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
++#define _PAGE_CACHE_WB                (0)
++#define _PAGE_CACHE_WT                (_PAGE_PWT)
++#define _PAGE_CACHE_WC                (_PAGE_PAT)
++#define _PAGE_CACHE_WP                (_PAGE_PAT | _PAGE_PWT)
++#define _PAGE_CACHE_UC_MINUS  (_PAGE_PCD)
++#define _PAGE_CACHE_UC                (_PAGE_PCD | _PAGE_PWT)
++
++#define PAGE_NONE     __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
++#define PAGE_SHARED   __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
++                               _PAGE_ACCESSED | _PAGE_NX)
++
++#define PAGE_SHARED_EXEC      __pgprot(_PAGE_PRESENT | _PAGE_RW |     \
++                                       _PAGE_USER | _PAGE_ACCESSED)
++#define PAGE_COPY_NOEXEC      __pgprot(_PAGE_PRESENT | _PAGE_USER |   \
++                                       _PAGE_ACCESSED | _PAGE_NX)
++#define PAGE_COPY_EXEC                __pgprot(_PAGE_PRESENT | _PAGE_USER |   \
++                                       _PAGE_ACCESSED)
++#define PAGE_COPY             PAGE_COPY_NOEXEC
++#define PAGE_READONLY         __pgprot(_PAGE_PRESENT | _PAGE_USER |   \
++                                       _PAGE_ACCESSED | _PAGE_NX)
++#define PAGE_READONLY_EXEC    __pgprot(_PAGE_PRESENT | _PAGE_USER |   \
++                                       _PAGE_ACCESSED)
++
++#define __PAGE_KERNEL_EXEC                                            \
++      (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
++#define __PAGE_KERNEL         (__PAGE_KERNEL_EXEC | _PAGE_NX)
++
++#define __PAGE_KERNEL_RO              (__PAGE_KERNEL & ~_PAGE_RW)
++#define __PAGE_KERNEL_RX              (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
++#define __PAGE_KERNEL_EXEC_NOCACHE    (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_KERNEL_WC              (__PAGE_KERNEL | _PAGE_CACHE_WC)
++#define __PAGE_KERNEL_NOCACHE         (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_KERNEL_UC_MINUS                (__PAGE_KERNEL | _PAGE_PCD)
++#define __PAGE_KERNEL_VSYSCALL                (__PAGE_KERNEL_RX | _PAGE_USER)
++#define __PAGE_KERNEL_VSYSCALL_NOCACHE        (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
++#define __PAGE_KERNEL_LARGE           (__PAGE_KERNEL | _PAGE_PSE)
++#define __PAGE_KERNEL_LARGE_NOCACHE   (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
++#define __PAGE_KERNEL_LARGE_EXEC      (__PAGE_KERNEL_EXEC | _PAGE_PSE)
++
++#define __PAGE_KERNEL_IO              (__PAGE_KERNEL | _PAGE_IOMAP)
++#define __PAGE_KERNEL_IO_NOCACHE      (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
++#define __PAGE_KERNEL_IO_UC_MINUS     (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
++#define __PAGE_KERNEL_IO_WC           (__PAGE_KERNEL_WC | _PAGE_IOMAP)
++
++#define PAGE_KERNEL                   __pgprot(__PAGE_KERNEL)
++#define PAGE_KERNEL_RO                        __pgprot(__PAGE_KERNEL_RO)
++#define PAGE_KERNEL_EXEC              __pgprot(__PAGE_KERNEL_EXEC)
++#define PAGE_KERNEL_RX                        __pgprot(__PAGE_KERNEL_RX)
++#define PAGE_KERNEL_WC                        __pgprot(__PAGE_KERNEL_WC)
++#define PAGE_KERNEL_NOCACHE           __pgprot(__PAGE_KERNEL_NOCACHE)
++#define PAGE_KERNEL_UC_MINUS          __pgprot(__PAGE_KERNEL_UC_MINUS)
++#define PAGE_KERNEL_EXEC_NOCACHE      __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
++#define PAGE_KERNEL_LARGE             __pgprot(__PAGE_KERNEL_LARGE)
++#define PAGE_KERNEL_LARGE_NOCACHE     __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
++#define PAGE_KERNEL_LARGE_EXEC                __pgprot(__PAGE_KERNEL_LARGE_EXEC)
++#define PAGE_KERNEL_VSYSCALL          __pgprot(__PAGE_KERNEL_VSYSCALL)
++#define PAGE_KERNEL_VSYSCALL_NOCACHE  __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
++
++#define PAGE_KERNEL_IO                        __pgprot(__PAGE_KERNEL_IO)
++#define PAGE_KERNEL_IO_NOCACHE                __pgprot(__PAGE_KERNEL_IO_NOCACHE)
++#define PAGE_KERNEL_IO_UC_MINUS               __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
++#define PAGE_KERNEL_IO_WC             __pgprot(__PAGE_KERNEL_IO_WC)
++
++/*         xwr */
++#define __P000        PAGE_NONE
++#define __P001        PAGE_READONLY
++#define __P010        PAGE_COPY
++#define __P011        PAGE_COPY
++#define __P100        PAGE_READONLY_EXEC
++#define __P101        PAGE_READONLY_EXEC
++#define __P110        PAGE_COPY_EXEC
++#define __P111        PAGE_COPY_EXEC
++
++#define __S000        PAGE_NONE
++#define __S001        PAGE_READONLY
++#define __S010        PAGE_SHARED
++#define __S011        PAGE_SHARED
++#define __S100        PAGE_READONLY_EXEC
++#define __S101        PAGE_READONLY_EXEC
++#define __S110        PAGE_SHARED_EXEC
++#define __S111        PAGE_SHARED_EXEC
++
++/*
++ * early identity mapping  pte attrib macros.
++ */
++#ifdef CONFIG_X86_64
++#define __PAGE_KERNEL_IDENT_LARGE_EXEC        __PAGE_KERNEL_LARGE_EXEC
++#else
++/*
++ * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
++ * bits are combined, this will allow user to access the high address mapped
++ * VDSO in the presence of CONFIG_COMPAT_VDSO
++ */
++#define PTE_IDENT_ATTR         0x003          /* PRESENT+RW */
++#define PDE_IDENT_ATTR         0x067          /* PRESENT+RW+USER+DIRTY+ACCESSED */
++#define PGD_IDENT_ATTR         0x001          /* PRESENT (no other attributes) */
++#endif
++
++#ifdef CONFIG_X86_32
++# include <asm/pgtable_32_types.h>
++#else
++# include "pgtable_64_types.h"
++#endif
++
++#ifndef __ASSEMBLY__
++
++#include <linux/types.h>
++
++/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
++#define PTE_PFN_MASK          ((pteval_t)PHYSICAL_PAGE_MASK)
++
++/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
++#define PTE_FLAGS_MASK                (~PTE_PFN_MASK)
++
++typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
++
++#include <asm/maddr.h>
++
++typedef struct { pgdval_t pgd; } pgd_t;
++
++#define __pgd_ma(x) ((pgd_t) { (x) } )
++static inline pgd_t xen_make_pgd(pgdval_t val)
++{
++      pgdval_t machine;
++
++      /* Entries without _PAGE_PRESENT are stored unchanged. */
++      if (unlikely(!(val & _PAGE_PRESENT)))
++              return (pgd_t) { val };
++
++      machine = pte_phys_to_machine(val);
++      return (pgd_t) { machine };
++}
++
++#define __pgd_val(x) ((x).pgd)
++static inline pgdval_t xen_pgd_val(pgd_t pgd)
++{
++      pgdval_t ret = __pgd_val(pgd);  /* raw value as stored in the entry */
++#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
++      if (likely(ret))        /* this configuration: any non-zero entry is translated... */
++              ret = machine_to_phys(ret) | _PAGE_PRESENT;     /* ...and reported with _PAGE_PRESENT set */
++#else
++      if (likely(ret & _PAGE_PRESENT))        /* otherwise only present entries are translated */
++              ret = pte_machine_to_phys(ret);
++#endif
++      return ret;
++}
++
++static inline pgdval_t pgd_flags(pgd_t pgd)
++{
++      return __pgd_val(pgd) & PTE_FLAGS_MASK;
++}
++
++#if PAGETABLE_LEVELS > 3
++typedef struct { pudval_t pud; } pud_t;
++
++#define __pud_ma(x) ((pud_t) { (x) } )
++static inline pud_t xen_make_pud(pudval_t val)
++{
++      pudval_t machine;
++
++      /* Entries without _PAGE_PRESENT are stored unchanged. */
++      if (unlikely(!(val & _PAGE_PRESENT)))
++              return (pud_t) { val };
++
++      machine = pte_phys_to_machine(val);
++      return (pud_t) { machine };
++}
++
++#define __pud_val(x) ((x).pud)
++static inline pudval_t xen_pud_val(pud_t pud)
++{
++      pudval_t raw = __pud_val(pud);
++
++      /* Non-present entries are returned exactly as stored. */
++      if (unlikely(!(raw & _PAGE_PRESENT)))
++              return raw;
++
++      raw = pte_machine_to_phys(raw);
++      return raw;
++}
++#else
++#include <asm-generic/pgtable-nopud.h>
++
++#define __pud_val(x) __pgd_val((x).pgd)
++static inline pudval_t xen_pud_val(pud_t pud)
++{
++      return xen_pgd_val(pud.pgd);
++}
++#endif
++
++#if PAGETABLE_LEVELS > 2
++typedef struct { pmdval_t pmd; } pmd_t;
++
++#define __pmd_ma(x)   ((pmd_t) { (x) } )
++static inline pmd_t xen_make_pmd(pmdval_t val)
++{
++      pmdval_t machine;
++
++      /* Entries without _PAGE_PRESENT are stored unchanged. */
++      if (unlikely(!(val & _PAGE_PRESENT)))
++              return (pmd_t) { val };
++
++      machine = pte_phys_to_machine(val);
++      return (pmd_t) { machine };
++}
++
++#define __pmd_val(x) ((x).pmd)
++static inline pmdval_t xen_pmd_val(pmd_t pmd)
++{
++      pmdval_t ret = __pmd_val(pmd);  /* raw value as stored in the entry */
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (likely(ret))        /* this configuration: any non-zero entry is translated... */
++              ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; /* ...and reported with _PAGE_PRESENT set */
++#else
++      if (likely(ret & _PAGE_PRESENT))        /* otherwise only present entries are translated */
++              ret = pte_machine_to_phys(ret);
++#endif
++      return ret;
++}
++#else
++#include <asm-generic/pgtable-nopmd.h>
++
++#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
++#define __pmd_val(x) __pgd_val((x).pud.pgd)
++static inline pmdval_t xen_pmd_val(pmd_t pmd)
++{
++      return xen_pgd_val(pmd.pud.pgd);
++}
++#endif
++
++static inline pudval_t pud_flags(pud_t pud)
++{
++      return __pud_val(pud) & PTE_FLAGS_MASK;
++}
++
++static inline pmdval_t pmd_flags(pmd_t pmd)
++{
++      return __pmd_val(pmd) & PTE_FLAGS_MASK;
++}
++
++#define __pte_ma(x) ((pte_t) { .pte = (x) } )
++static inline pte_t xen_make_pte(pteval_t val)
++{
++      const pteval_t type_bits = val & (_PAGE_PRESENT | _PAGE_IOMAP);
++
++      /* Translate only entries that are present and not flagged _PAGE_IOMAP. */
++      if (likely(type_bits == _PAGE_PRESENT))
++              val = pte_phys_to_machine(val);
++
++      return (pte_t) { .pte = val };
++}
++
++#define __pte_val(x) ((x).pte)
++static inline pteval_t xen_pte_val(pte_t pte)
++{
++      pteval_t raw = __pte_val(pte);
++
++      /*
++       * The flag test reads the pte_low member; entries that are not
++       * present, or that carry _PAGE_IOMAP, are returned as stored.
++       */
++      if (unlikely((pte.pte_low & (_PAGE_PRESENT|_PAGE_IOMAP)) != _PAGE_PRESENT))
++              return raw;
++
++      raw = pte_machine_to_phys(raw);
++      return raw;
++}
++
++static inline pteval_t pte_flags(pte_t pte)
++{
++      return __pte_val(pte) & PTE_FLAGS_MASK;
++}
++
++#define pgprot_val(x) ((x).pgprot)
++#define __pgprot(x)   ((pgprot_t) { (x) } )
++
++
++typedef struct page *pgtable_t;
++
++extern pteval_t __supported_pte_mask;
++extern void set_nx(void);
++extern int nx_enabled;
++
++#define pgprot_writecombine   pgprot_writecombine
++extern pgprot_t pgprot_writecombine(pgprot_t prot);
++
++#ifndef CONFIG_XEN
++/* Indicate that x86 has its own track and untrack pfn vma functions */
++#define __HAVE_PFNMAP_TRACKING
++#endif
++
++#define __HAVE_PHYS_MEM_ACCESS_PROT
++struct file;
++pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
++                              unsigned long size, pgprot_t vma_prot);
++int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
++                              unsigned long size, pgprot_t *vma_prot);
++
++/* Install a pte for a particular vaddr in kernel space. */
++void set_pte_vaddr(unsigned long vaddr, pte_t pte);
++
++extern void xen_pagetable_reserve(u64 start, u64 end);
++
++struct seq_file;
++extern void arch_report_meminfo(struct seq_file *m);
++
++enum {
++      PG_LEVEL_NONE,
++      PG_LEVEL_4K,
++      PG_LEVEL_2M,
++      PG_LEVEL_1G,
++      PG_LEVEL_NUM
++};
++
++#ifdef CONFIG_PROC_FS
++extern void update_page_count(int level, unsigned long pages);
++#else
++static inline void update_page_count(int level, unsigned long pages) { }
++#endif
++
++/*
++ * Helper function that returns the kernel pagetable entry controlling
++ * the virtual address 'address'. NULL means no pagetable entry present.
++ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
++ * as a pte too.
++ */
++extern pte_t *lookup_address(unsigned long address, unsigned int *level);
++
++#endif        /* !__ASSEMBLY__ */
++
++#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --cc arch/x86/include/mach-xen/asm/processor.h

index 0000000,0000000..00779cf

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/processor.h
@@@ -1,0 -1,0 +1,976 @@@
++#ifndef _ASM_X86_PROCESSOR_H
++#define _ASM_X86_PROCESSOR_H
++
++#include <asm/processor-flags.h>
++
++/* Forward declaration, a strange C thing */
++struct task_struct;
++struct mm_struct;
++
++#include <asm/vm86.h>
++#include <asm/math_emu.h>
++#include <asm/segment.h>
++#include <asm/types.h>
++#include <asm/sigcontext.h>
++#include <asm/current.h>
++#include <asm/cpufeature.h>
++#include <asm/system.h>
++#include <asm/page.h>
++#include <asm/pgtable_types.h>
++#include <asm/percpu.h>
++#include <asm/msr.h>
++#include <asm/desc_defs.h>
++#include <asm/nops.h>
++
++#include <linux/personality.h>
++#include <linux/cpumask.h>
++#include <linux/cache.h>
++#include <linux/threads.h>
++#include <linux/math64.h>
++#include <linux/init.h>
++#include <linux/err.h>
++
++#include <xen/interface/physdev.h>
++
++#define HBP_NUM 4
++/*
++ * Default implementation of an inline helper that returns the current
++ * instruction pointer ("program counter").
++ */
++static inline void *current_text_addr(void)
++{
++      void *pc;
++
++      asm volatile("mov $1f, %0; 1:":"=r" (pc)); /* pc = address of local label 1 */
++
++      return pc;
++}
++
++#ifdef CONFIG_X86_VSMP
++# define ARCH_MIN_TASKALIGN           (1 << INTERNODE_CACHE_SHIFT)
++# define ARCH_MIN_MMSTRUCT_ALIGN      (1 << INTERNODE_CACHE_SHIFT)
++#else
++# define ARCH_MIN_TASKALIGN           16
++# define ARCH_MIN_MMSTRUCT_ALIGN      0
++#endif
++
++/*
++ *  CPU type and hardware bug flags. Kept separately for each CPU.
++ *  Members of this structure are referenced in head.S, so think twice
++ *  before touching them. [mj]
++ */
++
++struct cpuinfo_x86 {
++      __u8                    x86;            /* CPU family */
++      __u8                    x86_vendor;     /* CPU vendor */
++      __u8                    x86_model;
++      __u8                    x86_mask;
++#ifdef CONFIG_X86_32
++      char                    wp_works_ok;    /* It doesn't on 386's */
++
++      /* Problems on some 486Dx4's and old 386's: */
++#ifndef CONFIG_XEN
++      char                    hlt_works_ok;
++#endif
++      char                    hard_math;
++#ifndef CONFIG_XEN
++      char                    rfu;
++      char                    fdiv_bug;
++      char                    f00f_bug;
++      char                    coma_bug;
++      char                    pad0;
++#endif
++#else
++      /* Number of 4K pages in DTLB/ITLB combined (in pages): */
++      int                     x86_tlbsize;
++#endif
++      __u8                    x86_virt_bits;
++      __u8                    x86_phys_bits;
++#ifndef CONFIG_XEN
++      /* CPUID returned core id bits: */
++      __u8                    x86_coreid_bits;
++#endif
++      /* Max extended CPUID function supported: */
++      __u32                   extended_cpuid_level;
++      /* Maximum supported CPUID level, -1=no CPUID: */
++      int                     cpuid_level;
++      __u32                   x86_capability[NCAPINTS];
++      char                    x86_vendor_id[16];
++      char                    x86_model_id[64];
++      /* in KB - valid for CPUS which support this call: */
++      int                     x86_cache_size;
++      int                     x86_cache_alignment;    /* In bytes */
++      int                     x86_power;
++      unsigned long           loops_per_jiffy;
++#ifndef CONFIG_XEN
++      /* cpuid returned max cores value: */
++      u16                      x86_max_cores;
++      u16                     apicid;
++      u16                     initial_apicid;
++#endif
++      u16                     x86_clflush_size;
++#ifdef CONFIG_X86_HT
++      /* number of cores as seen by the OS: */
++      u16                     booted_cores;
++      /* Physical processor id: */
++      u16                     phys_proc_id;
++      /* Core id: */
++      u16                     cpu_core_id;
++      /* Compute unit id */
++      u8                      compute_unit_id;
++#endif
++#ifdef CONFIG_SMP
++      /* Index into per_cpu list: */
++      u16                     cpu_index;
++#endif
++} __attribute__((__aligned__(SMP_CACHE_BYTES)));
++
++#define X86_VENDOR_INTEL      0
++#define X86_VENDOR_CYRIX      1
++#define X86_VENDOR_AMD                2
++#define X86_VENDOR_UMC                3
++#define X86_VENDOR_CENTAUR    5
++#define X86_VENDOR_TRANSMETA  7
++#define X86_VENDOR_NSC                8
++#define X86_VENDOR_NUM                9
++
++#define X86_VENDOR_UNKNOWN    0xff
++
++/*
++ * capabilities of CPUs
++ */
++extern struct cpuinfo_x86     boot_cpu_data;
++extern struct cpuinfo_x86     new_cpu_data;
++
++extern __u32                  cpu_caps_cleared[NCAPINTS];
++extern __u32                  cpu_caps_set[NCAPINTS];
++
++#ifdef CONFIG_SMP
++DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
++#define cpu_data(cpu)         per_cpu(cpu_info, cpu)
++#else
++#define cpu_info              boot_cpu_data
++#define cpu_data(cpu)         boot_cpu_data
++#endif
++
++extern const struct seq_operations cpuinfo_op;
++
++static inline int hlt_works(int cpu)
++{
++#if !defined(CONFIG_X86_32) || defined(CONFIG_XEN)
++      return 1;       /* struct cpuinfo_x86 has no hlt_works_ok here */
++#else
++      return cpu_data(cpu).hlt_works_ok;
++#endif
++}
++
++#define cache_line_size()     (boot_cpu_data.x86_cache_alignment)
++
++extern void cpu_detect(struct cpuinfo_x86 *c);
++
++extern struct pt_regs *idle_regs(struct pt_regs *);
++
++extern void early_cpu_init(void);
++extern void identify_boot_cpu(void);
++extern void identify_secondary_cpu(struct cpuinfo_x86 *);
++extern void print_cpu_info(struct cpuinfo_x86 *);
++extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
++extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
++extern unsigned short num_cache_leaves;
++
++extern void detect_extended_topology(struct cpuinfo_x86 *c);
++extern void detect_ht(struct cpuinfo_x86 *c);
++
++static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
++                           unsigned int *ecx, unsigned int *edx)
++{
++      /* ecx is often an input as well as an output. */
++      asm volatile(XEN_CPUID
++          : "=a" (*eax),                    /* operand 0 */
++            "=b" (*ebx),                    /* operand 1 */
++            "=c" (*ecx),                    /* operand 2 */
++            "=d" (*edx)                     /* operand 3 */
++          : "0" (*eax), "2" (*ecx));        /* inputs share operands 0 and 2 */
++}
++
++static inline void load_cr3(pgd_t *pgdir)
++{
++      write_cr3(__pa(pgdir));
++}
++
++#ifndef CONFIG_X86_NO_TSS
++#ifdef CONFIG_X86_32
++/* This is the TSS defined by the hardware. */
++struct x86_hw_tss {
++      unsigned short          back_link, __blh;
++      unsigned long           sp0;
++      unsigned short          ss0, __ss0h;
++      unsigned long           sp1;
++      /* ss1 caches MSR_IA32_SYSENTER_CS: */
++      unsigned short          ss1, __ss1h;
++      unsigned long           sp2;
++      unsigned short          ss2, __ss2h;
++      unsigned long           __cr3;
++      unsigned long           ip;
++      unsigned long           flags;
++      unsigned long           ax;
++      unsigned long           cx;
++      unsigned long           dx;
++      unsigned long           bx;
++      unsigned long           sp;
++      unsigned long           bp;
++      unsigned long           si;
++      unsigned long           di;
++      unsigned short          es, __esh;
++      unsigned short          cs, __csh;
++      unsigned short          ss, __ssh;
++      unsigned short          ds, __dsh;
++      unsigned short          fs, __fsh;
++      unsigned short          gs, __gsh;
++      unsigned short          ldt, __ldth;
++      unsigned short          trace;
++      unsigned short          io_bitmap_base;
++
++} __attribute__((packed));
++extern struct tss_struct doublefault_tss;
++#else
++struct x86_hw_tss {
++      u32                     reserved1;
++      u64                     sp0;
++      u64                     sp1;
++      u64                     sp2;
++      u64                     reserved2;
++      u64                     ist[7];
++      u32                     reserved3;
++      u32                     reserved4;
++      u16                     reserved5;
++      u16                     io_bitmap_base;
++
++} __attribute__((packed)) ____cacheline_aligned;
++#endif
++#endif /* CONFIG_X86_NO_TSS */
++
++/*
++ * IO-bitmap sizes:
++ */
++#define IO_BITMAP_BITS                        65536
++#define IO_BITMAP_BYTES                       (IO_BITMAP_BITS/8)
++#define IO_BITMAP_LONGS                       (IO_BITMAP_BYTES/sizeof(long))
++#define IO_BITMAP_OFFSET              offsetof(struct tss_struct, io_bitmap)
++#define INVALID_IO_BITMAP_OFFSET      0x8000
++
++#ifndef CONFIG_X86_NO_TSS
++struct tss_struct {
++      /*
++       * The hardware state:
++       */
++      struct x86_hw_tss       x86_tss;
++
++      /*
++       * The extra 1 is there because the CPU will access an
++       * additional byte beyond the end of the IO permission
++       * bitmap. The extra byte must be all 1 bits, and must
++       * be within the limit.
++       */
++      unsigned long           io_bitmap[IO_BITMAP_LONGS + 1];
++
++      /*
++       * .. and then another 0x100 bytes for the emergency kernel stack:
++       */
++      unsigned long           stack[64];
++
++} ____cacheline_aligned;
++
++DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
++
++/*
++ * Save the original ist values for checking stack pointers during debugging
++ */
++struct orig_ist {
++      unsigned long           ist[7];
++};
++#endif /* CONFIG_X86_NO_TSS */
++
++#define       MXCSR_DEFAULT           0x1f80
++
++struct i387_fsave_struct {
++      u32                     cwd;    /* FPU Control Word             */
++      u32                     swd;    /* FPU Status Word              */
++      u32                     twd;    /* FPU Tag Word                 */
++      u32                     fip;    /* FPU IP Offset                */
++      u32                     fcs;    /* FPU IP Selector              */
++      u32                     foo;    /* FPU Operand Pointer Offset   */
++      u32                     fos;    /* FPU Operand Pointer Selector */
++
++      /* 8*10 bytes for each FP-reg = 80 bytes:                       */
++      u32                     st_space[20];
++
++      /* Software status information [not touched by FSAVE ]:         */
++      u32                     status;
++};
++
++struct i387_fxsave_struct {
++      u16                     cwd; /* Control Word                    */
++      u16                     swd; /* Status Word                     */
++      u16                     twd; /* Tag Word                        */
++      u16                     fop; /* Last Instruction Opcode         */
++      union {
++              struct {
++                      u64     rip; /* Instruction Pointer             */
++                      u64     rdp; /* Data Pointer                    */
++              };
++              struct {
++                      u32     fip; /* FPU IP Offset                   */
++                      u32     fcs; /* FPU IP Selector                 */
++                      u32     foo; /* FPU Operand Offset              */
++                      u32     fos; /* FPU Operand Selector            */
++              };
++      };
++      u32                     mxcsr;          /* MXCSR Register State */
++      u32                     mxcsr_mask;     /* MXCSR Mask           */
++
++      /* 8*16 bytes for each FP-reg = 128 bytes:                      */
++      u32                     st_space[32];
++
++      /* 16*16 bytes for each XMM-reg = 256 bytes:                    */
++      u32                     xmm_space[64];
++
++      u32                     padding[12];
++
++      union {
++              u32             padding1[12];
++              u32             sw_reserved[12];
++      };
++
++} __attribute__((aligned(16)));
++
++struct i387_soft_struct {
++      u32                     cwd;
++      u32                     swd;
++      u32                     twd;
++      u32                     fip;
++      u32                     fcs;
++      u32                     foo;
++      u32                     fos;
++      /* 8*10 bytes for each FP-reg = 80 bytes: */
++      u32                     st_space[20];
++      u8                      ftop;
++      u8                      changed;
++      u8                      lookahead;
++      u8                      no_update;
++      u8                      rm;
++      u8                      alimit;
++      struct math_emu_info    *info;
++      u32                     entry_eip;
++};
++
++struct ymmh_struct {
++      /* 16 * 16 bytes for each YMMH-reg = 256 bytes */
++      u32 ymmh_space[64];
++};
++
++struct xsave_hdr_struct {
++      u64 xstate_bv;
++      u64 reserved1[2];
++      u64 reserved2[5];
++} __attribute__((packed));
++
++struct xsave_struct {
++      struct i387_fxsave_struct i387;
++      struct xsave_hdr_struct xsave_hdr;
++      struct ymmh_struct ymmh;
++      /* new processor state extensions will go here */
++} __attribute__ ((packed, aligned (64)));
++
++union thread_xstate {
++      struct i387_fsave_struct        fsave;
++      struct i387_fxsave_struct       fxsave;
++      struct i387_soft_struct         soft;
++      struct xsave_struct             xsave;
++};
++
++struct fpu {
++      union thread_xstate *state;
++};
++
++#ifdef CONFIG_X86_64
++#ifndef CONFIG_X86_NO_TSS
++DECLARE_PER_CPU(struct orig_ist, orig_ist);
++#endif
++
++union irq_stack_union {
++      char irq_stack[IRQ_STACK_SIZE];
++      /*
++       * GCC hardcodes the stack canary as %gs:40.  Since the
++       * irq_stack is the object at %gs:0, we reserve the bottom
++       * 48 bytes of the irq stack for the canary.
++       */
++      struct {
++              char gs_base[40];
++              unsigned long stack_canary;
++      };
++};
++
++DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union);
++DECLARE_INIT_PER_CPU(irq_stack_union);
++
++DECLARE_PER_CPU(char *, irq_stack_ptr);
++DECLARE_PER_CPU(unsigned int, irq_count);
++extern unsigned long kernel_eflags;
++extern asmlinkage void ignore_sysret(void);
++#else /* X86_64 */
++#ifdef CONFIG_CC_STACKPROTECTOR
++/*
++ * Make sure stack canary segment base is cache-aligned:
++ *   "For Intel Atom processors, avoid non zero segment base address
++ *    that is not aligned to cache line boundary at all cost."
++ * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
++ */
++struct stack_canary {
++      char __pad[20];         /* canary at %gs:20 */
++      unsigned long canary;
++};
++DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
++#endif
++#endif        /* X86_64 */
++
++extern unsigned int xstate_size;
++extern void free_thread_xstate(struct task_struct *);
++extern struct kmem_cache *task_xstate_cachep;
++
++struct perf_event;
++
++struct thread_struct {
++      /* Cached TLS descriptors: */
++      struct desc_struct      tls_array[GDT_ENTRY_TLS_ENTRIES];
++      unsigned long           sp0;
++      unsigned long           sp;
++#ifdef CONFIG_X86_32
++      unsigned long           sysenter_cs;
++#else
++      unsigned short          es;
++      unsigned short          ds;
++      unsigned short          fsindex;
++      unsigned short          gsindex;
++#endif
++#ifdef CONFIG_X86_32
++      unsigned long           ip;
++#endif
++#ifdef CONFIG_X86_64
++      unsigned long           fs;
++#endif
++      unsigned long           gs;
++      /* Save middle states of ptrace breakpoints */
++      struct perf_event       *ptrace_bps[HBP_NUM];
++      /* Debug status used for traps, single steps, etc... */
++      unsigned long           debugreg6;
++      /* Keep track of the exact dr7 value set by the user */
++      unsigned long           ptrace_dr7;
++      /* Fault info: */
++      unsigned long           cr2;
++      unsigned long           trap_no;
++      unsigned long           error_code;
++      /* floating point and extended processor state */
++      struct fpu              fpu;
++#ifdef CONFIG_X86_32
++      /* Virtual 86 mode info */
++      struct vm86_struct __user *vm86_info;
++      unsigned long           screen_bitmap;
++      unsigned long           v86flags, v86mask, saved_sp0;
++      unsigned int            saved_fs, saved_gs;
++#endif
++      /* IO permissions: */
++      unsigned long           *io_bitmap_ptr;
++      unsigned long           iopl;
++      /* Max allowed port in the bitmap, in bytes: */
++      unsigned                io_bitmap_max;
++};
++
++static inline unsigned long xen_get_debugreg(int regno)
++{
++      return HYPERVISOR_get_debugreg(regno);
++}
++
++static inline void xen_set_debugreg(int regno, unsigned long value)
++{
++      WARN_ON(HYPERVISOR_set_debugreg(regno, value));
++}
++
++/* Set IOPL bits in EFLAGS from given mask (level read from mask bits 12-13) */
++static inline void xen_set_iopl_mask(unsigned mask)
++{
++      struct physdev_set_iopl op;
++
++      op.iopl = 1;            /* level used when the mask is zero */
++      if (mask != 0)
++              op.iopl = (mask >> 12) & 3;
++      /* Force the change at ring 0. */
++      WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &op));
++}
++
++#ifndef CONFIG_X86_NO_TSS
++static inline void
++native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
++{
++      tss->x86_tss.sp0 = thread->sp0;
++#ifdef CONFIG_X86_32
++      /* Only happens when SEP is enabled, no need to test "SEP"arately: */
++      if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
++              tss->x86_tss.ss1 = thread->sysenter_cs; /* ss1 caches the MSR value */
++              wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
++      }
++#endif /* CONFIG_X86_32 */
++}
++#else /* CONFIG_X86_NO_TSS: the tss argument is not used by this variant */
++#define xen_load_sp0(tss, thread) do { \
++      if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
++              BUG(); \
++} while (0)
++#endif /* !CONFIG_X86_NO_TSS */
++
++#define __cpuid                       xen_cpuid
++#define paravirt_enabled()    1
++
++/*
++ * These special macros can be used to get or set a debugging register
++ */
++#define get_debugreg(var, register)                           \
++      (var) = xen_get_debugreg(register)
++#define set_debugreg(value, register)                         \
++      xen_set_debugreg(register, value)
++
++#define load_sp0 xen_load_sp0
++
++#define set_iopl_mask xen_set_iopl_mask
++
++/*
++ * Save the cr4 feature set we're using (i.e.
++ * Pentium 4MB enable and PPro Global page
++ * enable), so that any CPUs that boot up
++ * after us can get the correct flags.
++ */
++extern unsigned long          mmu_cr4_features;
++
++static inline void set_in_cr4(unsigned long mask)
++{
++      /*
++       * Record the bits in mmu_cr4_features first, then set them
++       * in CR4 itself with a single read-modify-write.
++       */
++      mmu_cr4_features |= mask;
++      write_cr4(read_cr4() | mask);
++}
++
++static inline void clear_in_cr4(unsigned long mask)
++{
++      /*
++       * Drop the bits from mmu_cr4_features first, then clear them
++       * in CR4 itself with a single read-modify-write.
++       */
++      mmu_cr4_features &= ~mask;
++      write_cr4(read_cr4() & ~mask);
++}
++
++typedef struct {
++      unsigned long           seg;
++} mm_segment_t;
++
++
++/*
++ * create a kernel thread without removing it from tasklists
++ */
++extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
++
++/* Free all resources held by a thread. */
++extern void release_thread(struct task_struct *);
++
++/* Prepare to copy thread state - unlazy all lazy state */
++extern void prepare_to_copy(struct task_struct *tsk);
++
++unsigned long get_wchan(struct task_struct *p);
++
++/*
++ * Generic CPUID function
++ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
++ * resulting in stale register contents being returned.
++ */
++static inline void cpuid(unsigned int op,
++                       unsigned int *eax, unsigned int *ebx,
++                       unsigned int *ecx, unsigned int *edx)
++{
++      *eax = op;
++      *ecx = 0;
++      __cpuid(eax, ebx, ecx, edx);
++}
++
++/* Some CPUID calls want 'count' to be placed in ecx */
++static inline void cpuid_count(unsigned int op, int count,
++                             unsigned int *eax, unsigned int *ebx,
++                             unsigned int *ecx, unsigned int *edx)
++{
++      *eax = op;
++      *ecx = count;
++      __cpuid(eax, ebx, ecx, edx);
++}
++
++/*
++ * CPUID functions returning a single datum
++ */
++static inline unsigned int cpuid_eax(unsigned int op)
++{
++      unsigned int regs[4];   /* eax, ebx, ecx, edx */
++
++      cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
++
++      return regs[0];
++}
++
++static inline unsigned int cpuid_ebx(unsigned int op)
++{
++      unsigned int regs[4];   /* eax, ebx, ecx, edx */
++
++      cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
++
++      return regs[1];
++}
++
++static inline unsigned int cpuid_ecx(unsigned int op)
++{
++      unsigned int regs[4];   /* eax, ebx, ecx, edx */
++
++      cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
++
++      return regs[2];
++}
++
++static inline unsigned int cpuid_edx(unsigned int op)
++{
++      unsigned int regs[4];   /* eax, ebx, ecx, edx */
++
++      cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
++
++      return regs[3];
++}
++
++/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
++static inline void rep_nop(void)
++{
++      asm volatile("rep; nop" ::: "memory"); /* "memory" clobber: compiler barrier too */
++}
++
++static inline void cpu_relax(void)
++{
++      rep_nop();
++}
++
++/* Stop speculative execution and prefetching of modified code. */
++static inline void sync_core(void)
++{
++      int tmp;        /* receives eax from cpuid; never read afterwards */
++
++#if defined(CONFIG_M386) || defined(CONFIG_M486)
++      if (boot_cpu_data.x86 < 5)      /* CPU family below 5 */
++              /* There is no speculative execution.
++               * jmp is a barrier to prefetching. */
++              asm volatile("jmp 1f\n1:\n" ::: "memory");
++      else    /* pairs with the cpuid asm that follows the #endif */
++#endif /* CONFIG_M386 || CONFIG_M486 */
++              /* cpuid is a barrier to speculative execution.
++               * Prefetched instructions are automatically
++               * invalidated when modified. */
++              asm volatile("cpuid" : "=a" (tmp) : "0" (1)
++                           : "ebx", "ecx", "edx", "memory");
++}
++
++static inline void __monitor(const void *eax, unsigned long ecx,
++                           unsigned long edx)
++{
++      /* "monitor %eax, %ecx, %edx;" */
++      asm volatile(".byte 0x0f, 0x01, 0xc8;"
++                   :: "a" (eax), "c" (ecx), "d"(edx));
++}
++
++static inline void __mwait(unsigned long eax, unsigned long ecx)
++{
++      /* "mwait %eax, %ecx;" */
++      asm volatile(".byte 0x0f, 0x01, 0xc9;"
++                   :: "a" (eax), "c" (ecx));
++}
++
++static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
++{
++      trace_hardirqs_on();    /* called before the "sti" in the asm below */
++      /* "sti; mwait %eax, %ecx;" */
++      asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
++                   :: "a" (eax), "c" (ecx));
++}
++
++extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
++
++extern void select_idle_routine(const struct cpuinfo_x86 *c);
++extern void init_c1e_mask(void);
++
++extern unsigned long          boot_option_idle_override;
++extern bool                   c1e_detected;
++
++enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
++                       IDLE_POLL, IDLE_FORCE_MWAIT};
++
++extern void enable_sep_cpu(void);
++extern int sysenter_setup(void);
++
++extern void early_trap_init(void);
++
++/* Defined in head.S */
++extern struct desc_ptr                early_gdt_descr;
++
++extern void cpu_set_gdt(int);
++extern void switch_to_new_gdt(int);
++extern void load_percpu_segment(int);
++extern void cpu_init(void);
++
++static inline unsigned long get_debugctlmsr(void)
++{
++      unsigned long debugctlmsr = 0;
++
++#ifndef CONFIG_X86_DEBUGCTLMSR
++      if (boot_cpu_data.x86 < 6)      /* CPU family below 6: skip the MSR read */
++              return 0;
++#endif /* !CONFIG_X86_DEBUGCTLMSR */
++      rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
++
++      return debugctlmsr;
++}
++
++static inline void update_debugctlmsr(unsigned long debugctlmsr)
++{
++#ifndef CONFIG_X86_DEBUGCTLMSR
++      if (boot_cpu_data.x86 < 6)      /* CPU family below 6: skip the MSR write */
++              return;
++#endif /* !CONFIG_X86_DEBUGCTLMSR */
++      wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
++}
++
++/*
++ * from system description table in BIOS. Mostly for MCA use, but
++ * others may find it useful:
++ */
++extern unsigned int           machine_id;
++extern unsigned int           machine_submodel_id;
++extern unsigned int           BIOS_revision;
++
++/* Boot loader type from the setup header: */
++extern int                    bootloader_type;
++extern int                    bootloader_version;
++
++extern char                   ignore_fpu_irq;
++
++#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
++#define ARCH_HAS_PREFETCHW
++#define ARCH_HAS_SPINLOCK_PREFETCH
++
++#ifdef CONFIG_X86_32
++# define BASE_PREFETCH                ASM_NOP4
++# define ARCH_HAS_PREFETCH
++#else
++# define BASE_PREFETCH                "prefetcht0 (%1)"
++#endif
++
++/*
++ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
++ *
++ * It's not worth caring about 3dnow prefetches for the K6
++ * because they are microcoded there and very slow.
++ */
++static inline void prefetch(const void *x)
++{
++      alternative_input(BASE_PREFETCH,
++                        "prefetchnta (%1)",
++                        X86_FEATURE_XMM,
++                        "r" (x));
++}
++
++/*
++ * 3dnow prefetch to get an exclusive cache line.
++ * Useful for spinlocks to avoid one state transition in the
++ * cache coherency protocol:
++ */
++static inline void prefetchw(const void *x)
++{
++      alternative_input(BASE_PREFETCH,
++                        "prefetchw (%1)",
++                        X86_FEATURE_3DNOW,
++                        "r" (x));
++}
++
++static inline void spin_lock_prefetch(const void *x)
++{
++      prefetchw(x);
++}
++
++#ifdef CONFIG_X86_32
++/*
++ * User space process size: 3GB (default).
++ */
++#define TASK_SIZE             PAGE_OFFSET
++#define TASK_SIZE_MAX         TASK_SIZE
++#define STACK_TOP             TASK_SIZE
++#define STACK_TOP_MAX         STACK_TOP
++
++#define INIT_THREAD  {                                                          \
++      .sp0                    = sizeof(init_stack) + (long)&init_stack, \
++      .vm86_info              = NULL,                                   \
++      .sysenter_cs            = __KERNEL_CS,                            \
++      .io_bitmap_ptr          = NULL,                                   \
++}
++
++/*
++ * Note that the .io_bitmap member must be extra-big. This is because
++ * the CPU will access an additional byte beyond the end of the IO
++ * permission bitmap. The extra byte must be all 1 bits, and must
++ * be within the limit.
++ */
++#define INIT_TSS  {                                                     \
++      .x86_tss = {                                                      \
++              .sp0            = sizeof(init_stack) + (long)&init_stack, \
++              .ss0            = __KERNEL_DS,                            \
++              .ss1            = __KERNEL_CS,                            \
++              .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,               \
++       },                                                               \
++      .io_bitmap              = { [0 ... IO_BITMAP_LONGS] = ~0 },       \
++}
++
++extern unsigned long thread_saved_pc(struct task_struct *tsk);
++
++#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
++#define KSTK_TOP(info)                                                 \
++({                                                                     \
++       unsigned long *__ptr = (unsigned long *)(info);                 \
++       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
++})
++
++/*
++ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
++ * This is necessary to guarantee that the entire "struct pt_regs"
++ * is accessible even if the CPU hasn't stored the SS/ESP registers
++ * on the stack (interrupt gate does not save these registers
++ * when switching to the same priv ring).
++ * Therefore beware: accessing the ss/esp fields of the
++ * "struct pt_regs" is possible, but they may contain the
++ * completely wrong values.
++ */
++#define task_pt_regs(task)                                             \
++({                                                                     \
++       struct pt_regs *__regs__;                                       \
++       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
++       __regs__ - 1;                                                   \
++})
++
++#else
++/*
++ * User space process size. 47 bits minus one guard page.
++ */
++#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
++
++/* Top of user address space for a 32-bit (TIF_IA32) task: 3GB when
++ * the ADDR_LIMIT_3GB personality is set, else just under 4GB.
++ */
++#define IA32_PAGE_OFFSET      ((current->personality & ADDR_LIMIT_3GB) ? \
++                                      0xc0000000 : 0xFFFFe000)
++
++#define TASK_SIZE             (test_thread_flag(TIF_IA32) ? \
++                                      IA32_PAGE_OFFSET : TASK_SIZE_MAX)
++#define TASK_SIZE_OF(child)   ((test_tsk_thread_flag(child, TIF_IA32)) ? \
++                                      IA32_PAGE_OFFSET : TASK_SIZE_MAX)
++
++#define STACK_TOP             TASK_SIZE
++#define STACK_TOP_MAX         TASK_SIZE_MAX
++
++#define INIT_THREAD  { \
++      .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
++}
++
++#define INIT_TSS  { \
++      .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
++}
++
++/*
++ * Return saved PC of a blocked thread.
++ * What is this good for? It will always be the scheduler or ret_from_fork.
++ */
++#define thread_saved_pc(t)    (*(unsigned long *)((t)->thread.sp - 8))
++
++#define task_pt_regs(tsk)     ((struct pt_regs *)(tsk)->thread.sp0 - 1)
++#endif /* CONFIG_X86_64 */
++
++extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
++                                             unsigned long new_sp);
++
++/*
++ * This decides where the kernel will search for a free chunk of vm
++ * space during mmap's.
++ */
++#define TASK_UNMAPPED_BASE    (PAGE_ALIGN(TASK_SIZE / 3))
++
++#define KSTK_EIP(task)                (task_pt_regs(task)->ip)
++#define KSTK_ESP(task)                (task_pt_regs(task)->sp)
++
++/* Get/set a process' ability to use the timestamp counter instruction */
++#define GET_TSC_CTL(adr)      get_tsc_mode((adr))
++#define SET_TSC_CTL(val)      set_tsc_mode((val))
++
++extern int get_tsc_mode(unsigned long adr);
++extern int set_tsc_mode(unsigned int val);
++
++extern int amd_get_nb_id(int cpu);
++
++struct aperfmperf {
++      u64 aperf, mperf;
++};
++
++static inline void get_aperfmperf(struct aperfmperf *am)
++{
++      WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
++
++      rdmsrl(MSR_IA32_APERF, am->aperf);
++      rdmsrl(MSR_IA32_MPERF, am->mperf);
++}
++
++#define APERFMPERF_SHIFT 10
++
++static inline
++unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
++                                  struct aperfmperf *new)
++{
++      const u64 d_aperf = new->aperf - old->aperf;
++      const u64 d_mperf = (new->mperf - old->mperf) >> APERFMPERF_SHIFT;
++
++      /* Scaled MPERF delta of zero: nothing to divide by, so
++       * hand back the raw APERF delta instead. */
++      if (!d_mperf)
++              return d_aperf;
++
++      return div64_u64(d_aperf, d_mperf);
++}
++
++/*
++ * AMD errata checking
++ */
++#ifdef CONFIG_CPU_SUP_AMD
++extern const int amd_erratum_383[];
++extern const int amd_erratum_400[];
++extern bool cpu_has_amd_erratum(const int *);
++
++#define AMD_LEGACY_ERRATUM(...)               { -1, __VA_ARGS__, 0 }  /* -1 in the osvw_id slot */
++#define AMD_OSVW_ERRATUM(osvw_id, ...)        { osvw_id, __VA_ARGS__, 0 } /* list ends with 0 */
++#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) (((f) << 24) | \
++      ((m_start) << 16) | ((s_start) << 12) | ((m_end) << 4) | (s_end))
++#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)  /* f */
++#define AMD_MODEL_RANGE_START(range)  (((range) >> 12) & 0xfff) /* m_start, s_start */
++#define AMD_MODEL_RANGE_END(range)    ((range) & 0xfff)         /* m_end, s_end */
++
++#else
++#define cpu_has_amd_erratum(x)        (false)
++#endif /* CONFIG_CPU_SUP_AMD */
++
++#endif /* _ASM_X86_PROCESSOR_H */
diff --cc arch/x86/include/mach-xen/asm/setup.h

index 0000000,0000000..f2d7876

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/setup.h
@@@ -1,0 -1,0 +1,15 @@@
++#ifndef __ASSEMBLY__
++
++void xen_start_kernel(void);
++void xen_arch_setup(void);
++
++#ifdef CONFIG_X86_64
++void reserve_pfn_range(unsigned long pfn, unsigned long nr, char *);
++void reserve_pgtable_low(void);
++#endif
++
++extern unsigned long xen_initrd_start;
++
++#endif
++
++#include_next <asm/setup.h>
diff --cc arch/x86/include/mach-xen/asm/smp-processor-id.h

index 0000000,0000000..c6c1ec5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/smp-processor-id.h
@@@ -1,0 -1,0 +1,36 @@@
++#ifndef _ASM_X86_SMP_PROCESSOR_ID_H
++#define _ASM_X86_SMP_PROCESSOR_ID_H
++
++#if defined(CONFIG_SMP) && !defined(__ASSEMBLY__)
++
++#include <asm/percpu.h>
++
++DECLARE_PER_CPU(int, cpu_number);
++
++/*
++ * This function is needed by all SMP systems. It must _always_ be valid
++ * from the initial startup. We map APIC_BASE very early in page_setup(),
++ * so this is correct in the x86 case.
++ */
++#define raw_smp_processor_id() percpu_read(cpu_number)
++#define safe_smp_processor_id() smp_processor_id()
++
++/*
++ * CPU number taken from the thread_info located by AND-ing the stack
++ * pointer (%rsp) with CURRENT_MASK, i.e. ti->cpu rather than the per-CPU
++ * cpu_number variable.
++ */
++#ifdef CONFIG_X86_64_SMP
++#define stack_smp_processor_id()                                      \
++({                                                                    \
++      struct thread_info *ti;                                         \
++      __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));      \
++      ti->cpu;                                                        \
++})
++#endif
++
++#ifdef CONFIG_DEBUG_PREEMPT
++extern unsigned int debug_smp_processor_id(void);
++# define smp_processor_id() debug_smp_processor_id()
++#else
++# define smp_processor_id() raw_smp_processor_id()
++#endif
++
++#endif /* SMP && !__ASSEMBLY__ */
++
++#endif /* _ASM_X86_SMP_PROCESSOR_ID_H */
diff --cc arch/x86/include/mach-xen/asm/smp.h

index 0000000,0000000..810411e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/smp.h
@@@ -1,0 -1,0 +1,235 @@@
++#ifndef _ASM_X86_SMP_H
++#define _ASM_X86_SMP_H
++#ifndef __ASSEMBLY__
++#include <linux/cpumask.h>
++#include <linux/init.h>
++#include <asm/percpu.h>
++
++/*
++ * We need the APIC definitions automatically as part of 'smp.h'
++ */
++#ifdef CONFIG_X86_LOCAL_APIC
++# include <asm/mpspec.h>
++# include <asm/apic.h>
++# ifdef CONFIG_X86_IO_APIC
++#  include <asm/io_apic.h>
++# endif
++#endif
++#include <linux/thread_info.h>
++#include <asm/cpumask.h>
++#include <asm/cpufeature.h>
++
++extern unsigned int num_processors;
++
++#ifndef CONFIG_XEN
++/*
++ * True when the CPU reports hyper-threading (cpu_has_ht) and more than one
++ * sibling is present (smp_num_siblings > 1).  Always false without
++ * CONFIG_SMP.
++ */
++static inline bool cpu_has_ht_siblings(void)
++{
++#ifdef CONFIG_SMP
++      return cpu_has_ht && smp_num_siblings > 1;
++#else
++      return false;
++#endif
++}
++
++DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
++DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
++/* cpus sharing the last level cache: */
++DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
++DECLARE_PER_CPU(u16, cpu_llc_id);
++DECLARE_PER_CPU(int, cpu_number);
++#endif
++
++/*
++ * Sibling mask for @cpu.  In this header it is simply the single-CPU mask
++ * cpumask_of(cpu); the per-CPU cpu_sibling_map is not consulted.
++ */
++static inline const struct cpumask *cpu_sibling_mask(int cpu)
++{
++      return cpumask_of(cpu);
++}
++
++/*
++ * Core mask for @cpu.  In this header it is simply the single-CPU mask
++ * cpumask_of(cpu); the per-CPU cpu_core_map is not consulted.
++ */
++static inline const struct cpumask *cpu_core_mask(int cpu)
++{
++      return cpumask_of(cpu);
++}
++
++#ifndef CONFIG_XEN
++static inline struct cpumask *cpu_llc_shared_mask(int cpu)
++{
++      return per_cpu(cpu_llc_shared_map, cpu);
++}
++
++DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
++DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
++#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
++DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
++#endif
++#endif
++
++#ifdef CONFIG_SMP
++
++#ifndef CONFIG_XEN
++
++/* Static state in head.S used to set up a CPU */
++extern unsigned long stack_start; /* Initial stack pointer address */
++
++/*
++ * Table of SMP operation callbacks (only built when CONFIG_XEN is not set).
++ * The inline wrappers that follow in this header (smp_send_stop(),
++ * __cpu_up(), play_dead(), ...) call through the global smp_ops instance.
++ */
++struct smp_ops {
++      /* boot-time bring-up hooks */
++      void (*smp_prepare_boot_cpu)(void);
++      void (*smp_prepare_cpus)(unsigned max_cpus);
++      void (*smp_cpus_done)(unsigned max_cpus);
++
++      /* @wait is passed as 0 by smp_send_stop() and 1 by stop_other_cpus() */
++      void (*stop_other_cpus)(int wait);
++      void (*smp_send_reschedule)(int cpu);
++
++      /* CPU online/offline hooks */
++      int (*cpu_up)(unsigned cpu);
++      int (*cpu_disable)(void);
++      void (*cpu_die)(unsigned int cpu);
++      void (*play_dead)(void);
++
++      /* backends for arch_send_call_function_{ipi_mask,single_ipi}() */
++      void (*send_call_func_ipi)(const struct cpumask *mask);
++      void (*send_call_func_single_ipi)(int cpu);
++};
++
++/* Globals due to paravirt */
++extern void set_cpu_sibling_map(int cpu);
++
++extern struct smp_ops smp_ops;
++
++/* Invoke smp_ops.stop_other_cpus() with wait == 0. */
++static inline void smp_send_stop(void)
++{
++      smp_ops.stop_other_cpus(0);
++}
++
++/* Invoke smp_ops.stop_other_cpus() with wait == 1. */
++static inline void stop_other_cpus(void)
++{
++      smp_ops.stop_other_cpus(1);
++}
++
++static inline void smp_prepare_boot_cpu(void)
++{
++      smp_ops.smp_prepare_boot_cpu();
++}
++
++static inline void smp_prepare_cpus(unsigned int max_cpus)
++{
++      smp_ops.smp_prepare_cpus(max_cpus);
++}
++
++static inline void smp_cpus_done(unsigned int max_cpus)
++{
++      smp_ops.smp_cpus_done(max_cpus);
++}
++
++static inline int __cpu_up(unsigned int cpu)
++{
++      return smp_ops.cpu_up(cpu);
++}
++
++static inline int __cpu_disable(void)
++{
++      return smp_ops.cpu_disable();
++}
++
++static inline void __cpu_die(unsigned int cpu)
++{
++      smp_ops.cpu_die(cpu);
++}
++
++static inline void play_dead(void)
++{
++      smp_ops.play_dead();
++}
++
++static inline void smp_send_reschedule(int cpu)
++{
++      smp_ops.smp_send_reschedule(cpu);
++}
++
++static inline void arch_send_call_function_single_ipi(int cpu)
++{
++      smp_ops.send_call_func_single_ipi(cpu);
++}
++
++static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
++{
++      smp_ops.send_call_func_ipi(mask);
++}
++
++void cpu_disable_common(void);
++void native_smp_prepare_boot_cpu(void);
++void native_smp_prepare_cpus(unsigned int max_cpus);
++void native_smp_cpus_done(unsigned int max_cpus);
++int native_cpu_up(unsigned int cpunum);
++int native_cpu_disable(void);
++void native_cpu_die(unsigned int cpu);
++void native_play_dead(void);
++void play_dead_common(void);
++void wbinvd_on_cpu(int cpu);
++int wbinvd_on_all_cpus(void);
++
++void smp_store_cpu_info(int id);
++#define cpu_physical_id(cpu)  per_cpu(x86_cpu_to_apicid, cpu)
++
++#else /* CONFIG_XEN */
++
++extern int __cpu_disable(void);
++extern void __cpu_die(unsigned int cpu);
++void xen_stop_other_cpus(int wait);
++void xen_smp_send_reschedule(int cpu);
++void xen_send_call_func_ipi(const struct cpumask *mask);
++void xen_send_call_func_single_ipi(int cpu);
++
++/* CONFIG_XEN variant: calls xen_stop_other_cpus() directly with wait == 0. */
++static inline void smp_send_stop(void)
++{
++      xen_stop_other_cpus(0);
++}
++
++#define smp_send_reschedule   xen_smp_send_reschedule
++#define arch_send_call_function_single_ipi    xen_send_call_func_single_ipi
++#define arch_send_call_function_ipi_mask      xen_send_call_func_ipi
++
++void play_dead(void);
++
++#endif /* CONFIG_XEN */
++
++/* We don't mark CPUs online until __cpu_up(), so we need another measure */
++static inline int num_booting_cpus(void)
++{
++      return cpumask_weight(cpu_callout_mask);
++}
++#elif /* !CONFIG_SMP && */ !defined(CONFIG_XEN)
++#define wbinvd_on_cpu(cpu)     wbinvd()
++static inline int wbinvd_on_all_cpus(void)
++{
++      wbinvd();
++      return 0;
++}
++#endif /* CONFIG_SMP */
++
++#ifdef CONFIG_XEN
++int wbinvd_on_all_cpus(void);
++#endif
++
++extern unsigned disabled_cpus __cpuinitdata;
++
++#include <asm/smp-processor-id.h>
++
++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
++
++#ifndef CONFIG_X86_64
++static inline int logical_smp_processor_id(void)
++{
++      /* we don't want to mark this access volatile - bad code generation */
++      return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
++}
++
++#endif
++
++extern int hard_smp_processor_id(void);
++
++#else /* CONFIG_X86_LOCAL_APIC */
++
++# ifndef CONFIG_SMP
++#  define hard_smp_processor_id()     0
++# endif
++
++#endif /* CONFIG_X86_LOCAL_APIC */
++
++#endif /* __ASSEMBLY__ */
++#endif /* _ASM_X86_SMP_H */
diff --cc arch/x86/include/mach-xen/asm/spinlock.h

index 0000000,0000000..a19814d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/spinlock.h
@@@ -1,0 -1,0 +1,453 @@@
++#ifndef _ASM_X86_SPINLOCK_H
++#define _ASM_X86_SPINLOCK_H
++
++#include <asm/atomic.h>
++#include <asm/rwlock.h>
++#include <asm/page.h>
++#include <asm/processor.h>
++#include <linux/compiler.h>
++
++/*
++ * Your basic SMP spinlocks, allowing only a single CPU anywhere
++ *
++ * Simple spin lock operations.  There are two variants, one clears IRQ's
++ * on the local processor, one does not.
++ *
++ * These are fair FIFO ticket locks, which are currently limited to 256
++ * CPUs.
++ *
++ * (the type definitions are in asm/spinlock_types.h)
++ */
++
++#ifdef CONFIG_X86_32
++# define LOCK_PTR_REG "a"
++# define REG_PTR_MODE "k"
++#else
++# define LOCK_PTR_REG "D"
++# define REG_PTR_MODE "q"
++#endif
++
++#if defined(CONFIG_X86_32) && \
++      (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
++/*
++ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
++ * (PPro errata 66, 92)
++ */
++# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
++#else
++# define UNLOCK_LOCK_PREFIX
++#endif
++
++#ifdef TICKET_SHIFT
++
++#include <asm/irqflags.h>
++#include <asm/smp-processor-id.h>
++
++int xen_spinlock_init(unsigned int cpu);
++void xen_spinlock_cleanup(unsigned int cpu);
++unsigned int xen_spin_wait(arch_spinlock_t *, unsigned int *token,
++                         unsigned int flags);
++unsigned int xen_spin_adjust(const arch_spinlock_t *, unsigned int token);
++void xen_spin_kick(arch_spinlock_t *, unsigned int token);
++
++/*
++ * Ticket locks are conceptually two parts, one indicating the current head of
++ * the queue, and the other indicating the current tail. The lock is acquired
++ * by atomically noting the tail and incrementing it by one (thus adding
++ * ourself to the queue and noting our position), then waiting until the head
++ * becomes equal to the initial value of the tail.
++ *
++ * We use an xadd covering *both* parts of the lock, to increment the tail and
++ * also load the position of the head, which takes care of memory ordering
++ * issues and should be optimal for the uncontended case. Note the tail must be
++ * in the high part, because a wide xadd increment of the low part would carry
++ * up and contaminate the high part.
++ *
++ * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
++ * save some instructions and make the code more elegant. There really isn't
++ * much between them in performance though, especially as locks are out of line.
++ */
++#if TICKET_SHIFT == 8
++/*
++ * The three macros below are statement fragments; they use the variables
++ * `lock`, `token`, `free`, `count` and `kick` of the function that
++ * expands them.
++ *
++ * __ticket_spin_lock_preamble: a locked xaddw adds 0x0100 to lock->slock
++ * (i.e. increments the high byte) and leaves the previous 16-bit value in
++ * `token`.  `free` is set when the previous high and low bytes were equal.
++ */
++#define __ticket_spin_lock_preamble \
++      asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \
++          "cmpb %h0, %b0\n\t" \
++          "sete %1" \
++          : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \
++          : "0" (0x0100) \
++          : "memory", "cc")
++/*
++ * __ticket_spin_lock_body: loop until the low byte of `token` equals its
++ * high byte, or until `count` (decremented once per iteration) reaches
++ * zero.  Each iteration pauses with "rep ; nop" and reloads the low byte of
++ * `token` from lock->slock.
++ */
++#define __ticket_spin_lock_body \
++      asm("1:\t" \
++          "cmpb %h0, %b0\n\t" \
++          "je 2f\n\t" \
++          "decl %1\n\t" \
++          "jz 2f\n\t" \
++          "rep ; nop\n\t" \
++          "movb %2, %b0\n\t" \
++          /* don't need lfence here, because loads are in-order */ \
++          "jmp 1b\n" \
++          "2:" \
++          : "+Q" (token), "+g" (count) \
++          : "m" (lock->slock) \
++          : "memory", "cc")
++/*
++ * __ticket_spin_unlock_body: increment the low byte of lock->slock, then
++ * load the 16-bit lock word (zero-extended) into `token`.  `kick` is set
++ * when the high and low bytes of the value just read differ.
++ */
++#define __ticket_spin_unlock_body \
++      asm(UNLOCK_LOCK_PREFIX "incb %2\n\t" \
++          "movzwl %2, %0\n\t" \
++          "cmpb %h0, %b0\n\t" \
++          "setne %1" \
++          : "=&Q" (token), "=qm" (kick), "+m" (lock->slock) \
++          : \
++          : "memory", "cc")
++
++/*
++ * Try to take the ticket lock without spinning (TICKET_SHIFT == 8 layout).
++ *
++ * Reads the 16-bit lock word; if its high and low bytes differ the attempt
++ * fails at once.  Otherwise a locked cmpxchgw tries to replace the value
++ * read with that value + 0x100.  On success the calling CPU is recorded in
++ * lock->owner.
++ *
++ * Returns 1 if the lock was acquired, 0 otherwise.
++ */
++static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
++{
++      int tmp, new;
++
++      /*
++       * leal does not modify the flags, so the jne still tests the cmpb
++       * result; at label 1 ZF comes either from the failed cmpb (clear) or
++       * from cmpxchgw (set on success).
++       */
++      asm("movzwl %2, %0\n\t"
++          "cmpb %h0, %b0\n\t"
++          "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
++          "jne 1f\n\t"
++          LOCK_PREFIX "cmpxchgw %w1, %2\n\t"
++          "1:\t"
++          "sete %b1\n\t"
++          "movzbl %b1, %0\n\t"
++          : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
++          :
++          : "memory", "cc");
++
++      if (tmp)
++              lock->owner = raw_smp_processor_id();
++
++      return tmp;
++}
++#elif TICKET_SHIFT == 16
++/*
++ * TICKET_SHIFT == 16 variants of the lock/unlock fragments.  They use the
++ * variables `lock`, `token`, `free`, `count` and `kick` of the expanding
++ * function and declare a private scratch `tmp`.  In each of them
++ * "shldl $16" shifts the high word of `token` into the low word of `tmp`,
++ * so the following cmpw compares the two 16-bit halves of `token`.
++ *
++ * __ticket_spin_lock_preamble: a locked xaddl adds 0x00010000 to
++ * lock->slock (i.e. increments the high word), leaving the previous value
++ * in `token`; `free` is set when the previous halves were equal.
++ */
++#define __ticket_spin_lock_preamble \
++      do { \
++              unsigned int tmp; \
++              asm(LOCK_PREFIX "xaddl %0, %2\n\t" \
++                  "shldl $16, %0, %3\n\t" \
++                  "cmpw %w3, %w0\n\t" \
++                  "sete %1" \
++                  : "=&r" (token), "=qm" (free), "+m" (lock->slock), \
++                    "=&g" (tmp) \
++                  : "0" (0x00010000) \
++                  : "memory", "cc"); \
++      } while (0)
++/*
++ * __ticket_spin_lock_body: loop until the low word of `token` equals its
++ * high word, or until `count` (decremented once per iteration) reaches
++ * zero.  Each iteration pauses with "rep ; nop" and reloads the low word of
++ * `token` from lock->slock.
++ */
++#define __ticket_spin_lock_body \
++      do { \
++              unsigned int tmp; \
++              asm("shldl $16, %0, %2\n" \
++                  "1:\t" \
++                  "cmpw %w2, %w0\n\t" \
++                  "je 2f\n\t" \
++                  "decl %1\n\t" \
++                  "jz 2f\n\t" \
++                  "rep ; nop\n\t" \
++                  "movw %3, %w0\n\t" \
++                  /* don't need lfence here, because loads are in-order */ \
++                  "jmp 1b\n" \
++                  "2:" \
++                  : "+r" (token), "+g" (count), "=&g" (tmp) \
++                  : "m" (lock->slock) \
++                  : "memory", "cc"); \
++      } while (0)
++/*
++ * __ticket_spin_unlock_body: increment the low word of lock->slock, then
++ * load the full lock word into `token`.  `kick` is set when the two halves
++ * of the value just read differ.
++ */
++#define __ticket_spin_unlock_body \
++      do { \
++              unsigned int tmp; \
++              asm(UNLOCK_LOCK_PREFIX "incw %2\n\t" \
++                  "movl %2, %0\n\t" \
++                  "shldl $16, %0, %3\n\t" \
++                  "cmpw %w3, %w0\n\t" \
++                  "setne %1" \
++                  : "=&r" (token), "=qm" (kick), "+m" (lock->slock), \
++                    "=&r" (tmp) \
++                  : \
++                  : "memory", "cc"); \
++      } while (0)
++
++/*
++ * Try to take the ticket lock without spinning (TICKET_SHIFT == 16 layout).
++ *
++ * Reads the 32-bit lock word and compares it with itself rotated by 16
++ * bits; the two are equal only when both 16-bit halves match.  In that
++ * case a locked cmpxchgl tries to replace the value read with that value +
++ * 0x00010000.  On success the calling CPU is recorded in lock->owner.
++ *
++ * Returns 1 if the lock was acquired, 0 otherwise.
++ */
++static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
++{
++      int tmp;
++      int new;
++
++      /*
++       * leal does not modify the flags, so the jne still tests the cmpl
++       * result; at label 1 ZF comes either from the failed cmpl (clear) or
++       * from cmpxchgl (set on success).
++       */
++      asm("movl %2, %0\n\t"
++          "movl %0, %1\n\t"
++          "roll $16, %0\n\t"
++          "cmpl %0, %1\n\t"
++          "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
++          "jne 1f\n\t"
++          LOCK_PREFIX "cmpxchgl %1, %2\n"
++          "1:\t"
++          "sete %b1\n\t"
++          "movzbl %b1, %0\n\t"
++          : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
++          :
++          : "memory", "cc");
++
++      if (tmp)
++              lock->owner = raw_smp_processor_id();
++
++      return tmp;
++}
++#endif
++
++#define __ticket_spin_count(lock) (vcpu_running((lock)->owner) ? 1 << 10 : 1)
++
++/*
++ * Non-zero when, in a single snapshot of lock->slock, the TICKET_SHIFT-bit
++ * field starting at bit TICKET_SHIFT differs from the field at bit 0.
++ */
++static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
++{
++      const int snapshot = ACCESS_ONCE(lock->slock);
++      const int mask = (1 << TICKET_SHIFT) - 1;
++
++      return (((snapshot >> TICKET_SHIFT) ^ snapshot) & mask) != 0;
++}
++
++/*
++ * Non-zero when, in a single snapshot of lock->slock, the upper ticket
++ * field is more than one ahead of the lower one (modulo 2^TICKET_SHIFT).
++ */
++static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
++{
++      const int snapshot = ACCESS_ONCE(lock->slock);
++      const int mask = (1 << TICKET_SHIFT) - 1;
++      const int distance = ((snapshot >> TICKET_SHIFT) - snapshot) & mask;
++
++      return distance > 1;
++}
++
++/*
++ * Acquire the ticket lock.
++ *
++ * The ticket is taken with interrupts disabled.  If the lock was free the
++ * saved interrupt state is restored and the function is done.  Otherwise
++ * the token is passed through xen_spin_adjust() (still with interrupts
++ * off), interrupts are restored, and the CPU spins for the budget given by
++ * __ticket_spin_count().  Whenever the budget runs out xen_spin_wait() is
++ * called; a non-zero return value becomes the budget for another round of
++ * spinning, a zero return ends the loop.
++ * NOTE(review): a zero return from xen_spin_wait() presumably means the
++ * lock has been obtained there - confirm against its implementation.
++ *
++ * On return lock->owner holds the calling CPU's number.
++ */
++static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
++{
++      unsigned int token, count;
++      unsigned int flags = arch_local_irq_save();
++      bool free;
++
++      /* sets `token` and `free` */
++      __ticket_spin_lock_preamble;
++      if (likely(free))
++              arch_local_irq_restore(flags);
++      else {
++              token = xen_spin_adjust(lock, token);
++              arch_local_irq_restore(flags);
++              count = __ticket_spin_count(lock);
++              do {
++                      /* leaves count == 0 only if the budget ran out */
++                      __ticket_spin_lock_body;
++              } while (unlikely(!count)
++                       && (count = xen_spin_wait(lock, &token, flags)));
++      }
++      lock->owner = raw_smp_processor_id();
++}
++
++/*
++ * Acquire the ticket lock on behalf of a caller that supplies its saved
++ * interrupt state in @flags.
++ *
++ * Unlike __ticket_spin_lock() this function does not save or restore the
++ * interrupt state itself; @flags is only handed on to xen_spin_wait().
++ * The spin/wait loop is otherwise the same: spin for the budget given by
++ * __ticket_spin_count(), and when it runs out call xen_spin_wait(), whose
++ * non-zero return value is the budget for another round.
++ *
++ * On return lock->owner holds the calling CPU's number.
++ */
++static __always_inline void __ticket_spin_lock_flags(arch_spinlock_t *lock,
++                                                   unsigned long flags)
++{
++      unsigned int token, count;
++      bool free;
++
++      /* sets `token` and `free` */
++      __ticket_spin_lock_preamble;
++      if (unlikely(!free)) {
++              token = xen_spin_adjust(lock, token);
++              count = __ticket_spin_count(lock);
++              do {
++                      /* leaves count == 0 only if the budget ran out */
++                      __ticket_spin_lock_body;
++              } while (unlikely(!count)
++                       && (count = xen_spin_wait(lock, &token, flags)));
++      }
++      lock->owner = raw_smp_processor_id();
++}
++
++/*
++ * Release the ticket lock.  __ticket_spin_unlock_body advances the lower
++ * ticket field and sets `kick` when the two fields still differ afterwards;
++ * in that case xen_spin_kick() is called with the lock word just read.
++ */
++static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
++{
++      unsigned int token;
++      bool kick;
++
++      /* sets `token` and `kick` */
++      __ticket_spin_unlock_body;
++      if (kick)
++              xen_spin_kick(lock, token);
++}
++
++#ifndef XEN_SPINLOCK_SOURCE
++#undef __ticket_spin_lock_preamble
++#undef __ticket_spin_lock_body
++#undef __ticket_spin_unlock_body
++#undef __ticket_spin_count
++#endif
++
++#define __arch_spin(n) __ticket_spin_##n
++
++#else /* TICKET_SHIFT */
++
++static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
++static inline void xen_spinlock_cleanup(unsigned int cpu) {}
++
++static inline int __byte_spin_is_locked(arch_spinlock_t *lock)
++{
++      return lock->lock != 0;
++}
++
++static inline int __byte_spin_is_contended(arch_spinlock_t *lock)
++{
++      return lock->spinners != 0;
++}
++
++/*
++ * Acquire the byte lock.
++ *
++ * Exchanges 1 into lock->lock; if the previous value was 0 the lock is now
++ * held.  Otherwise lock->spinners is atomically incremented, the CPU
++ * pauses ("rep;nop") while lock->lock reads 1, then lock->spinners is
++ * decremented again and the exchange is retried.
++ */
++static inline void __byte_spin_lock(arch_spinlock_t *lock)
++{
++      s8 val = 1;
++
++      asm("1: xchgb %1, %0\n"
++          "   test %1,%1\n"
++          "   jz 3f\n"
++          "   " LOCK_PREFIX "incb %2\n"
++          "2: rep;nop\n"
++          "   cmpb $1, %0\n"
++          "   je 2b\n"
++          "   " LOCK_PREFIX "decb %2\n"
++          "   jmp 1b\n"
++          "3:"
++          : "+m" (lock->lock), "+q" (val), "+m" (lock->spinners): : "memory");
++}
++
++#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
++
++/*
++ * Single attempt at the byte lock: exchange 1 into lock->lock and report
++ * whether the previous value was 0.  Returns 1 on success, 0 otherwise.
++ */
++static inline int __byte_spin_trylock(arch_spinlock_t *lock)
++{
++      u8 old = 1;
++
++      asm("xchgb %1,%0"
++          : "+m" (lock->lock), "+q" (old) : : "memory");
++
++      return old == 0;
++}
++
++/*
++ * Release the byte lock: issue a write barrier so earlier stores are
++ * ordered before the store that clears lock->lock.
++ */
++static inline void __byte_spin_unlock(arch_spinlock_t *lock)
++{
++      smp_wmb();
++      lock->lock = 0;
++}
++
++#define __arch_spin(n) __byte_spin_##n
++
++#endif /* TICKET_SHIFT */
++
++/*
++ * Generic arch_spin_*() entry points.  Each one forwards to the
++ * implementation selected by __arch_spin(n): __ticket_spin_<n> when
++ * TICKET_SHIFT is defined, __byte_spin_<n> otherwise.
++ */
++static inline int arch_spin_is_locked(arch_spinlock_t *lock)
++{
++      return __arch_spin(is_locked)(lock);
++}
++
++static inline int arch_spin_is_contended(arch_spinlock_t *lock)
++{
++      return __arch_spin(is_contended)(lock);
++}
++/* self-referential define so the name can be tested with #ifdef */
++#define arch_spin_is_contended        arch_spin_is_contended
++
++static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
++{
++      __arch_spin(lock)(lock);
++}
++
++static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
++{
++      return __arch_spin(trylock)(lock);
++}
++
++static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
++{
++      __arch_spin(unlock)(lock);
++}
++
++static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
++                                                unsigned long flags)
++{
++      __arch_spin(lock_flags)(lock, flags);
++}
++
++#undef __arch_spin
++
++/*
++ * Busy-wait, calling cpu_relax() between polls, until
++ * arch_spin_is_locked() reports the lock as not held.  The lock itself is
++ * not taken.
++ */
++static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
++{
++      while (arch_spin_is_locked(lock))
++              cpu_relax();
++}
++
++/*
++ * Read-write spinlocks, allowing multiple readers
++ * but only one writer.
++ *
++ * NOTE! it is quite common to have readers in interrupts
++ * but no interrupt writers. For those circumstances we
++ * can "mix" irq-safe locks - any writer needs to get a
++ * irq-safe write-lock, but readers can get non-irqsafe
++ * read-locks.
++ *
++ * On x86, we implement read-write locks as a 32-bit counter
++ * with the high bit (sign) being the "contended" bit.
++ */
++
++/**
++ * arch_read_can_lock - would arch_read_trylock() succeed?
++ * @lock: the rwlock in question.
++ *
++ * Returns non-zero when the lock word, read as a signed int, is positive.
++ */
++static inline int arch_read_can_lock(arch_rwlock_t *lock)
++{
++      return (int)(lock)->lock > 0;
++}
++
++/**
++ * arch_write_can_lock - would arch_write_trylock() succeed?
++ * @lock: the rwlock in question.
++ *
++ * Returns non-zero when the lock word equals RW_LOCK_BIAS.
++ */
++static inline int arch_write_can_lock(arch_rwlock_t *lock)
++{
++      return (lock)->lock == RW_LOCK_BIAS;
++}
++
++/*
++ * Take the rwlock for reading: atomically subtract 1 from the lock word.
++ * If the result is negative, __read_lock_failed is called with @rw in the
++ * register named by LOCK_PTR_REG.
++ */
++static inline void arch_read_lock(arch_rwlock_t *rw)
++{
++      asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
++                   "jns 1f\n"
++                   "call __read_lock_failed\n\t"
++                   "1:\n"
++                   ::LOCK_PTR_REG (rw) : "memory");
++}
++
++/*
++ * Take the rwlock for writing: atomically subtract RW_LOCK_BIAS from the
++ * lock word.  If the result is not zero, __write_lock_failed is called with
++ * @rw in the register named by LOCK_PTR_REG.
++ */
++static inline void arch_write_lock(arch_rwlock_t *rw)
++{
++      asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
++                   "jz 1f\n"
++                   "call __write_lock_failed\n\t"
++                   "1:\n"
++                   ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
++}
++
++/*
++ * Single attempt to take the rwlock for reading.  The lock word is
++ * atomically decremented; if that makes it negative the decrement is
++ * undone and the attempt fails.  Returns 1 on success, 0 on failure.
++ */
++static inline int arch_read_trylock(arch_rwlock_t *lock)
++{
++      atomic_t *counter = (atomic_t *)lock;
++
++      if (atomic_dec_return(counter) < 0) {
++              /* back out our decrement */
++              atomic_inc(counter);
++              return 0;
++      }
++
++      return 1;
++}
++
++/*
++ * Single attempt to take the rwlock for writing.  RW_LOCK_BIAS is
++ * atomically subtracted from the lock word; unless the result is exactly
++ * zero the bias is added back and the attempt fails.  Returns 1 on
++ * success, 0 on failure.
++ */
++static inline int arch_write_trylock(arch_rwlock_t *lock)
++{
++      atomic_t *counter = (atomic_t *)lock;
++
++      if (!atomic_sub_and_test(RW_LOCK_BIAS, counter)) {
++              /* back out our subtraction */
++              atomic_add(RW_LOCK_BIAS, counter);
++              return 0;
++      }
++
++      return 1;
++}
++
++/* Drop a read hold: atomically increment the lock word. */
++static inline void arch_read_unlock(arch_rwlock_t *rw)
++{
++      asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
++}
++
++/* Drop the write hold: atomically add RW_LOCK_BIAS back to the lock word. */
++static inline void arch_write_unlock(arch_rwlock_t *rw)
++{
++      asm volatile(LOCK_PREFIX "addl %1, %0"
++                   : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
++}
++
++#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
++#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
++
++#define arch_spin_relax(lock) cpu_relax()
++#define arch_read_relax(lock) cpu_relax()
++#define arch_write_relax(lock)        cpu_relax()
++
++/* The {read|write|spin}_lock() on x86 are full memory barriers. */
++static inline void smp_mb__after_lock(void) { }
++#define ARCH_HAS_SMP_MB_AFTER_LOCK
++
++#endif /* _ASM_X86_SPINLOCK_H */
diff --cc arch/x86/include/mach-xen/asm/spinlock_types.h

index 0000000,0000000..3c49514

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/spinlock_types.h
@@@ -1,0 -1,0 +1,60 @@@
++#ifndef _ASM_X86_SPINLOCK_TYPES_H
++#define _ASM_X86_SPINLOCK_TYPES_H
++
++#ifndef __LINUX_SPINLOCK_TYPES_H
++# error "please don't include this file directly"
++#endif
++
++#include <asm/types.h>
++
++typedef union {
++      unsigned int slock;
++      struct {
++/*
++ * Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
++ */
++#if CONFIG_XEN_COMPAT >= 0x030200
++/*
++ * On Xen we support a single level of interrupt re-enabling per lock. Hence
++ * we can have twice as many outstanding tickets. Thus the cut-off for using
++ * byte register pairs must be at half the number of CPUs.
++ */
++#if 2 * CONFIG_NR_CPUS < 256
++# define TICKET_SHIFT 8
++              u8 cur, seq;
++#else
++# define TICKET_SHIFT 16
++              u16 cur, seq;
++#endif
++#if CONFIG_NR_CPUS <= 256
++              u8 owner;
++#else
++              u16 owner;
++#endif
++#else
++/*
++ * This differs from the pre-2.6.24 spinlock by always using xchgb
++ * rather than decb to take the lock; this allows it to use a
++ * zero-initialized lock structure.  It also maintains a 1-byte
++ * contention counter, so that we can implement
++ * __byte_spin_is_contended.
++ */
++              u8 lock;
++#if CONFIG_NR_CPUS < 256
++              u8 spinners;
++#else
++# error NR_CPUS >= 256 not implemented
++#endif
++#endif
++      };
++} arch_spinlock_t;
++
++#define __ARCH_SPIN_LOCK_UNLOCKED     { 0 }
++
++typedef struct {
++      unsigned int lock;
++} arch_rwlock_t;
++
++#define __ARCH_RW_LOCK_UNLOCKED               { RW_LOCK_BIAS }
++
++#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --cc arch/x86/include/mach-xen/asm/swiotlb.h

index 0000000,0000000..e82aad1

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/swiotlb.h
@@@ -1,0 -1,0 +1,8 @@@
++#include_next <asm/swiotlb.h>
++
++#ifndef CONFIG_SWIOTLB
++#define swiotlb_init(verbose) ((void)(verbose))
++#endif
++
++dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size,
++                                 int dir);
diff --cc arch/x86/include/mach-xen/asm/synch_bitops.h

index 0000000,0000000..be5f59b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/synch_bitops.h
@@@ -1,0 -1,0 +1,126 @@@
++#ifndef __XEN_SYNCH_BITOPS_H__
++#define __XEN_SYNCH_BITOPS_H__
++
++/*
++ * Copyright 1992, Linus Torvalds.
++ * Heavily modified to provide guaranteed strong synchronisation
++ * when communicating with Xen or other guest OSes running on other CPUs.
++ */
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++#define ADDR (*(volatile long *) addr)
++
++/*
++ * synch_{set,clear,change}_bit: set, clear or toggle bit @nr of the memory
++ * at @addr using btsl/btrl/btcl.  The "lock" prefix is written literally
++ * into the instruction string rather than coming from a macro.
++ */
++static __inline__ void synch_set_bit(int nr, volatile void * addr)
++{
++    __asm__ __volatile__ ( 
++        "lock btsl %1,%0"
++        : "+m" (ADDR) : "Ir" (nr) : "memory" );
++}
++
++static __inline__ void synch_clear_bit(int nr, volatile void * addr)
++{
++    __asm__ __volatile__ (
++        "lock btrl %1,%0"
++        : "+m" (ADDR) : "Ir" (nr) : "memory" );
++}
++
++static __inline__ void synch_change_bit(int nr, volatile void * addr)
++{
++    __asm__ __volatile__ (
++        "lock btcl %1,%0"
++        : "+m" (ADDR) : "Ir" (nr) : "memory" );
++}
++
++/*
++ * synch_test_and_{set,clear,change}_bit: like the plain variants, but also
++ * report the bit's previous state.  The locked bts/btr/btc leaves the old
++ * bit in the carry flag and "sbbl %0,%0" turns that into the return value:
++ * 0 if the bit was clear, -1 (non-zero) if it was set.
++ */
++static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
++{
++    int oldbit;
++    __asm__ __volatile__ (
++        "lock btsl %2,%1\n\tsbbl %0,%0"
++        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
++    return oldbit;
++}
++
++static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
++{
++    int oldbit;
++    __asm__ __volatile__ (
++        "lock btrl %2,%1\n\tsbbl %0,%0"
++        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
++    return oldbit;
++}
++
++static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
++{
++    int oldbit;
++
++    __asm__ __volatile__ (
++        "lock btcl %2,%1\n\tsbbl %0,%0"
++        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
++    return oldbit;
++}
++
++/*
++ * Dummy aggregate used only to form the "m" operand of the cmpxchg asm
++ * statements below from an untyped pointer.
++ */
++struct __synch_xchg_dummy { unsigned long a[100]; };
++#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
++
++/*
++ * Type-generic front end: dispatches on sizeof(*(ptr)) and casts the
++ * previous memory value back to the pointee type.
++ */
++#define synch_cmpxchg(ptr, old, new) \
++((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
++                                     (unsigned long)(old), \
++                                     (unsigned long)(new), \
++                                     sizeof(*(ptr))))
++
++/*
++ * Locked compare-and-exchange on the @size-byte object at @ptr: if it
++ * holds @old it is replaced by @new.  Returns the value found in memory
++ * (equal to @old on success).  The "lock" prefix is written literally into
++ * each instruction string.
++ *
++ * Sizes 1, 2 and 4 are handled everywhere, size 8 only with
++ * CONFIG_X86_64.  For any other @size nothing is done and @old is
++ * returned.
++ */
++static inline unsigned long __synch_cmpxchg(volatile void *ptr,
++                                          unsigned long old,
++                                          unsigned long new, int size)
++{
++      /* stays equal to @old when @size matches no case below */
++      unsigned long prev = old;
++
++      switch (size) {
++      case 1:
++              __asm__ __volatile__("lock; cmpxchgb %b1,%2"
++                                   : "=a"(prev)
++                                   : "q"(new), "m"(*__synch_xg(ptr)),
++                                     "0"(old)
++                                   : "memory");
++              break;
++      case 2:
++              __asm__ __volatile__("lock; cmpxchgw %w1,%2"
++                                   : "=a"(prev)
++                                   : "r"(new), "m"(*__synch_xg(ptr)),
++                                     "0"(old)
++                                   : "memory");
++              break;
++      case 4:
++              /* %k1 names the 32-bit register on both 32- and 64-bit */
++              __asm__ __volatile__("lock; cmpxchgl %k1,%2"
++                                   : "=a"(prev)
++                                   : "r"(new), "m"(*__synch_xg(ptr)),
++                                     "0"(old)
++                                   : "memory");
++              break;
++#ifdef CONFIG_X86_64
++      case 8:
++              __asm__ __volatile__("lock; cmpxchgq %1,%2"
++                                   : "=a"(prev)
++                                   : "r"(new), "m"(*__synch_xg(ptr)),
++                                     "0"(old)
++                                   : "memory");
++              break;
++#endif
++      }
++
++      return prev;
++}
++
++#define synch_test_bit test_bit
++
++#define synch_cmpxchg_subword synch_cmpxchg
++
++#endif /* __XEN_SYNCH_BITOPS_H__ */
diff --cc arch/x86/include/mach-xen/asm/system.h

index 0000000,0000000..a7890a5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/system.h
@@@ -1,0 -1,0 +1,459 @@@
++#ifndef _ASM_X86_SYSTEM_H
++#define _ASM_X86_SYSTEM_H
++
++#include <asm/asm.h>
++#include <asm/segment.h>
++#include <asm/cpufeature.h>
++#include <asm/cmpxchg.h>
++#include <asm/nops.h>
++#include <asm/hypervisor.h>
++
++#include <linux/kernel.h>
++#include <linux/irqflags.h>
++
++/* entries in ARCH_DLINFO: */
++#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
++# define AT_VECTOR_SIZE_ARCH 2
++#else /* else it's non-compat x86-64 */
++# define AT_VECTOR_SIZE_ARCH 1
++#endif
++
++struct task_struct; /* one of the stranger aspects of C forward declarations */
++struct task_struct *__switch_to(struct task_struct *prev,
++                              struct task_struct *next);
++void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
++extern void show_regs_common(void);
++
++#ifdef CONFIG_X86_32
++
++#ifdef CONFIG_CC_STACKPROTECTOR
++#define __switch_canary                                                       \
++      "movl %P[task_canary](%[next]), %%ebx\n\t"                      \
++      "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
++#define __switch_canary_oparam                                                \
++      , [stack_canary] "=m" (stack_canary.canary)
++#define __switch_canary_iparam                                                \
++      , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
++#else /* CC_STACKPROTECTOR */
++#define __switch_canary
++#define __switch_canary_oparam
++#define __switch_canary_iparam
++#endif        /* CC_STACKPROTECTOR */
++
++/*
++ * Saving eflags is important. It switches not only IOPL between tasks,
++ * it also protects other tasks from NT leaking through sysenter etc.
++ */
++#define switch_to(prev, next, last)                                   \
++do {                                                                  \
++      /*                                                              \
++       * Context-switching clobbers all registers, so we clobber      \
++       * them explicitly, via unused output variables.                \
++       * (EAX and EBP are not listed because EBP is saved/restored    \
++       * explicitly for wchan access and EAX is the return value of   \
++       * __switch_to())                                               \
++       */                                                             \
++      unsigned long ebx, ecx, edx, esi, edi;                          \
++                                                                      \
++      asm volatile("pushfl\n\t"               /* save    flags */     \
++                   "pushl %%ebp\n\t"          /* save    EBP   */     \
++                   "movl %%esp,%[prev_sp]\n\t"        /* save    ESP   */ \
++                   "movl %[next_sp],%%esp\n\t"        /* restore ESP   */ \
++                   "movl $1f,%[prev_ip]\n\t"  /* save    EIP   */     \
++                   "pushl %[next_ip]\n\t"     /* restore EIP   */     \
++                   __switch_canary                                    \
++                   "jmp __switch_to\n"        /* regparm call  */     \
++                   "1:\t"                     /* resume label  */     \
++                   "popl %%ebp\n\t"           /* restore EBP   */     \
++                   "popfl\n"                  /* restore flags */     \
++                                                                      \
++                   /* output parameters */                            \
++                   : [prev_sp] "=m" (prev->thread.sp),                \
++                     [prev_ip] "=m" (prev->thread.ip),                \
++                     "=a" (last),                                     \
++                                                                      \
++                     /* clobbered output registers: */                \
++                     "=b" (ebx), "=c" (ecx), "=d" (edx),              \
++                     "=S" (esi), "=D" (edi)                           \
++                                                                      \
++                     __switch_canary_oparam                           \
++                                                                      \
++                     /* input parameters: */                          \
++                   : [next_sp]  "m" (next->thread.sp),                \
++                     [next_ip]  "m" (next->thread.ip),                \
++                                                                      \
++                     /* regparm parameters for __switch_to(): */      \
++                     [prev]     "a" (prev),                           \
++                     [next]     "d" (next)                            \
++                                                                      \
++                     __switch_canary_iparam                           \
++                                                                      \
++                   : /* reloaded segment registers */                 \
++                      "memory");                                      \
++} while (0)
++
++#ifndef CONFIG_XEN
++/*
++ * disable hlt during certain critical i/o operations
++ */
++#define HAVE_DISABLE_HLT
++#endif
++#else
++
++/* frame pointer must be last for get_wchan */
++#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
++#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
++
++#define __EXTRA_CLOBBER  \
++      , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
++        "r12", "r13", "r14", "r15"
++
++#ifdef CONFIG_CC_STACKPROTECTOR
++#define __switch_canary                                                         \
++      "movq %P[task_canary](%%rsi),%%r8\n\t"                            \
++      "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
++#define __switch_canary_oparam                                                  \
++      , [gs_canary] "=m" (irq_stack_union.stack_canary)
++#define __switch_canary_iparam                                                  \
++      , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
++#else /* CC_STACKPROTECTOR */
++#define __switch_canary
++#define __switch_canary_oparam
++#define __switch_canary_iparam
++#endif        /* CC_STACKPROTECTOR */
++
++/* The stack unwind code needs this but it pollutes traces otherwise */
++#ifdef CONFIG_UNWIND_INFO
++#define THREAD_RETURN_SYM \
++      ".globl thread_return\n" \
++      "thread_return:\n\t"
++#else
++#define THREAD_RETURN_SYM
++#endif
++
++/* Save and restore flags so that a leaked NT flag is cleared */
++#define switch_to(prev, next, last) \
++      asm volatile(SAVE_CONTEXT                                         \
++           "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
++           "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
++           "call __switch_to\n\t"       /* return value arrives in RAX */ \
++           THREAD_RETURN_SYM                                            \
++           "movq "__percpu_arg([current_task])",%%rsi\n\t"              \
++           __switch_canary                                              \
++           "movq %P[thread_info](%%rsi),%%r8\n\t" /* R8 = RSI->stack */ \
++           "movq %%rax,%%rdi\n\t"       /* RDI = __switch_to() result */ \
++           "testl  %[_tif_fork],%P[ti_flags](%%r8)\n\t" /* _TIF_FORK? */ \
++           "jnz   ret_from_fork\n\t"    /* set: leave via ret_from_fork */ \
++           RESTORE_CONTEXT                                              \
++           : "=a" (last)                                                \
++             __switch_canary_oparam                                     \
++           : [next] "S" (next), [prev] "D" (prev),                      \
++             [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
++             [ti_flags] "i" (offsetof(struct thread_info, flags)),      \
++             [_tif_fork] "i" (_TIF_FORK),                               \
++             [thread_info] "i" (offsetof(struct task_struct, stack)),   \
++             [current_task] "m" (current_task)                          \
++             __switch_canary_iparam                                     \
++           : "memory", "cc" __EXTRA_CLOBBER)
++#endif
++
++#ifdef __KERNEL__
++
++extern void xen_load_gs_index(unsigned);
++
++/*
++ * Load a segment register. If the load faults, fall back
++ * to loading the zero (null) selector instead.
++ */
++#define loadsegment(seg, value)                                               \
++do {                                                                  \
++      unsigned short __val = (value);                                 \
++                                                                      \
++      asm volatile("                                          \n"     \
++                   "1:        movl %k0,%%" #seg "             \n"     \
++                                                                      \
++                   ".section .fixup,\"ax\"                    \n"     \
++                   "2:        xorl %k0,%k0                    \n"     \
++                   "          jmp 1b                          \n"     \
++                   ".previous                                 \n"     \
++                                                                      \
++                   _ASM_EXTABLE(1b, 2b)                               \
++                                                                      \
++                   : "+r" (__val) : : "memory");                      \
++} while (0)
++
++/*
++ * Save a segment register away
++ */
++#define savesegment(seg, value)                               \
++      asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
++
++/*
++ * x86_32 user gs accessors.
++ */
++#ifdef CONFIG_X86_32
++#ifdef CONFIG_X86_32_LAZY_GS
++#define get_user_gs(regs)     (u16)({unsigned long v; savesegment(gs, v); v;})
++#define set_user_gs(regs, v)  loadsegment(gs, (unsigned long)(v))
++#define task_user_gs(tsk)     ((tsk)->thread.gs)
++#define lazy_save_gs(v)               savesegment(gs, (v))
++#define lazy_load_gs(v)               loadsegment(gs, (v))
++#else /* X86_32_LAZY_GS */
++#define get_user_gs(regs)     (u16)((regs)->gs)
++#define set_user_gs(regs, v)  do { (regs)->gs = (v); } while (0)
++#define task_user_gs(tsk)     (task_pt_regs(tsk)->gs)
++#define lazy_save_gs(v)               do { } while (0)
++#define lazy_load_gs(v)               do { } while (0)
++#endif        /* X86_32_LAZY_GS */
++#endif        /* X86_32 */
++
++static inline unsigned long get_limit(unsigned long segment)
++{
++      unsigned long __limit;  /* not initialised: only written if LSL succeeds */
++      asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); /* LSL: limit of selector 'segment' */
++      return __limit + 1;     /* callers receive limit + 1, not the raw limit */
++}
++
++static inline void xen_clts(void)
++{
++      HYPERVISOR_fpu_taskswitch(0);   /* argument 0 = clear TS (this backs the clts() macro) */
++}
++
++static inline void xen_stts(void)
++{
++      HYPERVISOR_fpu_taskswitch(1);   /* argument 1; presumably sets TS (opposite of clts) - confirm */
++}
++
++/*
++ * Volatile isn't enough to prevent the compiler from reordering the
++ * read/write functions for the control registers and messing everything up.
++ * A memory clobber would solve the problem, but would prevent reordering of
++ * all loads and stores around it, which can hurt performance. The solution is
++ * to use a variable and mimic reads and writes to it to enforce serialization.
++ */
++static unsigned long __force_order;
++
++static inline unsigned long xen_read_cr0(void)
++{
++      unsigned long val;
++      asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
++      return val;
++}
++
++static inline void xen_write_cr0(unsigned long val)
++{
++      asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
++}
++
++#define xen_read_cr2() vcpu_info_read(arch.cr2)
++#define xen_write_cr2(val) vcpu_info_write(arch.cr2, val)
++
++static inline unsigned long xen_read_cr3(void)
++{
++      unsigned long val;      /* raw CR3; presumably a machine address under Xen - confirm */
++      asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); /* "=m": orders CR accesses */
++#ifdef CONFIG_X86_32
++      return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; /* frame number translated, then shifted back */
++#else
++      return machine_to_phys(val);    /* whole value translated in one step */
++#endif
++}
++
++static inline void xen_write_cr3(unsigned long val)
++{
++#ifdef CONFIG_X86_32
++      val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT)); /* reverse of the xen_read_cr3() conversion */
++#else
++      val = phys_to_machine(val);     /* reverse of machine_to_phys() in xen_read_cr3() */
++#endif
++      asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); /* "m": orders CR accesses */
++}
++
++static inline unsigned long xen_read_cr4(void)
++{
++      unsigned long val;
++      asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
++      return val;
++}
++
++#define xen_read_cr4_safe() xen_read_cr4()
++
++static inline void xen_write_cr4(unsigned long val)
++{
++      asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
++}
++
++#ifdef CONFIG_X86_64
++static inline unsigned long xen_read_cr8(void)
++{
++      return 0;       /* CR8 is not accessed; a constant 0 is reported */
++}
++
++static inline void xen_write_cr8(unsigned long val)
++{
++      BUG_ON(val);    /* nothing is written; any non-zero value trips BUG_ON */
++}
++#endif
++
++static inline void xen_wbinvd(void)
++{
++      asm volatile("wbinvd": : :"memory");
++}
++
++#define read_cr0()    (xen_read_cr0())
++#define write_cr0(x)  (xen_write_cr0(x))
++#define read_cr2()    (xen_read_cr2())
++#define write_cr2(x)  (xen_write_cr2(x))
++#define read_cr3()    (xen_read_cr3())
++#define write_cr3(x)  (xen_write_cr3(x))
++#define read_cr4()    (xen_read_cr4())
++#define read_cr4_safe()       (xen_read_cr4_safe())
++#define write_cr4(x)  (xen_write_cr4(x))
++#define wbinvd()      (xen_wbinvd())
++#ifdef CONFIG_X86_64
++#define read_cr8()    (xen_read_cr8())
++#define write_cr8(x)  (xen_write_cr8(x))
++#define load_gs_index   xen_load_gs_index
++#endif
++
++/* Clear the 'TS' bit */
++#define clts()                (xen_clts())
++#define stts()                (xen_stts())
++
++#endif /* __KERNEL__ */
++
++static inline void clflush(volatile void *__p)
++{
++      asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
++}
++
++#define nop() asm volatile ("nop")
++
++void disable_hlt(void);
++void enable_hlt(void);
++
++void cpu_idle_wait(void);
++
++extern unsigned long arch_align_stack(unsigned long sp);
++extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
++
++void xen_idle(void);
++
++void stop_this_cpu(void *dummy);
++
++/*
++ * Force strict CPU ordering.
++ * And yes, this is required on UP too when we're talking
++ * to devices.
++ */
++#ifdef CONFIG_X86_32
++/*
++ * Some non-Intel clones support out of order store. wmb() ceases to be a
++ * nop for these.
++ */
++#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
++#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
++#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
++#else
++#define mb()  asm volatile("mfence":::"memory")
++#define rmb() asm volatile("lfence":::"memory")
++#define wmb() asm volatile("sfence" ::: "memory")
++#endif
++
++/**
++ * read_barrier_depends - Flush all pending reads that subsequent reads
++ * depend on.
++ *
++ * No data-dependent reads from memory-like regions are ever reordered
++ * over this barrier.  All reads preceding this primitive are guaranteed
++ * to access memory (but not necessarily other CPUs' caches) before any
++ * reads following this primitive that depend on the data returned by
++ * any of the preceding reads.  This primitive is much lighter weight than
++ * rmb() on most CPUs, and is never heavier weight than
++ * rmb().
++ *
++ * These ordering constraints are respected by both the local CPU
++ * and the compiler.
++ *
++ * Ordering is not guaranteed by anything other than these primitives,
++ * not even by data dependencies.  See the documentation for
++ * memory_barrier() for examples and URLs to more information.
++ *
++ * For example, the following code would force ordering (the initial
++ * value of "a" is zero, "b" is one, and "p" is "&a"):
++ *
++ * <programlisting>
++ *    CPU 0                           CPU 1
++ *
++ *    b = 2;
++ *    memory_barrier();
++ *    p = &b;                         q = p;
++ *                                    read_barrier_depends();
++ *                                    d = *q;
++ * </programlisting>
++ *
++ * because the read of "*q" depends on the read of "p" and these
++ * two reads are separated by a read_barrier_depends().  However,
++ * the following code, with the same initial values for "a" and "b":
++ *
++ * <programlisting>
++ *    CPU 0                           CPU 1
++ *
++ *    a = 2;
++ *    memory_barrier();
++ *    b = 3;                          y = b;
++ *                                    read_barrier_depends();
++ *                                    x = a;
++ * </programlisting>
++ *
++ * does not enforce ordering, since there is no data dependency between
++ * the read of "a" and the read of "b".  Therefore, on some CPUs, such
++ * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
++ * in cases like this where there are no data dependencies.
++ **/
++
++#define read_barrier_depends()        do { } while (0)
++
++#ifdef CONFIG_SMP
++#define smp_mb()      mb()
++#ifdef CONFIG_X86_PPRO_FENCE
++# define smp_rmb()    rmb()
++#else
++# define smp_rmb()    barrier()
++#endif
++#ifdef CONFIG_X86_OOSTORE
++# define smp_wmb()    wmb()
++#else
++# define smp_wmb()    barrier()
++#endif
++#define smp_read_barrier_depends()    read_barrier_depends()
++#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
++#else
++#define smp_mb()      barrier()
++#define smp_rmb()     barrier()
++#define smp_wmb()     barrier()
++#define smp_read_barrier_depends()    do { } while (0)
++#define set_mb(var, value) do { var = value; barrier(); } while (0)
++#endif
++
++/*
++ * Stop RDTSC speculation. This is needed when you need to use RDTSC
++ * (or get_cycles or vread that possibly accesses the TSC) in a defined
++ * code region.
++ *
++ * (Could use a three-way alternative for this if there were one.)
++ */
++static __always_inline void rdtsc_barrier(void)
++{
++      alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); /* NOP unless MFENCE_RDTSC is set */
++      alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); /* NOP unless LFENCE_RDTSC is set */
++}
++
++/*
++ * We handle most unaligned accesses in hardware.  On the other hand
++ * unaligned DMA can be quite expensive on some Nehalem processors.
++ *
++ * Based on this we disable the IP header alignment in network drivers.
++ */
++#define NET_IP_ALIGN  0
++#endif /* _ASM_X86_SYSTEM_H */
diff --cc arch/x86/include/mach-xen/asm/tlbflush.h

index 0000000,0000000..0dc6dd6

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/tlbflush.h
@@@ -1,0 -1,0 +1,114 @@@
++#ifndef _ASM_X86_TLBFLUSH_H
++#define _ASM_X86_TLBFLUSH_H
++
++#include <linux/mm.h>
++#include <linux/sched.h>
++
++#include <asm/processor.h>
++#include <asm/system.h>
++
++#define __flush_tlb() xen_tlb_flush()
++#define __flush_tlb_global() xen_tlb_flush()
++#define __flush_tlb_single(addr) xen_invlpg(addr)
++#define __flush_tlb_all() xen_tlb_flush()
++#define __flush_tlb_one(addr) xen_invlpg(addr)
++
++#ifdef CONFIG_X86_32
++# define TLB_FLUSH_ALL        0xffffffff
++#else
++# define TLB_FLUSH_ALL        -1ULL
++#endif
++
++/*
++ * TLB flushing:
++ *
++ *  - flush_tlb() flushes the current mm struct TLBs
++ *  - flush_tlb_all() flushes all processes' TLBs
++ *  - flush_tlb_mm(mm) flushes the specified mm context's TLBs
++ *  - flush_tlb_page(vma, vmaddr) flushes one page
++ *  - flush_tlb_range(vma, start, end) flushes a range of pages
++ *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
++ *
++ * ..but the i386 has somewhat limited tlb flushing capabilities,
++ * and page-granular flushes are available only on i486 and up.
++ *
++ * x86-64 can only flush individual pages or full VMs. For a range flush
++ * we always do the full VM. It might be worth checking whether, for a
++ * small range, a few INVLPGs in a row are a win.
++ */
++
++#ifndef CONFIG_SMP
++
++#define flush_tlb() __flush_tlb()
++#define flush_tlb_all() __flush_tlb_all()
++#define local_flush_tlb() __flush_tlb()
++
++static inline void flush_tlb_mm(struct mm_struct *mm)
++{
++      if (mm == current->active_mm)   /* any other mm: nothing is flushed */
++              __flush_tlb();
++}
++
++static inline void flush_tlb_page(struct vm_area_struct *vma,
++                                unsigned long addr)
++{
++      if (vma->vm_mm == current->active_mm)   /* any other mm: nothing is flushed */
++              __flush_tlb_one(addr);  /* single-address flush (expands to xen_invlpg) */
++}
++
++static inline void flush_tlb_range(struct vm_area_struct *vma,
++                                 unsigned long start, unsigned long end)
++{
++      if (vma->vm_mm == current->active_mm)   /* any other mm: nothing is flushed */
++              __flush_tlb();  /* start/end are ignored; a full __flush_tlb() is done */
++}
++
++static inline void reset_lazy_tlbstate(void)
++{
++}
++
++#else  /* SMP */
++
++#include <asm/smp.h>
++
++#define local_flush_tlb() __flush_tlb()
++
++#define flush_tlb_all xen_tlb_flush_all
++#define flush_tlb_current_task() xen_tlb_flush_mask(mm_cpumask(current->mm))
++#define flush_tlb_mm(mm) xen_tlb_flush_mask(mm_cpumask(mm))
++#define flush_tlb_page(vma, va) xen_invlpg_mask(mm_cpumask((vma)->vm_mm), va)
++
++#define flush_tlb()   flush_tlb_current_task()
++
++static inline void flush_tlb_range(struct vm_area_struct *vma,
++                                 unsigned long start, unsigned long end)
++{
++      flush_tlb_mm(vma->vm_mm);       /* start/end are ignored; the whole mm is flushed */
++}
++
++#ifndef CONFIG_XEN
++#define TLBSTATE_OK   1
++#define TLBSTATE_LAZY 2
++
++struct tlb_state {
++      struct mm_struct *active_mm;
++      int state;
++};
++DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
++
++static inline void reset_lazy_tlbstate(void)
++{
++      percpu_write(cpu_tlbstate.state, 0);    /* 0 is neither TLBSTATE_OK (1) nor TLBSTATE_LAZY (2) */
++      percpu_write(cpu_tlbstate.active_mm, &init_mm); /* this CPU's active_mm now points at init_mm */
++}
++#endif
++
++#endif        /* SMP */
++
++static inline void flush_tlb_kernel_range(unsigned long start,
++                                        unsigned long end)
++{
++      flush_tlb_all();        /* start/end are ignored; everything is flushed */
++}
++
++#endif /* _ASM_X86_TLBFLUSH_H */
diff --cc arch/x86/include/mach-xen/asm/vga.h

index 0000000,0000000..fe4a3c4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/vga.h
@@@ -1,0 -1,0 +1,20 @@@
++/*
++ *    Access to VGA videoram
++ *
++ *    (c) 1998 Martin Mares <mj@ucw.cz>
++ */
++
++#ifndef _ASM_X86_VGA_H
++#define _ASM_X86_VGA_H
++
++/*
++ *    On the PC, we can just recalculate addresses and then
++ *    access the videoram directly without any black magic.
++ */
++
++#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x)
++
++#define vga_readb(x) (*(x))
++#define vga_writeb(x, y) (*(y) = (x))
++
++#endif /* _ASM_X86_VGA_H */
diff --cc arch/x86/include/mach-xen/asm/xenoprof.h

index 0000000,0000000..2733e00

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/xenoprof.h
@@@ -1,0 -1,0 +1,48 @@@
++/******************************************************************************
++ * asm-i386/mach-xen/asm/xenoprof.h
++ *
++ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
++ *                    VA Linux Systems Japan K.K.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++ *
++ */
++#ifndef __ASM_XENOPROF_H__
++#define __ASM_XENOPROF_H__
++#ifdef CONFIG_XEN
++
++struct super_block;
++struct dentry;
++int xenoprof_create_files(struct super_block * sb, struct dentry * root);
++#define HAVE_XENOPROF_CREATE_FILES
++
++struct xenoprof_init;
++void xenoprof_arch_init_counter(struct xenoprof_init *init);
++void xenoprof_arch_counter(void);
++void xenoprof_arch_start(void);
++void xenoprof_arch_stop(void);
++
++struct xenoprof_arch_shared_buffer {
++      /* nothing */
++};
++struct xenoprof_shared_buffer;
++void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
++struct xenoprof_get_buffer;
++int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
++struct xenoprof_passive;
++int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
++
++#endif /* CONFIG_XEN */
++#endif /* __ASM_XENOPROF_H__ */
diff --cc arch/x86/include/mach-xen/asm/xor.h

index 0000000,0000000..edb08e6

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/xor.h
@@@ -1,0 -1,0 +1,8 @@@
++#ifdef CONFIG_KMEMCHECK
++/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
++# include <asm-generic/xor.h>
++#elif defined(CONFIG_X86_32)
++# include "../../asm/xor_32.h"
++#else
++# include "xor_64.h"
++#endif
diff --cc arch/x86/include/mach-xen/asm/xor_64.h

index 0000000,0000000..d0ad82d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/xor_64.h
@@@ -1,0 -1,0 +1,337 @@@
++#ifndef _ASM_X86_XOR_64_H
++#define _ASM_X86_XOR_64_H
++
++/*
++ * x86-64 changes / gcc fixes from Andi Kleen.
++ * Copyright 2002 Andi Kleen, SuSE Labs.
++ *
++ * This hasn't been optimized for the hammer yet, but there are likely
++ * no advantages to be gotten from x86-64 here anyways.
++ */
++
++/*
++ * 16-byte-aligned scratch slot made of two unsigned longs.  The xor_sse_*
++ * routines declare an array of four of these (xmm_save[4]) which
++ * XMMS_SAVE/XMMS_RESTORE use to hold %xmm0-%xmm3.
++ */
++typedef struct {
++      unsigned long a, b;
++} __attribute__((aligned(16))) xmm_store_t;
++
++/* Doesn't use gcc to save the XMM registers, because there is no easy way to
++   tell it to do a clts before the register saving. */
++/*
++ * XMMS_SAVE expects two locals in the expanding scope: `cr0` and `xmm_save`.
++ * It disables preemption, calls clts() when the current thread's status does
++ * not have TS_USEDFPU set, and then stores %xmm0-%xmm3 to xmm_save at byte
++ * offsets 0x00, 0x10, 0x20 and 0x30.  Preemption stays disabled until
++ * XMMS_RESTORE runs.
++ *
++ * NOTE(review): the asm template never references operand %0 (cr0); it looks
++ * like the operand is kept only so that xmm_save remains operand %1 - confirm.
++ */
++#define XMMS_SAVE                             \
++do {                                          \
++      preempt_disable();                      \
++      if (!(current_thread_info()->status & TS_USEDFPU))      \
++              clts();                         \
++      asm volatile(                           \
++              "movups %%xmm0,(%1)     ;\n\t"  \
++              "movups %%xmm1,0x10(%1) ;\n\t"  \
++              "movups %%xmm2,0x20(%1) ;\n\t"  \
++              "movups %%xmm3,0x30(%1) ;\n\t"  \
++              : "=&r" (cr0)                   \
++              : "r" (xmm_save)                \
++              : "memory");                    \
++} while (0)
++
++/*
++ * Counterpart of XMMS_SAVE; uses the same `cr0` and `xmm_save` locals.
++ * Issues an sfence, reloads %xmm0-%xmm3 from xmm_save (offsets 0x00..0x30),
++ * calls stts() when the current thread's status does not have TS_USEDFPU
++ * set, and finally re-enables preemption.
++ *
++ * NOTE(review): cr0 is passed as input operand %0 but the template only uses
++ * %1 (xmm_save); presumably it is there to keep the operand numbering - confirm.
++ */
++#define XMMS_RESTORE                          \
++do {                                          \
++      asm volatile(                           \
++              "sfence                 ;\n\t"  \
++              "movups (%1),%%xmm0     ;\n\t"  \
++              "movups 0x10(%1),%%xmm1 ;\n\t"  \
++              "movups 0x20(%1),%%xmm2 ;\n\t"  \
++              "movups 0x30(%1),%%xmm3 ;\n\t"  \
++              :                               \
++              : "r" (cr0), "r" (xmm_save)     \
++              : "memory");                    \
++      if (!(current_thread_info()->status & TS_USEDFPU))      \
++              stts();                         \
++      preempt_enable();                       \
++} while (0)
++
++/*
++ * String-building helpers for the asm bodies of xor_sse_2..xor_sse_5.
++ * Each macro expands to one line of assembler text:
++ *   OFFS(x)     byte offset of 16-byte slot x            -> "16*(x)"
++ *   PF_OFFS(x)  the same slot, 256 bytes further ahead   -> "256+16*(x)"
++ *   LD(x, y)    movaps slot x of [p1] into %xmm<y>
++ *   ST(x, y)    movaps %xmm<y> back to slot x of [p1]
++ *   XOn(x, y)   xorps slot x of source n into %xmm<y>
++ *               (XO1 reads [p2], XO2 [p3], ... XO5 [p6])
++ *   PFn(x)      prefetchnta at PF_OFFS(x) from buffer n
++ *               (PF0 uses [p1], PF1 [p2], ... PF5 [p6])
++ */
++#define OFFS(x)               "16*("#x")"
++#define PF_OFFS(x)    "256+16*("#x")"
++#define       PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
++#define LD(x, y)      "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
++#define ST(x, y)      "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
++#define PF1(x)                "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
++#define PF2(x)                "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
++#define PF3(x)                "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
++#define PF4(x)                "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
++#define PF5(x)                "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
++#define XO1(x, y)     "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
++#define XO2(x, y)     "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
++#define XO3(x, y)     "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
++#define XO4(x, y)     "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
++#define XO5(x, y)     "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
++
++
++/*
++ * xor_sse_2 - XOR the buffer at @p2 into the buffer at @p1 (p1 ^= p2).
++ * @bytes: amount of data; processed in units of 256 bytes (bytes >> 8)
++ * @p1:    destination, also the first source
++ * @p2:    second source
++ *
++ * Each loop iteration expands BLOCK(0), BLOCK(4), BLOCK(8) and BLOCK(12),
++ * i.e. sixteen 16-byte slots = 256 bytes: load from p1 into %xmm0-%xmm3,
++ * xorps with the matching slots of p2, store back to p1.  Both pointers then
++ * advance by 256 and the line counter is decremented.
++ *
++ * The counter is tested only after the body has run, so @bytes must be at
++ * least 256; any remainder below a multiple of 256 is not processed.
++ * `cr0` and `xmm_save` exist for XMMS_SAVE/XMMS_RESTORE, which bracket the
++ * loop.
++ */
++static void
++xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++{
++      unsigned int lines = bytes >> 8;
++      unsigned long cr0;
++      xmm_store_t xmm_save[4];
++
++      XMMS_SAVE;
++
++      asm volatile(
++#undef BLOCK
++#define BLOCK(i) \
++              LD(i, 0)                                \
++                      LD(i + 1, 1)                    \
++              PF1(i)                                  \
++                              PF1(i + 2)              \
++                              LD(i + 2, 2)            \
++                                      LD(i + 3, 3)    \
++              PF0(i + 4)                              \
++                              PF0(i + 6)              \
++              XO1(i, 0)                               \
++                      XO1(i + 1, 1)                   \
++                              XO1(i + 2, 2)           \
++                                      XO1(i + 3, 3)   \
++              ST(i, 0)                                \
++                      ST(i + 1, 1)                    \
++                              ST(i + 2, 2)            \
++                                      ST(i + 3, 3)    \
++
++
++              PF0(0)
++                              PF0(2)
++
++      " .align 32                     ;\n"
++      " 1:                            ;\n"
++
++              BLOCK(0)
++              BLOCK(4)
++              BLOCK(8)
++              BLOCK(12)
++
++      "       addq %[inc], %[p1]           ;\n"
++      "       addq %[inc], %[p2]           ;\n"
++              "               decl %[cnt] ; jnz 1b"
++      : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
++      : [inc] "r" (256UL)
++      : "memory");
++
++      XMMS_RESTORE;
++}
++
++/*
++ * xor_sse_3 - XOR two source buffers into @p1 (p1 ^= p2 ^ p3).
++ * @bytes: amount of data; processed in units of 256 bytes (bytes >> 8)
++ * @p1:    destination, also the first source
++ * @p2:    second source
++ * @p3:    third source
++ *
++ * Same structure as xor_sse_2 with an extra XO2 pass over p3: each loop
++ * iteration handles sixteen 16-byte slots (256 bytes) and then advances all
++ * three pointers by 256.
++ *
++ * The counter is tested only after the body has run, so @bytes must be at
++ * least 256; any remainder below a multiple of 256 is not processed.
++ */
++static void
++xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
++        unsigned long *p3)
++{
++      unsigned int lines = bytes >> 8;
++      xmm_store_t xmm_save[4];
++      unsigned long cr0;
++
++      XMMS_SAVE;
++
++      asm volatile(
++#undef BLOCK
++#define BLOCK(i) \
++              PF1(i)                                  \
++                              PF1(i + 2)              \
++              LD(i, 0)                                        \
++                      LD(i + 1, 1)                    \
++                              LD(i + 2, 2)            \
++                                      LD(i + 3, 3)    \
++              PF2(i)                                  \
++                              PF2(i + 2)              \
++              PF0(i + 4)                              \
++                              PF0(i + 6)              \
++              XO1(i, 0)                               \
++                      XO1(i + 1, 1)                   \
++                              XO1(i + 2, 2)           \
++                                      XO1(i + 3, 3)   \
++              XO2(i, 0)                               \
++                      XO2(i + 1, 1)                   \
++                              XO2(i + 2, 2)           \
++                                      XO2(i + 3, 3)   \
++              ST(i, 0)                                \
++                      ST(i + 1, 1)                    \
++                              ST(i + 2, 2)            \
++                                      ST(i + 3, 3)    \
++
++
++              PF0(0)
++                              PF0(2)
++
++      " .align 32                     ;\n"
++      " 1:                            ;\n"
++
++              BLOCK(0)
++              BLOCK(4)
++              BLOCK(8)
++              BLOCK(12)
++
++      "       addq %[inc], %[p1]           ;\n"
++      "       addq %[inc], %[p2]          ;\n"
++      "       addq %[inc], %[p3]           ;\n"
++              "               decl %[cnt] ; jnz 1b"
++      : [cnt] "+r" (lines),
++        [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
++      : [inc] "r" (256UL)
++      : "memory");
++      XMMS_RESTORE;
++}
++
++/*
++ * xor_sse_4 - XOR three source buffers into @p1 (p1 ^= p2 ^ p3 ^ p4).
++ * @bytes: amount of data; processed in units of 256 bytes (bytes >> 8)
++ * @p1:    destination, also the first source
++ * @p2:    second source
++ * @p3:    third source
++ * @p4:    fourth source
++ *
++ * Each loop iteration handles sixteen 16-byte slots (256 bytes): load from
++ * p1, xorps with p2, p3 and p4 in turn, store back to p1, then advance all
++ * four pointers by 256.  Unlike xor_sse_2/xor_sse_3 the line counter uses
++ * the "+c" constraint instead of "+r".
++ *
++ * The counter is tested only after the body has run, so @bytes must be at
++ * least 256; any remainder below a multiple of 256 is not processed.
++ */
++static void
++xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
++        unsigned long *p3, unsigned long *p4)
++{
++      unsigned int lines = bytes >> 8;
++      xmm_store_t xmm_save[4];
++      unsigned long cr0;
++
++      XMMS_SAVE;
++
++      asm volatile(
++#undef BLOCK
++#define BLOCK(i) \
++              PF1(i)                                  \
++                              PF1(i + 2)              \
++              LD(i, 0)                                \
++                      LD(i + 1, 1)                    \
++                              LD(i + 2, 2)            \
++                                      LD(i + 3, 3)    \
++              PF2(i)                                  \
++                              PF2(i + 2)              \
++              XO1(i, 0)                               \
++                      XO1(i + 1, 1)                   \
++                              XO1(i + 2, 2)           \
++                                      XO1(i + 3, 3)   \
++              PF3(i)                                  \
++                              PF3(i + 2)              \
++              PF0(i + 4)                              \
++                              PF0(i + 6)              \
++              XO2(i, 0)                               \
++                      XO2(i + 1, 1)                   \
++                              XO2(i + 2, 2)           \
++                                      XO2(i + 3, 3)   \
++              XO3(i, 0)                               \
++                      XO3(i + 1, 1)                   \
++                              XO3(i + 2, 2)           \
++                                      XO3(i + 3, 3)   \
++              ST(i, 0)                                \
++                      ST(i + 1, 1)                    \
++                              ST(i + 2, 2)            \
++                                      ST(i + 3, 3)    \
++
++
++              PF0(0)
++                              PF0(2)
++
++      " .align 32                     ;\n"
++      " 1:                            ;\n"
++
++              BLOCK(0)
++              BLOCK(4)
++              BLOCK(8)
++              BLOCK(12)
++
++      "       addq %[inc], %[p1]           ;\n"
++      "       addq %[inc], %[p2]           ;\n"
++      "       addq %[inc], %[p3]           ;\n"
++      "       addq %[inc], %[p4]           ;\n"
++      "       decl %[cnt] ; jnz 1b"
++      : [cnt] "+c" (lines),
++        [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
++      : [inc] "r" (256UL)
++      : "memory" );
++
++      XMMS_RESTORE;
++}
++
++/*
++ * xor_sse_5 - XOR four source buffers into @p1 (p1 ^= p2 ^ p3 ^ p4 ^ p5).
++ * @bytes: amount of data; processed in units of 256 bytes (bytes >> 8)
++ * @p1:    destination, also the first source
++ * @p2:    second source
++ * @p3:    third source
++ * @p4:    fourth source
++ * @p5:    fifth source
++ *
++ * Each loop iteration handles sixteen 16-byte slots (256 bytes): load from
++ * p1, xorps with p2..p5 in turn, store back to p1, then advance all five
++ * pointers by 256.  The line counter uses the "+c" constraint.
++ *
++ * The counter is tested only after the body has run, so @bytes must be at
++ * least 256; any remainder below a multiple of 256 is not processed.
++ */
++static void
++xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
++        unsigned long *p3, unsigned long *p4, unsigned long *p5)
++{
++      unsigned int lines = bytes >> 8;
++      xmm_store_t xmm_save[4];
++      unsigned long cr0;
++
++      XMMS_SAVE;
++
++      asm volatile(
++#undef BLOCK
++#define BLOCK(i) \
++              PF1(i)                                  \
++                              PF1(i + 2)              \
++              LD(i, 0)                                \
++                      LD(i + 1, 1)                    \
++                              LD(i + 2, 2)            \
++                                      LD(i + 3, 3)    \
++              PF2(i)                                  \
++                              PF2(i + 2)              \
++              XO1(i, 0)                               \
++                      XO1(i + 1, 1)                   \
++                              XO1(i + 2, 2)           \
++                                      XO1(i + 3, 3)   \
++              PF3(i)                                  \
++                              PF3(i + 2)              \
++              XO2(i, 0)                               \
++                      XO2(i + 1, 1)                   \
++                              XO2(i + 2, 2)           \
++                                      XO2(i + 3, 3)   \
++              PF4(i)                                  \
++                              PF4(i + 2)              \
++              PF0(i + 4)                              \
++                              PF0(i + 6)              \
++              XO3(i, 0)                               \
++                      XO3(i + 1, 1)                   \
++                              XO3(i + 2, 2)           \
++                                      XO3(i + 3, 3)   \
++              XO4(i, 0)                               \
++                      XO4(i + 1, 1)                   \
++                              XO4(i + 2, 2)           \
++                                      XO4(i + 3, 3)   \
++              ST(i, 0)                                \
++                      ST(i + 1, 1)                    \
++                              ST(i + 2, 2)            \
++                                      ST(i + 3, 3)    \
++
++
++              PF0(0)
++                              PF0(2)
++
++      " .align 32                     ;\n"
++      " 1:                            ;\n"
++
++              BLOCK(0)
++              BLOCK(4)
++              BLOCK(8)
++              BLOCK(12)
++
++      "       addq %[inc], %[p1]           ;\n"
++      "       addq %[inc], %[p2]           ;\n"
++      "       addq %[inc], %[p3]           ;\n"
++      "       addq %[inc], %[p4]           ;\n"
++      "       addq %[inc], %[p5]           ;\n"
++      "       decl %[cnt] ; jnz 1b"
++      : [cnt] "+c" (lines),
++        [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
++        [p5] "+r" (p5)
++      : [inc] "r" (256UL)
++      : "memory");
++
++      XMMS_RESTORE;
++}
++
++/*
++ * Template named "generic_sse" that bundles the four SSE routines above;
++ * .do_N is the variant taking N buffers (destination included).
++ */
++static struct xor_block_template xor_block_sse = {
++      .name = "generic_sse",
++      .do_2 = xor_sse_2,
++      .do_3 = xor_sse_3,
++      .do_4 = xor_sse_4,
++      .do_5 = xor_sse_5,
++};
++
++/*
++ * Replace any earlier XOR_TRY_TEMPLATES definition: the only template handed
++ * to xor_speed() here is xor_block_sse.
++ */
++#undef XOR_TRY_TEMPLATES
++#define XOR_TRY_TEMPLATES                     \
++do {                                          \
++      xor_speed(&xor_block_sse);              \
++} while (0)
++
++/* We force the use of the SSE xor block because it can write around L2.
++   We may also be able to load into the L1 only depending on how the cpu
++   deals with a load to a line that is being prefetched.  */
++/* The FASTEST argument is ignored; the result is always &xor_block_sse. */
++#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
++
++#endif /* _ASM_X86_XOR_64_H */
diff --cc arch/x86/kernel/Makefile

index f5abe3a,7338ef2..ac52fd0
--- 1/arch/x86/kernel/Makefile
--- 2/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@@ -114,6 -112,6 +112,8 @@@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION
   obj-$(CONFIG_SWIOTLB)                 += pci-swiotlb.o
   obj-$(CONFIG_OF)                      += devicetree.o
   
++obj-$(CONFIG_X86_XEN)         += fixup.o
++
   ###
   # 64 bit specific files
   ifeq ($(CONFIG_X86_64),y)
@@@ -126,3 -124,3 +126,7 @@@
         obj-$(CONFIG_PCI_MMCONFIG)      += mmconf-fam10h_64.o
         obj-y                           += vsmp_64.o
   endif
++
++disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8253.o i8259.o \
++      irqinit.o pci-swiotlb.o reboot.o smpboot.o trampoline%.o tsc.o tsc_sync.o vsmp_64.o
++disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms_32.o
diff --cc arch/x86/kernel/acpi/Makefile

index 6f35260,6f35260..528e3de
--- 1/arch/x86/kernel/acpi/Makefile
--- 2/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@@ -5,6 -5,6 +5,9 @@@ obj-$(CONFIG_ACPI_SLEEP) += sleep.o wak
   
   ifneq ($(CONFIG_ACPI_PROCESSOR),)
   obj-y                         += cstate.o
++ifneq ($(CONFIG_PROCESSOR_EXTERNAL_CONTROL),)
++obj-$(CONFIG_XEN)             += processor_extcntl_xen.o
++endif
   endif
   
   $(obj)/wakeup_rm.o:    $(obj)/realmode/wakeup.bin
@@@ -12,3 -12,3 +15,4 @@@
   $(obj)/realmode/wakeup.bin: FORCE
         $(Q)$(MAKE) $(build)=$(obj)/realmode
   
++disabled-obj-$(CONFIG_XEN)    := cstate.o sleep.o wakeup_%.o
diff --cc arch/x86/kernel/acpi/boot.c

index d246e74,9a966c5..ca873e6
--- 1/arch/x86/kernel/acpi/boot.c
--- 2/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@@ -70,6 -70,6 +70,7 @@@ int acpi_strict
   
   u8 acpi_sci_flags __initdata;
   int acpi_sci_override_gsi __initdata;
++#ifndef CONFIG_XEN
   int acpi_skip_timer_override __initdata;
   int acpi_use_timer_override __initdata;
   int acpi_fix_pin2_polarity __initdata;
@@@ -77,6 -77,6 +78,10 @@@
   #ifdef CONFIG_X86_LOCAL_APIC
   static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
   #endif
++#else
++#define acpi_skip_timer_override 0
++#define acpi_fix_pin2_polarity 0
++#endif
   
   #ifndef __HAVE_ARCH_CMPXCHG
   #warning ACPI uses CMPXCHG, i486 and later hardware
@@@ -182,6 -182,6 +187,7 @@@ static int __init acpi_parse_madt(struc
                 return -ENODEV;
         }
   
++#ifndef CONFIG_XEN
         if (madt->address) {
                 acpi_lapic_addr = (u64) madt->address;
   
@@@ -191,12 -191,12 +197,14 @@@
   
         default_acpi_madt_oem_check(madt->header.oem_id,
                                     madt->header.oem_table_id);
++#endif
   
         return 0;
   }
   
   static void __cpuinit acpi_register_lapic(int id, u8 enabled)
   {
++#ifndef CONFIG_XEN
         unsigned int ver = 0;
   
         if (id >= (MAX_LOCAL_APIC-1)) {
@@@ -213,6 -213,6 +221,7 @@@
                 ver = apic_version[boot_cpu_physical_apicid];
   
         generic_processor_info(id, ver);
++#endif
   }
   
   static int __init
@@@ -291,6 -291,6 +300,7 @@@ static int __ini
   acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
                           const unsigned long end)
   {
++#ifndef CONFIG_XEN
         struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
   
         lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header;
@@@ -299,6 -299,6 +309,7 @@@
                 return -EINVAL;
   
         acpi_lapic_addr = lapic_addr_ovr->address;
++#endif
   
         return 0;
   }
@@@ -587,6 -587,6 +598,7 @@@ void __init acpi_set_irq_model_ioapic(v
   #ifdef CONFIG_ACPI_HOTPLUG_CPU
   #include <acpi/processor.h>
   
++#ifndef CONFIG_XEN
   static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
   {
   #ifdef CONFIG_ACPI_NUMA
@@@ -672,6 -672,6 +684,9 @@@ free_tmp_map
   out:
         return retval;
   }
++#else
++#define _acpi_map_lsapic(h, p) (-EINVAL)
++#endif
   
   /* wrapper to silence section mismatch warning */
   int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
@@@ -682,9 -682,9 +697,11 @@@ EXPORT_SYMBOL(acpi_map_lsapic)
   
   int acpi_unmap_lsapic(int cpu)
   {
++#ifndef CONFIG_XEN
         per_cpu(x86_cpu_to_apicid, cpu) = -1;
         set_cpu_present(cpu, false);
         num_processors--;
++#endif
   
         return (0);
   }
@@@ -1326,6 -1326,6 +1343,7 @@@ static int __init dmi_disable_acpi(cons
         return 0;
   }
   
++#ifndef CONFIG_XEN
   /*
    * Force ignoring BIOS IRQ0 pin2 override
    */
@@@ -1343,21 -1343,6 +1361,22 @@@ static int __init dmi_ignore_irq0_timer
         }
         return 0;
   }
++#endif
+ +
+ +/*
+ + * DMI match callback.  Unless acpi_force is set, logs the matched entry's
+ + * ident and sets acpi_rsdt_forced = 1; when acpi_force is set it only logs
+ + * that acpi=force overrules the blacklist entry.  Always returns 0.
+ + */
+ +static int __init force_acpi_rsdt(const struct dmi_system_id *d)
+ +{
+ +      if (!acpi_force) {
+ +              printk(KERN_NOTICE "%s detected: force use of acpi=rsdt\n",
+ +                     d->ident);
+ +              acpi_rsdt_forced = 1;
+ +      } else {
+ +              printk(KERN_NOTICE
+ +                     "Warning: acpi=force overrules DMI blacklist: "
+ +                     "acpi=rsdt\n");
+ +      }
+ +      return 0;
+ +
+ +}
   
   /*
    * If your system is blacklisted here, but you find that acpi=force
@@@ -1463,6 -1422,6 +1482,7 @@@ static struct dmi_system_id __initdata 
         {}
   };
   
++#ifndef CONFIG_XEN
   /* second table for DMI checks that should run after early-quirks */
   static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
         /*
@@@ -1509,6 -1468,6 +1529,7 @@@
          },
         {}
   };
++#endif
   
   /*
    * acpi_boot_table_init() and acpi_boot_init()
@@@ -1581,8 -1540,8 +1602,10 @@@ int __init early_acpi_boot_init(void
   
   int __init acpi_boot_init(void)
   {
++#ifndef CONFIG_XEN
         /* those are executed after early-quirks are executed */
         dmi_check_system(acpi_dmi_table_late);
++#endif
   
         /*
          * If acpi_disabled, bail out
@@@ -1682,7 -1629,7 +1705,7 @@@ int __init acpi_mps_check(void
         return 0;
   }
   
--#ifdef CONFIG_X86_IO_APIC
++#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
   static int __init parse_acpi_skip_timer_override(char *arg)
   {
         acpi_skip_timer_override = 1;
diff --cc arch/x86/kernel/acpi/processor_extcntl_xen.c

index 0000000,0000000..6e7cf2d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/acpi/processor_extcntl_xen.c
@@@ -1,0 -1,0 +1,281 @@@
++/*
++ * processor_extcntl_xen.c - interface to notify Xen
++ *
++ *  Copyright (C) 2008, Intel corporation
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License as published by
++ *  the Free Software Foundation; either version 2 of the License, or (at
++ *  your option) any later version.
++ *
++ *  This program is distributed in the hope that it will be useful, but
++ *  WITHOUT ANY WARRANTY; without even the implied warranty of
++ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ *  General Public License for more details.
++ *
++ *  You should have received a copy of the GNU General Public License along
++ *  with this program; if not, write to the Free Software Foundation, Inc.,
++ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
++ *
++ */
++
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/acpi.h>
++#include <linux/pm.h>
++#include <linux/cpu.h>
++
++#include <linux/cpufreq.h>
++#include <acpi/processor.h>
++#include <asm/hypercall.h>
++
++/*
++ * xen_cx_notifier - hand a processor's valid C-state entries to Xen.
++ * @pr:     ACPI processor whose power.states[1..power.count] are reported
++ * @action: not used by this handler
++ *
++ * Copies every valid entry into a temporary array of struct
++ * xen_processor_cx and issues a XENPF_set_processor_pminfo / XEN_PM_CX
++ * platform hypercall.
++ *
++ * Returns the hypercall result, -ENOMEM if the temporary array cannot be
++ * allocated, or -EINVAL if any entry carries _CSD dependency data (not
++ * supported) or no entry is valid.
++ */
++static int xen_cx_notifier(struct acpi_processor *pr, int action)
++{
++      int ret, count = 0, i;
++      xen_platform_op_t op = {
++              .cmd                    = XENPF_set_processor_pminfo,
++              .interface_version      = XENPF_INTERFACE_VERSION,
++              .u.set_pminfo.id        = pr->acpi_id,
++              .u.set_pminfo.type      = XEN_PM_CX,
++      };
++      struct xen_processor_cx *data, *buf;
++      struct acpi_processor_cx *cx;
++
++      /*
++       * Convert to Xen defined structure and hypercall.  kcalloc() zeroes
++       * the array and checks the count * size multiplication for overflow.
++       */
++      buf = kcalloc(pr->power.count, sizeof(*buf), GFP_KERNEL);
++      if (!buf)
++              return -ENOMEM;
++
++      data = buf;
++      for (i = 1; i <= pr->power.count; i++) {
++              cx = &pr->power.states[i];
++              /* Skip invalid cstate entry */
++              if (!cx->valid)
++                      continue;
++
++              data->type = cx->type;
++              data->latency = cx->latency;
++              data->power = cx->power;
++              data->reg.space_id = cx->reg.space_id;
++              data->reg.bit_width = cx->reg.bit_width;
++              data->reg.bit_offset = cx->reg.bit_offset;
++              data->reg.access_size = cx->reg.access_size;
++              data->reg.address = cx->reg.address;
++
++              /* Get dependency relationships */
++              if (cx->csd_count) {
++                      pr_warning("_CSD found: Not supported for now!\n");
++                      ret = -EINVAL;
++                      goto out;
++              }
++              data->dpcnt = 0;
++              set_xen_guest_handle(data->dp, NULL);
++
++              data++;
++              count++;
++      }
++
++      if (!count) {
++              pr_info("No available Cx info for cpu %d\n", pr->acpi_id);
++              ret = -EINVAL;
++              goto out;
++      }
++
++      op.u.set_pminfo.u.power.count = count;
++      op.u.set_pminfo.u.power.flags.bm_control = pr->flags.bm_control;
++      op.u.set_pminfo.u.power.flags.bm_check = pr->flags.bm_check;
++      op.u.set_pminfo.u.power.flags.has_cst = pr->flags.has_cst;
++      op.u.set_pminfo.u.power.flags.power_setup_done = pr->flags.power_setup_done;
++
++      set_xen_guest_handle(op.u.set_pminfo.u.power.states, buf);
++      ret = HYPERVISOR_platform_op(&op);
++
++out:
++      /* Single exit: the temporary array is released on every path. */
++      kfree(buf);
++      return ret;
++}
++
++/*
++ * xen_px_notifier - hand a processor's performance (P-state) data to Xen.
++ * @pr:     ACPI processor; may be NULL, in which case -EINVAL is returned
++ * @action: PROCESSOR_PM_CHANGE sends only the current platform limit
++ *          (XEN_PX_PPC); PROCESSOR_PM_INIT sends the platform limit, the
++ *          control/status registers, the state table and the domain info
++ *
++ * Returns the hypercall result, -EINVAL for a NULL @pr, a processor without
++ * performance data or an unhandled @action, -ENOMEM if the state table
++ * cannot be allocated, or -ENODEV for an unrecognised coordination type.
++ */
++static int xen_px_notifier(struct acpi_processor *pr, int action)
++{
++      int ret = -EINVAL;
++      xen_platform_op_t op = {
++              .cmd                    = XENPF_set_processor_pminfo,
++              .interface_version      = XENPF_INTERFACE_VERSION,
++              .u.set_pminfo.type      = XEN_PM_PX,
++      };
++      struct xen_processor_performance *perf;
++      struct xen_processor_px *states = NULL;
++      struct acpi_processor_performance *px;
++      struct acpi_psd_package *pdomain;
++
++      if (!pr)
++              return -EINVAL;
++
++      px = pr->performance;
++      if (!px)
++              return -EINVAL;
++
++      /* Filled in only now, after pr has been checked for NULL. */
++      op.u.set_pminfo.id = pr->acpi_id;
++      perf = &op.u.set_pminfo.u.perf;
++
++      switch (action) {
++      case PROCESSOR_PM_CHANGE:
++              /* ppc dynamic handle */
++              perf->flags = XEN_PX_PPC;
++              perf->platform_limit = pr->performance_platform_limit;
++
++              ret = HYPERVISOR_platform_op(&op);
++              break;
++
++      case PROCESSOR_PM_INIT:
++              /* px normal init */
++              perf->flags = XEN_PX_PPC |
++                            XEN_PX_PCT |
++                            XEN_PX_PSS |
++                            XEN_PX_PSD;
++
++              /* ppc */
++              perf->platform_limit = pr->performance_platform_limit;
++
++              /* pct */
++              xen_convert_pct_reg(&perf->control_register, &px->control_register);
++              xen_convert_pct_reg(&perf->status_register, &px->status_register);
++
++              /* pss; kcalloc() checks the count * size product for overflow */
++              perf->state_count = px->state_count;
++              states = kcalloc(px->state_count, sizeof(*states), GFP_KERNEL);
++              if (!states)
++                      return -ENOMEM;
++              xen_convert_pss_states(states, px->states, px->state_count);
++              set_xen_guest_handle(perf->states, states);
++
++              /* psd */
++              pdomain = &px->domain_info;
++              xen_convert_psd_pack(&perf->domain_info, pdomain);
++              if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
++                      perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
++              else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
++                      perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
++              else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
++                      perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
++              else {
++                      ret = -ENODEV;
++                      break;
++              }
++
++              ret = HYPERVISOR_platform_op(&op);
++              break;
++
++      default:
++              break;
++      }
++
++      /* states is NULL unless PROCESSOR_PM_INIT allocated it; kfree(NULL) is a no-op. */
++      kfree(states);
++      return ret;
++}
++
++/*
++ * Stub installed by init_extcntl() as the PM_TYPE_THR handler when
++ * XEN_PROCESSOR_PM_TX is set: ignores both arguments and always
++ * returns -EINVAL.
++ */
++static int xen_tx_notifier(struct acpi_processor *pr, int action)
++{
++      return -EINVAL;
++}
++
++/*
++ * xen_hotplug_notifier - forward an ACPI CPU hotplug event to Xen.
++ * @pr:    ACPI processor the event refers to
++ * @event: HOTPLUG_TYPE_ADD or HOTPLUG_TYPE_REMOVE
++ *
++ * For HOTPLUG_TYPE_ADD the APIC id, ACPI id and _PXM value of @pr are passed
++ * to Xen through XENPF_cpu_hotadd and the hypercall result is returned.
++ * HOTPLUG_TYPE_REMOVE is rejected with -EOPNOTSUPP.
++ *
++ * Other return values: -ENXIO if the ACPI object type cannot be read,
++ * -EOPNOTSUPP for an object that is neither a processor nor a device,
++ * -ENODATA if the APIC id or _PXM cannot be obtained, and -EINVAL for any
++ * other @event or when CONFIG_ACPI_HOTPLUG_CPU is not enabled.
++ */
++static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
++{
++      int ret = -EINVAL;
++#ifdef CONFIG_ACPI_HOTPLUG_CPU
++      acpi_status status = 0;
++      acpi_object_type type;
++      /* Signed, so that a negative failure value from the lookup is detectable. */
++      int apic_id;
++      int device_decl = 0;
++      unsigned long long pxm;
++      xen_platform_op_t op = {
++              .interface_version  = XENPF_INTERFACE_VERSION,
++      };
++
++      status = acpi_get_type(pr->handle, &type);
++      if (ACPI_FAILURE(status)) {
++              pr_warn("can't get object type for acpi_id %#x\n",
++                      pr->acpi_id);
++              return -ENXIO;
++      }
++
++      switch (type) {
++      case ACPI_TYPE_PROCESSOR:
++              break;
++      case ACPI_TYPE_DEVICE:
++              device_decl = 1;
++              break;
++      default:
++              pr_warn("unsupported object type %#x for acpi_id %#x\n",
++                      type, pr->acpi_id);
++              return -EOPNOTSUPP;
++      }
++
++      /*
++       * NOTE(review): the flag is passed bit-complemented (~device_decl),
++       * exactly as before; what acpi_get_cpuid() does with a negative type
++       * is defined outside this file - confirm against its implementation.
++       */
++      apic_id = acpi_get_cpuid(pr->handle, ~device_decl, pr->acpi_id);
++      if (apic_id < 0) {
++              pr_warn("can't get apic_id for acpi_id %#x\n", pr->acpi_id);
++              return -ENODATA;
++      }
++
++      status = acpi_evaluate_integer(pr->handle, "_PXM", NULL, &pxm);
++      if (ACPI_FAILURE(status)) {
++              pr_warn("can't get pxm for acpi_id %#x\n", pr->acpi_id);
++              return -ENODATA;
++      }
++
++      switch (event) {
++      case HOTPLUG_TYPE_ADD:
++              op.cmd = XENPF_cpu_hotadd;
++              op.u.cpu_add.apic_id = apic_id;
++              op.u.cpu_add.acpi_id = pr->acpi_id;
++              op.u.cpu_add.pxm = pxm;
++              ret = HYPERVISOR_platform_op(&op);
++              break;
++      case HOTPLUG_TYPE_REMOVE:
++              pr_warn("Xen doesn't support CPU hot remove\n");
++              ret = -EOPNOTSUPP;
++              break;
++      default:
++              /* Unknown event: keep the initial -EINVAL. */
++              break;
++      }
++#endif
++
++      return ret;
++}
++
++/*
++ * Operations table published through processor_extcntl_ops by
++ * init_extcntl().  Only .hotplug is set statically; the pm_ops[] entries are
++ * filled in by init_extcntl() according to the PM bits in
++ * xen_start_info->flags.
++ */
++static struct processor_extcntl_ops xen_extcntl_ops = {
++      .hotplug                = xen_hotplug_notifier,
++};
++
++/*
++ * init_extcntl - publish the Xen external-control operations.
++ *
++ * Reads the PM bits from xen_start_info->flags (SIF_PM_MASK, shifted down by
++ * 8), installs the matching notifier for each of the CX, PX and TX bits and
++ * points processor_extcntl_ops at xen_extcntl_ops.  Without
++ * CONFIG_ACPI_HOTPLUG_CPU nothing is registered when no PM bit is set.
++ * Always returns 0.
++ */
++static int __init init_extcntl(void)
++{
++      const unsigned int pm_caps =
++              (xen_start_info->flags & SIF_PM_MASK) >> 8;
++
++#ifndef CONFIG_ACPI_HOTPLUG_CPU
++      /* No hotplug handler to offer and no PM bits: leave the ops pointer alone. */
++      if (pm_caps == 0)
++              return 0;
++#endif
++
++      if ((pm_caps & XEN_PROCESSOR_PM_CX) != 0)
++              xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
++
++      if ((pm_caps & XEN_PROCESSOR_PM_PX) != 0)
++              xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
++
++      if ((pm_caps & XEN_PROCESSOR_PM_TX) != 0)
++              xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
++
++      processor_extcntl_ops = &xen_extcntl_ops;
++      return 0;
++}
++arch_initcall(init_extcntl);
++
++/*
++ * cpufreq_quick_get - ask Xen for the frequency of @cpu.
++ *
++ * Issues a XENPF_get_cpu_freq platform hypercall with @cpu as the vcpu
++ * number.  Returns the frequency the hypervisor filled in, or 0 when the
++ * hypercall returns non-zero.
++ */
++unsigned int cpufreq_quick_get(unsigned int cpu)
++{
++      xen_platform_op_t op = {
++              .cmd                    = XENPF_get_cpu_freq,
++              .interface_version      = XENPF_INTERFACE_VERSION,
++              .u.get_cpu_freq.vcpu    = cpu
++      };
++
++      if (HYPERVISOR_platform_op(&op) != 0)
++              return 0;
++
++      return op.u.get_cpu_freq.freq;
++}
diff --cc arch/x86/kernel/amd_nb.c

index 4c39baa,4c39baa..458a9cd
--- 1/arch/x86/kernel/amd_nb.c
--- 2/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@@ -15,6 -15,6 +15,10 @@@ static u32 *flush_words
   const struct pci_device_id amd_nb_misc_ids[] = {
         { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
         { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
++#ifdef CONFIG_XEN
++      { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
++      { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, /* Fam12, Fam14 */
++#endif
         { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
         {}
   };
@@@ -119,6 -119,6 +123,7 @@@ bool __init early_is_amd_nb(u32 device
         return false;
   }
   
++#ifndef CONFIG_XEN
   int amd_get_subcaches(int cpu)
   {
         struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
@@@ -177,6 -177,6 +182,7 @@@ int amd_set_subcaches(int cpu, int mask
   
         return 0;
   }
++#endif
   
   static int amd_cache_gart(void)
   {
diff --cc arch/x86/kernel/apic/Makefile

index 767fd04,3966b56..4876eb4
--- 1/arch/x86/kernel/apic/Makefile
--- 2/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@@ -9,18 -9,13 +9,17 @@@ obj-$(CONFIG_X86_IO_APIC)     += io_apic.
   obj-$(CONFIG_SMP)             += ipi.o
   
   ifeq ($(CONFIG_X86_64),y)
- # APIC probe will depend on the listing order here
- obj-$(CONFIG_X86_UV)          += x2apic_uv_x.o
- obj-$(CONFIG_X86_X2APIC)      += x2apic_phys.o
- obj-$(CONFIG_X86_X2APIC)      += x2apic_cluster.o
   obj-y                         += apic_flat_64.o
+ obj-$(CONFIG_X86_X2APIC)      += x2apic_cluster.o
+ obj-$(CONFIG_X86_X2APIC)      += x2apic_phys.o
+ obj-$(CONFIG_X86_UV)          += x2apic_uv_x.o
   endif
   
- # APIC probe will depend on the listing order here
- obj-$(CONFIG_X86_NUMAQ)               += numaq_32.o
- obj-$(CONFIG_X86_SUMMIT)      += summit_32.o
   obj-$(CONFIG_X86_BIGSMP)      += bigsmp_32.o
+ obj-$(CONFIG_X86_NUMAQ)               += numaq_32.o
   obj-$(CONFIG_X86_ES7000)      += es7000_32.o
+ obj-$(CONFIG_X86_SUMMIT)      += summit_32.o
++
++probe_64-$(CONFIG_XEN)                := probe_32.o
+ +
- # For 32bit, probe_32 need to be listed last
- obj-$(CONFIG_X86_LOCAL_APIC)  += probe_$(BITS).o
++disabled-obj-$(CONFIG_XEN)    := apic_flat_$(BITS).o apic_noop.o
diff --cc arch/x86/kernel/apic/apic-xen.c

index 0000000,0000000..6b0603c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/apic/apic-xen.c
@@@ -1,0 -1,0 +1,69 @@@
++/*
++ *    Local APIC handling stubs
++ */
++
++#include <linux/init.h>
++#include <linux/interrupt.h>
++
++#include <asm/smp.h>
++#include <asm/proto.h>
++#include <asm/apic.h>
++
++unsigned int num_processors;
++
++/*
++ * Debug level, exported for io_apic.c
++ */
++unsigned int apic_verbosity;
++
++/* Have we found an MP table */
++int smp_found_config;
++
++/*
++ * Handler for the "apic" early parameter.
++ *
++ * "debug" and "verbose" select the matching apic_verbosity level; any
++ * other value is rejected with -EINVAL after a warning.  Without an
++ * argument, 64-bit kernels clear skip_ioapic_setup and succeed, while
++ * other builds return -EINVAL.
++ */
++static int __init apic_set_verbosity(char *arg)
++{
++      if (arg == NULL) {
++#ifdef CONFIG_X86_64
++              skip_ioapic_setup = 0;
++              return 0;
++#endif
++              return -EINVAL;
++      }
++
++      if (!strcmp("debug", arg)) {
++              apic_verbosity = APIC_DEBUG;
++              return 0;
++      }
++
++      if (!strcmp("verbose", arg)) {
++              apic_verbosity = APIC_VERBOSE;
++              return 0;
++      }
++
++      pr_warning("APIC Verbosity level %s not recognised"
++              " use apic=verbose or apic=debug\n", arg);
++      return -EINVAL;
++}
++early_param("apic", apic_set_verbosity);
++
++int setup_profiling_timer(unsigned int multiplier)
++{
++      return -EINVAL; /* stub: the multiplier is ignored and the call always fails */
++}
++
++#ifndef CONFIG_SMP
++/*
++ * This initializes the IO-APIC and APIC hardware if this is
++ * a UP kernel.
++ *
++ * In this stub only the IO-APIC part is present (and only with
++ * CONFIG_X86_IO_APIC): setup_IO_APIC() is called when an MP table was
++ * found, IO-APIC setup was not disabled and at least one IO-APIC is
++ * registered.  On 64-bit, nr_ioapics is reset to 0 otherwise.
++ * Always returns 0.
++ */
++int __init APIC_init_uniprocessor(void)
++{
++#ifdef CONFIG_X86_IO_APIC
++      if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
++              setup_IO_APIC();
++# ifdef CONFIG_X86_64
++      else
++              nr_ioapics = 0;
++# endif
++#endif
++
++      return 0;
++}
++#endif
diff --cc arch/x86/kernel/apic/bigsmp_32.c
Simple merge
diff --cc arch/x86/kernel/apic/hw_nmi.c

index d5e57db,5260fe9..ea81cba
--- 1/arch/x86/kernel/apic/hw_nmi.c
--- 2/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@@ -26,6 -26,6 +26,10 @@@ u64 hw_nmi_get_sample_period(void
   #endif
   
   #ifdef arch_trigger_all_cpu_backtrace
++#ifdef CONFIG_XEN
++#include <asm/ipi.h>
++#endif
++
   /* For reliability, we're prepared to waste bits here. */
   static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
   
@@@ -46,7 -46,7 +50,11 @@@ void arch_trigger_all_cpu_backtrace(voi
         cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
   
         printk(KERN_INFO "sending NMI to all CPUs:\n");
++#ifndef CONFIG_XEN
         apic->send_IPI_all(NMI_VECTOR);
++#else /* this works even without CONFIG_X86_LOCAL_APIC */
++      xen_send_IPI_all(NMI_VECTOR);
++#endif
   
         /* Wait for up to 10 seconds for all CPUs to do the backtrace */
         for (i = 0; i < 10 * 1000; i++) {
diff --cc arch/x86/kernel/apic/io_apic-xen.c

index 0000000,0000000..3737c95

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/apic/io_apic-xen.c
@@@ -1,0 -1,0 +1,4206 @@@
++/*
++ *    Intel IO-APIC support for multi-Pentium hosts.
++ *
++ *    Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
++ *
++ *    Many thanks to Stig Venaas for trying out countless experimental
++ *    patches and reporting/debugging problems patiently!
++ *
++ *    (c) 1999, Multiple IO-APIC support, developed by
++ *    Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
++ *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
++ *    further tested and cleaned up by Zach Brown <zab@redhat.com>
++ *    and Ingo Molnar <mingo@redhat.com>
++ *
++ *    Fixes
++ *    Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
++ *                                    thanks to Eric Gilmore
++ *                                    and Rolf G. Tews
++ *                                    for testing these extensively
++ *    Paul Diefenbaugh        :       Added full ACPI support
++ */
++
++#include <linux/mm.h>
++#include <linux/interrupt.h>
++#include <linux/init.h>
++#include <linux/delay.h>
++#include <linux/sched.h>
++#include <linux/pci.h>
++#include <linux/mc146818rtc.h>
++#include <linux/compiler.h>
++#include <linux/acpi.h>
++#include <linux/module.h>
++#include <linux/syscore_ops.h>
++#include <linux/freezer.h>
++#include <linux/kthread.h>
++#include <linux/jiffies.h>    /* time_after() */
++#include <linux/slab.h>
++#ifdef CONFIG_ACPI
++#include <acpi/acpi_bus.h>
++#endif
++#include <linux/bootmem.h>
++
++#include <asm/idle.h>
++#include <asm/io.h>
++#include <asm/smp.h>
++#include <asm/cpu.h>
++#include <asm/desc.h>
++#include <asm/proto.h>
++#include <asm/acpi.h>
++#include <asm/dma.h>
++#include <asm/timer.h>
++#include <asm/i8259.h>
++#include <asm/setup.h>
++#include <asm/hw_irq.h>
++
++#include <asm/apic.h>
++
++#ifdef CONFIG_XEN
++#include <xen/interface/xen.h>
++#include <xen/interface/physdev.h>
++#include <xen/evtchn.h>
++
++/*
++ * Fake i8259: a minimal legacy_pic that only supplies the legacy IRQ
++ * count and a make_irq hook, which clears the IRQ's bit in io_apic_irqs.
++ * The legacy_pic macro below redirects uses of legacy_pic in this file
++ * to the fake.
++ */
++static void make_8259A_irq(unsigned int irq) { io_apic_irqs &= ~(1UL<<irq); }
++static const struct legacy_pic xen_legacy_pic = {
++      .nr_legacy_irqs = NR_IRQS_LEGACY,
++      .make_irq = make_8259A_irq
++};
++#define legacy_pic (&xen_legacy_pic)
++
++unsigned long io_apic_irqs;
++#endif /* CONFIG_XEN */
++
++#define __apicdebuginit(type) static type __init
++#define for_each_irq_pin(entry, head) \
++      for (entry = head; entry; entry = entry->next)
++
++/*
++ *      Is the SiS APIC rmw bug present ?
++ *      -1 = don't know, 0 = no, 1 = yes
++ */
++int sis_apic_bug = -1;
++
++static DEFINE_RAW_SPINLOCK(ioapic_lock);
++#ifndef CONFIG_XEN
++static DEFINE_RAW_SPINLOCK(vector_lock);
++#endif
++
++/*
++ * # of IRQ routing registers
++ */
++int nr_ioapic_registers[MAX_IO_APICS];
++
++/* I/O APIC entries */
++struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
++int nr_ioapics;
++
++/* IO APIC gsi routing info */
++struct mp_ioapic_gsi  mp_gsi_routing[MAX_IO_APICS];
++
++/* The one past the highest gsi number used */
++u32 gsi_top;
++
++/* MP IRQ source entries */
++struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
++
++/* # of MP IRQ source entries */
++int mp_irq_entries;
++
++#ifndef CONFIG_XEN
++/* GSI interrupts */
++static int nr_irqs_gsi = NR_IRQS_LEGACY;
++#endif
++
++#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
++int mp_bus_id_to_type[MAX_MP_BUSSES];
++#endif
++
++DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
++
++int skip_ioapic_setup;
++
++/**
++ * _disable_ioapic_support() - disables ioapic support at runtime
++ *
++ * Sets skip_ioapic_setup and, when CONFIG_PCI is enabled, also sets
++ * noioapicquirk to 1 and noioapicreroute to -1.
++ */
++static void __init _disable_ioapic_support(void)
++{
++#ifdef CONFIG_PCI
++      noioapicquirk = 1;
++      noioapicreroute = -1;
++#endif
++      skip_ioapic_setup = 1;
++}
++
++static int __init parse_noapic(char *str)
++{
++      /* "noapic" early parameter: disable IO-APIC; str is not examined */
++      _disable_ioapic_support();
++      return 0;
++}
++early_param("noapic", parse_noapic);
++
++static int io_apic_setup_irq_pin(unsigned int irq, int node,
++                               struct io_apic_irq_attr *attr);
++
++/*
++ * Record an MP interrupt source entry in mp_irqs[].
++ *
++ * Called from the mpparse/acpi/sfi code.  An entry that is byte-for-byte
++ * identical to one already stored is ignored.  Panics once the table
++ * holds MAX_IRQ_SOURCES entries.
++ */
++void mp_save_irq(struct mpc_intsrc *m)
++{
++      int idx = 0;
++
++      apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
++              " IRQ %02x, APIC ID %x, APIC INT %02x\n",
++              m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
++              m->srcbusirq, m->dstapic, m->dstirq);
++
++      /* Nothing to do when the very same entry is already recorded. */
++      while (idx < mp_irq_entries) {
++              if (memcmp(&mp_irqs[idx], m, sizeof(*m)) == 0)
++                      return;
++              idx++;
++      }
++
++      memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
++      mp_irq_entries++;
++      if (mp_irq_entries == MAX_IRQ_SOURCES)
++              panic("Max # of irq sources exceeded!!\n");
++}
++
++#ifndef CONFIG_XEN
++struct irq_pin_list {
++      int apic, pin;                  /* IO-APIC index and pin number on that IO-APIC */
++      struct irq_pin_list *next;      /* next list entry; NULL ends the list */
++};
++
++static struct irq_pin_list *alloc_irq_pin_list(int node)
++{
++      return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); /* zeroed, so next starts out NULL */
++}
++
++
++/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
++#ifdef CONFIG_SPARSE_IRQ
++static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
++#else
++static struct irq_cfg irq_cfgx[NR_IRQS];
++#endif
++
++int __init arch_early_irq_init(void)
++{
++      struct irq_cfg *cfg;
++      int count, node, i;
++
++      if (!legacy_pic->nr_legacy_irqs) {
++              nr_irqs_gsi = 0;
++              io_apic_irqs = ~0UL; /* no legacy IRQs: set every bit in io_apic_irqs */
++      }
++
++      cfg = irq_cfgx;
++      count = ARRAY_SIZE(irq_cfgx);
++      node = cpu_to_node(0); /* the cpumasks below are allocated on CPU 0's node */
++
++      /* Make sure the legacy interrupts are marked in the bitmap */
++      irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
++
++      for (i = 0; i < count; i++) {
++              irq_set_chip_data(i, &cfg[i]); /* static irq_cfgx[] slot becomes the IRQ's chip data */
++              zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); /* NOTE(review): result is not checked */
++              zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* NOTE(review): result is not checked */
++              /*
++               * For legacy IRQ's, start with assigning irq0 to irq15 to
++               * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
++               */
++              if (i < legacy_pic->nr_legacy_irqs) {
++                      cfg[i].vector = IRQ0_VECTOR + i;
++                      cpumask_set_cpu(0, cfg[i].domain);
++              }
++      }
++
++      return 0;
++}
++
++#ifdef CONFIG_SPARSE_IRQ
++static struct irq_cfg *irq_cfg(unsigned int irq)
++{
++      return irq_get_chip_data(irq);
++}
++
++/*
++ * Allocate a zeroed irq_cfg together with its two cpumasks on the given
++ * node.  Returns NULL when any of the three allocations fails; whatever
++ * was already allocated is released first.  The irq argument is unused.
++ */
++static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
++{
++      struct irq_cfg *new_cfg;
++
++      new_cfg = kzalloc_node(sizeof(*new_cfg), GFP_KERNEL, node);
++      if (!new_cfg)
++              return NULL;
++
++      if (zalloc_cpumask_var_node(&new_cfg->domain, GFP_KERNEL, node)) {
++              if (zalloc_cpumask_var_node(&new_cfg->old_domain,
++                                          GFP_KERNEL, node))
++                      return new_cfg;
++              /* second mask failed: give the first one back */
++              free_cpumask_var(new_cfg->domain);
++      }
++
++      kfree(new_cfg);
++      return NULL;
++}
++
++/*
++ * Detach cfg from IRQ 'at' (chip data is reset to NULL) and free it along
++ * with both of its cpumasks.  A NULL cfg is ignored.
++ */
++static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
++{
++      if (cfg) {
++              irq_set_chip_data(at, NULL);
++              free_cpumask_var(cfg->domain);
++              free_cpumask_var(cfg->old_domain);
++              kfree(cfg);
++      }
++}
++
++#else
++
++/* Look up the irq_cfgx[] slot of an IRQ; NULL when irq is not below nr_irqs. */
++struct irq_cfg *irq_cfg(unsigned int irq)
++{
++      if (irq >= nr_irqs)
++              return NULL;
++
++      return &irq_cfgx[irq];
++}
++
++static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
++{
++      return irq_cfgx + irq;
++}
++
++static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
++
++#endif
++
++/*
++ * Make sure IRQ 'at' has both a descriptor and an irq_cfg.
++ *
++ * An already existing descriptor (-EEXIST) is accepted; if it carries
++ * chip data, that is returned as the cfg.  Otherwise a new cfg is
++ * allocated and installed as chip data.  Returns NULL when the descriptor
++ * cannot be allocated, or when the cfg allocation fails (in which case the
++ * descriptor is freed).
++ */
++static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
++{
++      struct irq_cfg *cfg;
++      int res;
++
++      res = irq_alloc_desc_at(at, node);
++      if (res < 0 && res != -EEXIST)
++              return NULL;
++
++      if (res < 0) {
++              /* the descriptor was already there; reuse its cfg if set */
++              cfg = irq_get_chip_data(at);
++              if (cfg)
++                      return cfg;
++      }
++
++      cfg = alloc_irq_cfg(at, node);
++      if (!cfg) {
++              irq_free_desc(at);
++              return NULL;
++      }
++
++      irq_set_chip_data(at, cfg);
++      return cfg;
++}
++
++static int alloc_irq_from(unsigned int from, int node)
++{
++      return irq_alloc_desc_from(from, node);
++}
++
++static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
++{
++      free_irq_cfg(at, cfg);
++      irq_free_desc(at);
++}
++
++struct io_apic {
++      unsigned int index;             /* offset 0x00 (assuming 4-byte unsigned int) */
++      unsigned int unused[3];
++      unsigned int data;              /* offset 0x10 */
++      unsigned int unused2[11];
++      unsigned int eoi;               /* offset 0x40 */
++};
++
++static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
++{
++      return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
++              + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
++}
++
++static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
++{
++      struct io_apic __iomem *io_apic = io_apic_base(apic);
++      writel(vector, &io_apic->eoi);
++}
++#endif /* !CONFIG_XEN */
++
++static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
++{
++#ifndef CONFIG_XEN
++      struct io_apic __iomem *io_apic = io_apic_base(apic);
++      writel(reg, &io_apic->index); /* select the register, then read it back through the data field */
++      return readl(&io_apic->data);
++#else
++      struct physdev_apic apic_op;
++      int ret;
++
++      apic_op.apic_physbase = mp_ioapics[apic].apicaddr;
++      apic_op.reg = reg;
++      ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); /* the read goes through a hypercall */
++      if (ret)
++              return ret; /* NOTE(review): on failure the error code is returned in place of a register value */
++      return apic_op.value;
++#endif
++}
++
++static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
++{
++#ifndef CONFIG_XEN
++      struct io_apic __iomem *io_apic = io_apic_base(apic);
++      writel(reg, &io_apic->index); /* select the register, then store the value through the data field */
++      writel(value, &io_apic->data);
++#else
++      struct physdev_apic apic_op;
++
++      apic_op.apic_physbase = mp_ioapics[apic].apicaddr;
++      apic_op.reg = reg;
++      apic_op.value = value;
++      WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); /* a failed hypercall only triggers a warning */
++#endif
++}
++
++#ifdef CONFIG_XEN
++#define io_apic_modify io_apic_write
++#else
++/*
++ * Re-write a value: to be used for read-modify-write
++ * cycles where the read already set up the index register.
++ *
++ * Older SiS APIC requires we rewrite the index register.  sis_apic_bug
++ * is tested for non-zero, so the index is also rewritten while the bug
++ * state is still unknown (-1).
++ */
++static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
++{
++      struct io_apic __iomem *io_apic = io_apic_base(apic);
++
++      if (sis_apic_bug)
++              writel(reg, &io_apic->index);
++      writel(value, &io_apic->data);
++}
++
++/*
++ * Report whether any pin attached to cfg has the remote IRR bit set in
++ * the low word of its redirection entry.  The pin list is walked and the
++ * registers are read under ioapic_lock.
++ */
++static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
++{
++      struct irq_pin_list *entry;
++      unsigned long flags;
++      bool pending = false;
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      for_each_irq_pin(entry, cfg->irq_2_pin) {
++              unsigned int reg;
++
++              reg = io_apic_read(entry->apic, 0x10 + entry->pin*2);
++              /* Is the remote IRR bit set? */
++              if (reg & IO_APIC_REDIR_REMOTE_IRR) {
++                      pending = true;
++                      break;
++              }
++      }
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++      return pending;
++}
++#endif /* CONFIG_XEN */
++
++union entry_union {
++      struct { u32 w1, w2; };
++      struct IO_APIC_route_entry entry;
++};
++
++#ifndef CONFIG_XEN
++static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
++{
++      union entry_union eu;
++      unsigned long flags;
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
++      eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++      return eu.entry;
++}
++#endif
++
++/*
++ * When we write a new IO APIC routing entry, we need to write the high
++ * word first! If the mask bit in the low word is clear, we will enable
++ * the interrupt, and we need to make sure the entry is fully populated
++ * before that happens.
++ */
++static void
++__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
++{
++      union entry_union eu = {{0, 0}};
++
++      eu.entry = e;
++      io_apic_write(apic, 0x11 + 2*pin, eu.w2);
++      io_apic_write(apic, 0x10 + 2*pin, eu.w1);
++}
++
++static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
++{
++      unsigned long flags;
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      __ioapic_write_entry(apic, pin, e);
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++}
++
++#ifndef CONFIG_XEN
++/*
++ * When we mask an IO APIC routing entry, we need to write the low
++ * word first, in order to set the mask bit before we change the
++ * high bits!
++ */
++static void ioapic_mask_entry(int apic, int pin)
++{
++      unsigned long flags;
++      union entry_union eu = { .entry.mask = 1 };
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      io_apic_write(apic, 0x10 + 2*pin, eu.w1);
++      io_apic_write(apic, 0x11 + 2*pin, eu.w2);
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++}
++
++/*
++ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
++ * shared ISA-space IRQs, so we have to support them. We are super
++ * fast in the common case, and fast for shared ISA-space IRQs.
++ */
++static int
++__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
++{
++      struct irq_pin_list **link = &cfg->irq_2_pin;
++      struct irq_pin_list *item;
++
++      /* walk to the tail; an already present (apic, pin) pair is a no-op */
++      while (*link) {
++              if ((*link)->apic == apic && (*link)->pin == pin)
++                      return 0;
++              link = &(*link)->next;
++      }
++
++      item = alloc_irq_pin_list(node);
++      if (!item) {
++              printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
++                              node, apic, pin);
++              return -ENOMEM;
++      }
++
++      item->apic = apic;
++      item->pin = pin;
++
++      /* append: link points at the terminating NULL pointer of the list */
++      *link = item;
++      return 0;
++}
++
++static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
++{
++      if (__add_pin_to_irq_node(cfg, node, apic, pin))
++              panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
++}
++
++/*
++ * Reroute an IRQ to a different pin.
++ */
++static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
++                                         int oldapic, int oldpin,
++                                         int newapic, int newpin)
++{
++      struct irq_pin_list *entry;
++
++      for_each_irq_pin(entry, cfg->irq_2_pin) {
++              if (entry->apic == oldapic && entry->pin == oldpin) {
++                      entry->apic = newapic;
++                      entry->pin = newpin;
++                      /* every one is different, right? */
++                      return;
++              }
++      }
++
++      /* old apic/pin didn't exist, so just add new ones */
++      add_pin_to_irq_node(cfg, node, newapic, newpin);
++}
++
++/*
++ * Read-modify-write the low word of one pin's redirection entry: the
++ * current value is ANDed with mask_and, ORed with mask_or and written
++ * back.  If 'final' is non-NULL it is called on the entry afterwards.
++ */
++static void __io_apic_modify_irq(struct irq_pin_list *entry,
++                               int mask_and, int mask_or,
++                               void (*final)(struct irq_pin_list *entry))
++{
++      unsigned int pin = entry->pin;
++      unsigned int rte_reg = 0x10 + pin * 2;
++      unsigned int val;
++
++      val = io_apic_read(entry->apic, rte_reg);
++      val = (val & mask_and) | mask_or;
++      io_apic_modify(entry->apic, rte_reg, val);
++
++      if (final != NULL)
++              final(entry);
++}
++
++static void io_apic_modify_irq(struct irq_cfg *cfg,
++                             int mask_and, int mask_or,
++                             void (*final)(struct irq_pin_list *entry))
++{
++      struct irq_pin_list *entry;
++
++      for_each_irq_pin(entry, cfg->irq_2_pin)
++              __io_apic_modify_irq(entry, mask_and, mask_or, final);
++}
++
++static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
++{
++      __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
++                           IO_APIC_REDIR_MASKED, NULL);
++}
++
++static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
++{
++      __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
++                           IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
++}
++
++static void io_apic_sync(struct irq_pin_list *entry)
++{
++      /*
++       * Synchronize the IO-APIC and the CPU by doing
++       * a dummy read from the IO-APIC
++       */
++      struct io_apic __iomem *io_apic;
++      io_apic = io_apic_base(entry->apic);
++      readl(&io_apic->data);
++}
++
++static void mask_ioapic(struct irq_cfg *cfg)
++{
++      unsigned long flags;
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++}
++
++static void mask_ioapic_irq(struct irq_data *data)
++{
++      mask_ioapic(data->chip_data);
++}
++
++static void __unmask_ioapic(struct irq_cfg *cfg)
++{
++      io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
++}
++
++static void unmask_ioapic(struct irq_cfg *cfg)
++{
++      unsigned long flags;
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      __unmask_ioapic(cfg);
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++}
++
++static void unmask_ioapic_irq(struct irq_data *data)
++{
++      unmask_ioapic(data->chip_data);
++}
++
++static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
++{
++      struct IO_APIC_route_entry entry;
++
++      /* Check delivery_mode to be sure we're not clearing an SMI pin */
++      entry = ioapic_read_entry(apic, pin);
++      if (entry.delivery_mode == dest_SMI)
++              return;
++      /*
++       * Disable it in the IO-APIC irq-routing table:
++       */
++      ioapic_mask_entry(apic, pin);
++}
++
++static void clear_IO_APIC (void)
++{
++      int apic, pin;
++
++      for (apic = 0; apic < nr_ioapics; apic++)
++              for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
++                      clear_IO_APIC_pin(apic, pin);
++}
++#else
++#define add_pin_to_irq_node(cfg, node, apic, pin)
++#define __add_pin_to_irq_node(cfg, node, apic, pin) 0
++#endif /* !CONFIG_XEN */
++
++#ifdef CONFIG_X86_32
++/*
++ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
++ * specific CPU-side IRQs.
++ */
++
++#define MAX_PIRQS 8
++static int pirq_entries[MAX_PIRQS] = {
++      [0 ... MAX_PIRQS - 1] = -1
++};
++
++/*
++ * "pirq=" command line handler: stores up to MAX_PIRQS IRQ numbers from
++ * the option string in pirq_entries[], filling the table from the last
++ * slot downwards.
++ */
++static int __init ioapic_pirq_setup(char *str)
++{
++      int ints[MAX_PIRQS+1];
++      int n, count;
++
++      get_options(str, ARRAY_SIZE(ints), ints);
++
++      apic_printk(APIC_VERBOSE, KERN_INFO
++                      "PIRQ redirection, working around broken MP-BIOS.\n");
++
++      /* ints[0] is used as the number of values; cap it at MAX_PIRQS */
++      count = (ints[0] < MAX_PIRQS) ? ints[0] : MAX_PIRQS;
++
++      for (n = 0; n < count; n++) {
++              int irq = ints[n + 1];
++
++              apic_printk(APIC_VERBOSE, KERN_DEBUG
++                              "... PIRQ%d -> IRQ %d\n", n, irq);
++              /*
++               * PIRQs are mapped upside down, usually.
++               */
++              pirq_entries[MAX_PIRQS - 1 - n] = irq;
++      }
++      return 1;
++}
++
++__setup("pirq=", ioapic_pirq_setup);
++#endif /* CONFIG_X86_32 */
++
++#ifndef CONFIG_XEN
++/*
++ * Allocate storage for saving the routing entries of every IO-APIC.
++ *
++ * Returns an array of nr_ioapics pointers, each pointing to a zeroed
++ * array of nr_ioapic_registers[apic] route entries, or NULL if any
++ * allocation fails (everything allocated up to that point is freed
++ * first).  Release the result with free_ioapic_entries().
++ */
++struct IO_APIC_route_entry **alloc_ioapic_entries(void)
++{
++      int apic;
++      struct IO_APIC_route_entry **ioapic_entries;
++
++      /* kcalloc() zeroes the memory and checks count * size for overflow */
++      ioapic_entries = kcalloc(nr_ioapics, sizeof(*ioapic_entries),
++                              GFP_KERNEL);
++      if (!ioapic_entries)
++              return NULL;
++
++      for (apic = 0; apic < nr_ioapics; apic++) {
++              ioapic_entries[apic] =
++                      kcalloc(nr_ioapic_registers[apic],
++                              sizeof(*ioapic_entries[apic]), GFP_KERNEL);
++              if (!ioapic_entries[apic])
++                      goto nomem;
++      }
++
++      return ioapic_entries;
++
++nomem:
++      /* undo only the per-APIC arrays that were successfully allocated */
++      while (--apic >= 0)
++              kfree(ioapic_entries[apic]);
++      kfree(ioapic_entries);
++
++      return NULL;
++}
++
++/*
++ * Saves all the IO-APIC RTE's
++ */
++int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
++{
++      int apic, pin;
++
++      if (!ioapic_entries)
++              return -ENOMEM;
++
++      for (apic = 0; apic < nr_ioapics; apic++) {
++              if (!ioapic_entries[apic])
++                      return -ENOMEM;
++
++              for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
++                      ioapic_entries[apic][pin] =
++                              ioapic_read_entry(apic, pin);
++      }
++
++      return 0;
++}
++
++/*
++ * Mask all IO APIC entries.
++ */
++void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
++{
++      int apic, pin;
++
++      if (!ioapic_entries)
++              return;
++
++      for (apic = 0; apic < nr_ioapics; apic++) {
++              if (!ioapic_entries[apic])
++                      break;
++
++              for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
++                      struct IO_APIC_route_entry entry;
++
++                      entry = ioapic_entries[apic][pin];
++                      if (!entry.mask) {
++                              entry.mask = 1;
++                              ioapic_write_entry(apic, pin, entry);
++                      }
++              }
++      }
++}
++
++/*
++ * Restore IO APIC entries which were saved in ioapic_entries.
++ */
++int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
++{
++      int apic, pin;
++
++      if (!ioapic_entries)
++              return -ENOMEM;
++
++      for (apic = 0; apic < nr_ioapics; apic++) {
++              if (!ioapic_entries[apic])
++                      return -ENOMEM;
++
++              for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
++                      ioapic_write_entry(apic, pin,
++                                      ioapic_entries[apic][pin]);
++      }
++      return 0;
++}
++
++/*
++ * Free an array obtained from alloc_ioapic_entries(): every per-APIC
++ * entry array, then the pointer array itself.  A NULL argument is
++ * accepted and ignored, in line with the NULL checks done by the
++ * save/mask/restore helpers.
++ */
++void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
++{
++      int apic;
++
++      if (!ioapic_entries)
++              return;
++
++      for (apic = 0; apic < nr_ioapics; apic++)
++              kfree(ioapic_entries[apic]);
++
++      kfree(ioapic_entries);
++}
++#endif /* !CONFIG_XEN */
++
++/*
++ * Look up the mp_irqs[] index of the entry of the given type that targets
++ * 'pin' on IO-APIC 'apic' (entries addressed to MP_APIC_ALL match any
++ * IO-APIC).  Returns -1 when there is no such entry.
++ */
++static int find_irq_entry(int apic, int pin, int type)
++{
++      int idx;
++
++      for (idx = 0; idx < mp_irq_entries; idx++) {
++              const struct mpc_intsrc *src = &mp_irqs[idx];
++
++              if (src->irqtype != type)
++                      continue;
++              if (src->dstirq != pin)
++                      continue;
++              if (src->dstapic == mp_ioapics[apic].apicid ||
++                  src->dstapic == MP_APIC_ALL)
++                      return idx;
++      }
++
++      return -1;
++}
++
++#ifndef CONFIG_XEN
++/*
++ * Return the destination pin of the first non-PCI-bus entry in mp_irqs[]
++ * whose type and source bus IRQ match, or -1 if none is found.
++ */
++static int __init find_isa_irq_pin(int irq, int type)
++{
++      int idx;
++
++      for (idx = 0; idx < mp_irq_entries; idx++) {
++              const struct mpc_intsrc *src = &mp_irqs[idx];
++
++              if (!test_bit(src->srcbus, mp_bus_not_pci))
++                      continue;
++              if (src->irqtype == type && src->srcbusirq == irq)
++                      return src->dstirq;
++      }
++
++      return -1;
++}
++
++/*
++ * Return the index of the IO-APIC targeted by the first non-PCI-bus entry
++ * in mp_irqs[] whose type and source bus IRQ match.  Returns -1 when no
++ * entry matches or no registered IO-APIC has the entry's APIC id.
++ */
++static int __init find_isa_irq_apic(int irq, int type)
++{
++      int idx, apic;
++
++      for (idx = 0; idx < mp_irq_entries; idx++) {
++              if (test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci) &&
++                  mp_irqs[idx].irqtype == type &&
++                  mp_irqs[idx].srcbusirq == irq)
++                      break;
++      }
++
++      if (idx >= mp_irq_entries)
++              return -1;
++
++      for (apic = 0; apic < nr_ioapics; apic++) {
++              if (mp_ioapics[apic].apicid == mp_irqs[idx].dstapic)
++                      return apic;
++      }
++
++      return -1;
++}
++#endif
++
++#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
++/*
++ * EISA Edge/Level control register, ELCR
++ *
++ * Returns the ELCR bit of a legacy irq, read from I/O port
++ * 0x4d0 + irq / 8.  For any other irq a message is logged and 0 is
++ * returned.
++ */
++static int EISA_ELCR(unsigned int irq)
++{
++      if (irq < legacy_pic->nr_legacy_irqs) {
++              unsigned int port = 0x4d0 + (irq >> 3);
++              return (inb(port) >> (irq & 7)) & 1;
++      }
++      /* irq is unsigned, so it is printed with %u */
++      apic_printk(APIC_VERBOSE, KERN_INFO
++                      "Broken MPtable reports ISA irq %u\n", irq);
++      return 0;
++}
++
++#endif
++
++/* ISA interrupts are always polarity zero edge triggered,
++ * when listed as conforming in the MP table. */
++
++#define default_ISA_trigger(idx)      (0)
++#define default_ISA_polarity(idx)     (0)
++
++/* EISA interrupts are always polarity zero and can be edge or level
++ * trigger depending on the ELCR value.  If an interrupt is listed as
++ * EISA conforming in the MP table, that means its trigger type must
++ * be read in from the ELCR */
++
++#define default_EISA_trigger(idx)     (EISA_ELCR(mp_irqs[idx].srcbusirq))
++#define default_EISA_polarity(idx)    default_ISA_polarity(idx)
++
++/* PCI interrupts are always polarity one level triggered,
++ * when listed as conforming in the MP table. */
++
++#define default_PCI_trigger(idx)      (1)
++#define default_PCI_polarity(idx)     (1)
++
++/* MCA interrupts are always polarity zero level triggered,
++ * when listed as conforming in the MP table. */
++
++#define default_MCA_trigger(idx)      (1)
++#define default_MCA_polarity(idx)     default_ISA_polarity(idx)
++
++/*
++ * Determine IRQ line polarity (high active or low active) of the
++ * mp_irqs[] entry 'idx' from the low two bits of its irqflag.
++ * Returns 0 for high active and 1 for low active.
++ */
++static int irq_polarity(int idx)
++{
++      int bus = mp_irqs[idx].srcbus;
++
++      switch (mp_irqs[idx].irqflag & 3) {
++      case 0: /* conforms, ie. bus-type dependent polarity */
++              if (test_bit(bus, mp_bus_not_pci))
++                      return default_ISA_polarity(idx);
++              return default_PCI_polarity(idx);
++      case 1: /* high active */
++              return 0;
++      case 3: /* low active */
++              return 1;
++      case 2: /* reserved */
++      default: /* invalid */
++              printk(KERN_WARNING "broken BIOS!!\n");
++              return 1;
++      }
++}
++
++static int irq_trigger(int idx)
++{
++      int bus = mp_irqs[idx].srcbus;
++      int trigger;
++
++      /*
++       * Determine IRQ trigger mode (edge or level sensitive):
++       * bits 2-3 of irqflag select it; the result is 0 for edge
++       * and 1 for level.
++       */
++      switch ((mp_irqs[idx].irqflag>>2) & 3)
++      {
++              case 0: /* conforms, ie. bus-type dependent */
++                      if (test_bit(bus, mp_bus_not_pci))
++                              trigger = default_ISA_trigger(idx);
++                      else
++                              trigger = default_PCI_trigger(idx);
++#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
++                      switch (mp_bus_id_to_type[bus]) {
++                              case MP_BUS_ISA: /* ISA pin */
++                              {
++                                      /* set before the switch */
++                                      break;
++                              }
++                              case MP_BUS_EISA: /* EISA pin */
++                              {
++                                      trigger = default_EISA_trigger(idx);
++                                      break;
++                              }
++                              case MP_BUS_PCI: /* PCI pin */
++                              {
++                                      /* set before the switch */
++                                      break;
++                              }
++                              case MP_BUS_MCA: /* MCA pin */
++                              {
++                                      trigger = default_MCA_trigger(idx);
++                                      break;
++                              }
++                              default:
++                              {
++                                      printk(KERN_WARNING "broken BIOS!!\n");
++                                      trigger = 1;
++                                      break;
++                              }
++                      }
++#endif
++                      break;
++              case 1: /* edge */
++              {
++                      trigger = 0;
++                      break;
++              }
++              case 2: /* reserved */
++              {
++                      printk(KERN_WARNING "broken BIOS!!\n");
++                      trigger = 1;
++                      break;
++              }
++              case 3: /* level */
++              {
++                      trigger = 1;
++                      break;
++              }
++              default: /* invalid; not reachable, the switch value is masked with 3 */
++              {
++                      printk(KERN_WARNING "broken BIOS!!\n");
++                      trigger = 0;
++                      break;
++              }
++      }
++      return trigger;
++}
++
++static int pin_2_irq(int idx, int apic, int pin) /* map MP entry idx on (apic, pin) to an IRQ number */
++{
++      int irq;
++      int bus = mp_irqs[idx].srcbus;
++
++      /*
++       * Debugging check (the entry's dstirq must equal pin); big trouble if this pops up!
++       */
++      if (mp_irqs[idx].dstirq != pin)
++              printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
++
++      if (test_bit(bus, mp_bus_not_pci)) {
++              irq = mp_irqs[idx].srcbusirq; /* non-PCI bus: take the source bus IRQ as is */
++      } else {
++              u32 gsi = mp_gsi_routing[apic].gsi_base + pin; /* GSI = this IO-APIC's base + pin */
++
++              if (gsi >= NR_IRQS_LEGACY)
++                      irq = gsi;
++              else
++                      irq = gsi_top + gsi; /* GSIs below NR_IRQS_LEGACY are moved above gsi_top */
++      }
++
++#ifdef CONFIG_X86_32
++      /*
++       * PCI IRQ command line redirection. Yes, limits are hardcoded.
++       */
++      if ((pin >= 16) && (pin <= 23)) {
++              if (pirq_entries[pin-16] != -1) { /* -1 means no override for this pin */
++                      if (!pirq_entries[pin-16]) {
++                              apic_printk(APIC_VERBOSE, KERN_DEBUG
++                                              "disabling PIRQ%d\n", pin-16);
++                      } else {
++                              irq = pirq_entries[pin-16]; /* override replaces the computed irq */
++                              apic_printk(APIC_VERBOSE, KERN_DEBUG
++                                              "using PIRQ%d -> IRQ %d\n",
++                                              pin-16, irq);
++                      }
++              }
++      }
++#endif
++
++      return irq;
++}
++
++/*
++ * Find a specific PCI IRQ entry; returns the IRQ (filling irq_attr) or -1.
++ * Not an __init, possibly needed by modules
++ */
++int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
++                              struct io_apic_irq_attr *irq_attr)
++{
++      int apic, i, best_guess = -1;
++
++      apic_printk(APIC_DEBUG,
++                  "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
++                  bus, slot, pin);
++      if (test_bit(bus, mp_bus_not_pci)) {
++              apic_printk(APIC_VERBOSE,
++                          "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
++              return -1;
++      }
++      for (i = 0; i < mp_irq_entries; i++) {
++              int lbus = mp_irqs[i].srcbus;
++
++              for (apic = 0; apic < nr_ioapics; apic++) /* find the IO-APIC this entry targets */
++                      if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
++                          mp_irqs[i].dstapic == MP_APIC_ALL)
++                              break; /* NOTE(review): apic == nr_ioapics here if nothing matched */
++
++              if (!test_bit(lbus, mp_bus_not_pci) &&
++                  !mp_irqs[i].irqtype &&
++                  (bus == lbus) &&
++                  (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { /* slot is bits 2..6 of srcbusirq */
++                      int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
++
++                      if (!(apic || IO_APIC_IRQ(irq))) /* on apic 0, skip irqs that fail IO_APIC_IRQ() */
++                              continue;
++
++                      if (pin == (mp_irqs[i].srcbusirq & 3)) { /* pin is the low two bits: exact match */
++                              set_io_apic_irq_attr(irq_attr, apic,
++                                                   mp_irqs[i].dstirq,
++                                                   irq_trigger(i),
++                                                   irq_polarity(i));
++                              return irq;
++                      }
++                      /*
++                       * Use the first all-but-pin matching entry as a
++                       * best-guess fuzzy result for broken mptables.
++                       */
++                      if (best_guess < 0) {
++                              set_io_apic_irq_attr(irq_attr, apic,
++                                                   mp_irqs[i].dstirq,
++                                                   irq_trigger(i),
++                                                   irq_polarity(i));
++                              best_guess = irq;
++                      }
++              }
++      }
++      return best_guess; /* -1 when no entry matched bus and slot */
++}
++EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
++
++#ifndef CONFIG_XEN
++void lock_vector_lock(void)
++{
++      /* Used to ensure that the online set of cpus does not change
++       * during assign_irq_vector.
++       */
++      raw_spin_lock(&vector_lock);
++}
++
++void unlock_vector_lock(void) /* releases vector_lock taken by lock_vector_lock() */
++{
++      raw_spin_unlock(&vector_lock);
++}
++
++static int
++__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
++{
++      /*
++       * NOTE! The local APIC isn't very good at handling
++       * multiple interrupts at the same interrupt level.
++       * As the interrupt level is determined by taking the
++       * vector number and shifting that right by 4, we
++       * want to spread these out a bit so that they don't
++       * all fall in the same interrupt level.
++       *
++       * Also, we've got to be careful not to trash gate
++       * 0x80, because int 0x80 is hm, kind of importantish. ;)
++       */
++      static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
++      static int current_offset = VECTOR_OFFSET_START % 8;
++      unsigned int old_vector;
++      int cpu, err;
++      cpumask_var_t tmp_mask;
++
++      if (cfg->move_in_progress)
++              return -EBUSY;
++
++      if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
++              return -ENOMEM;
++
++      old_vector = cfg->vector;
++      if (old_vector) {
++              cpumask_and(tmp_mask, mask, cpu_online_mask);
++              cpumask_and(tmp_mask, cfg->domain, tmp_mask);
++              if (!cpumask_empty(tmp_mask)) { /* current domain already covers an online cpu in mask */
++                      free_cpumask_var(tmp_mask);
++                      return 0;
++              }
++      }
++
++      /* Only try to allocate irqs on cpus that are online and in mask */
++      err = -ENOSPC;
++      for_each_cpu_and(cpu, mask, cpu_online_mask) {
++              int new_cpu;
++              int vector, offset;
++
++              apic->vector_allocation_domain(cpu, tmp_mask);
++
++              vector = current_vector;
++              offset = current_offset;
++next:
++              vector += 8; /* candidates are stepped 8 apart */
++              if (vector >= first_system_vector) {
++                      /* If out of vectors on large boxen, must share them. */
++                      offset = (offset + 1) % 8;
++                      vector = FIRST_EXTERNAL_VECTOR + offset;
++              }
++              if (unlikely(current_vector == vector)) /* wrapped all the way round: try the next cpu */
++                      continue;
++
++              if (test_bit(vector, used_vectors))
++                      goto next;
++
++#ifdef CONFIG_KDB
++              if (vector == KDBENTER_VECTOR)
++                      goto next;
++#endif        /* CONFIG_KDB */
++              for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
++                      if (per_cpu(vector_irq, new_cpu)[vector] != -1) /* taken on some cpu of the domain */
++                              goto next;
++              /* Found one! */
++              current_vector = vector;
++              current_offset = offset;
++              if (old_vector) {
++                      cfg->move_in_progress = 1; /* remember the old domain for later cleanup */
++                      cpumask_copy(cfg->old_domain, cfg->domain);
++              }
++              for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
++                      per_cpu(vector_irq, new_cpu)[vector] = irq;
++              cfg->vector = vector;
++              cpumask_copy(cfg->domain, tmp_mask);
++              err = 0;
++              break;
++      }
++      free_cpumask_var(tmp_mask);
++      return err; /* 0 on success, -ENOSPC if no cpu had a free vector */
++}
++
++int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
++{
++      int err;
++      unsigned long flags;
++
++      raw_spin_lock_irqsave(&vector_lock, flags); /* __assign_irq_vector() runs under vector_lock */
++      err = __assign_irq_vector(irq, cfg, mask);
++      raw_spin_unlock_irqrestore(&vector_lock, flags);
++      return err;
++}
++
++static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
++{
++      int cpu, vector;
++
++      BUG_ON(!cfg->vector); /* caller must only clear an irq that has a vector */
++
++      vector = cfg->vector;
++      for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
++              per_cpu(vector_irq, cpu)[vector] = -1; /* -1 marks the slot free */
++
++      cfg->vector = 0;
++      cpumask_clear(cfg->domain);
++
++      if (likely(!cfg->move_in_progress))
++              return;
++      for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { /* move pending: also clean old_domain */
++              for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
++                                                              vector++) {
++                      if (per_cpu(vector_irq, cpu)[vector] != irq)
++                              continue;
++                      per_cpu(vector_irq, cpu)[vector] = -1;
++                      break; /* only the first slot holding this irq is cleared per cpu */
++              }
++      }
++      cfg->move_in_progress = 0;
++}
++
++void __setup_vector_irq(int cpu)
++{
++      /* Initialize vector_irq on a new cpu */
++      int irq, vector;
++      struct irq_cfg *cfg;
++
++      /*
++       * vector_lock will make sure that we don't run into irq vector
++       * assignments that might be happening on another cpu in parallel,
++       * while we setup our initial vector to irq mappings.
++       */
++      raw_spin_lock(&vector_lock);
++      /* Mark the inuse vectors */
++      for_each_active_irq(irq) {
++              cfg = irq_get_chip_data(irq);
++              if (!cfg)
++                      continue;
++              /*
++               * If it is a legacy IRQ handled by the legacy PIC, this cpu
++               * will be part of the irq_cfg's domain.
++               */
++              if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
++                      cpumask_set_cpu(cpu, cfg->domain);
++
++              if (!cpumask_test_cpu(cpu, cfg->domain))
++                      continue;
++              vector = cfg->vector;
++              per_cpu(vector_irq, cpu)[vector] = irq; /* this cpu handles irq on that vector */
++      }
++      /* Mark the free vectors */
++      for (vector = 0; vector < NR_VECTORS; ++vector) {
++              irq = per_cpu(vector_irq, cpu)[vector];
++              if (irq < 0) /* already free */
++                      continue;
++
++              cfg = irq_cfg(irq);
++              if (!cpumask_test_cpu(cpu, cfg->domain)) /* stale entry: cpu not in that irq's domain */
++                      per_cpu(vector_irq, cpu)[vector] = -1;
++      }
++      raw_spin_unlock(&vector_lock);
++}
++
++static struct irq_chip ioapic_chip;
++static struct irq_chip ir_ioapic_chip;
++
++#ifdef CONFIG_X86_32
++static inline int IO_APIC_irq_trigger(int irq) /* result of irq_trigger() for the entry mapping to irq */
++{
++      int apic, idx, pin;
++
++      for (apic = 0; apic < nr_ioapics; apic++) { /* scan every pin of every IO-APIC */
++              for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
++                      idx = find_irq_entry(apic, pin, mp_INT);
++                      if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
++                              return irq_trigger(idx);
++              }
++      }
++      /*
++         * nonexistent IRQs default to edge (0)
++         */
++      return 0;
++}
++#else
++static inline int IO_APIC_irq_trigger(int irq) /* non-32-bit build: always reports 1 */
++{
++      return 1;
++}
++#endif
++
++static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
++                               unsigned long trigger)
++{
++      struct irq_chip *chip = &ioapic_chip; /* default chip; replaced below for remapped irqs */
++      irq_flow_handler_t hdl;
++      bool fasteoi;
++
++      if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
++          trigger == IOAPIC_LEVEL) {
++              irq_set_status_flags(irq, IRQ_LEVEL); /* level: flag it and use the fasteoi flow */
++              fasteoi = true;
++      } else {
++              irq_clear_status_flags(irq, IRQ_LEVEL); /* otherwise treated as edge */
++              fasteoi = false;
++      }
++
++      if (irq_remapped(cfg)) {
++              irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
++              chip = &ir_ioapic_chip;
++              fasteoi = trigger != 0; /* remapped: any non-zero trigger value selects fasteoi */
++      }
++
++      hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
++      irq_set_chip_and_handler_name(irq, chip, hdl,
++                                    fasteoi ? "fasteoi" : "edge");
++}
++#else /* !CONFIG_XEN */
++#define __clear_irq_vector(irq, cfg) ((void)0) /* Xen build: expands to a no-op */
++#define ioapic_register_intr(irq, cfg, trigger) evtchn_register_pirq(irq) /* Xen build: cfg and trigger unused */
++#endif
++
++static int setup_ioapic_entry(int apic_id, int irq,
++                            struct IO_APIC_route_entry *entry,
++                            unsigned int destination, int trigger,
++                            int polarity, int vector, int pin)
++{
++      /*
++       * Build the IO-APIC irq-routing table entry in *entry (starts zeroed):
++       */
++      memset(entry,0,sizeof(*entry));
++
++#ifndef CONFIG_XEN
++      if (intr_remapping_enabled) {
++              struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
++              struct irte irte;
++              struct IR_IO_APIC_route_entry *ir_entry =
++                      (struct IR_IO_APIC_route_entry *) entry; /* same storage, remapped-format view */
++              int index;
++
++              if (!iommu)
++                      panic("No mapping iommu for ioapic %d\n", apic_id);
++
++              index = alloc_irte(iommu, irq, 1);
++              if (index < 0)
++                      panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
++
++              prepare_irte(&irte, vector, destination);
++
++              /* Set source-id of interrupt request */
++              set_ioapic_sid(&irte, apic_id);
++
++              modify_irte(irq, &irte);
++
++              ir_entry->index2 = (index >> 15) & 0x1; /* bit 15 of the IRTE index */
++              ir_entry->zero = 0;
++              ir_entry->format = 1;
++              ir_entry->index = (index & 0x7fff); /* low 15 bits of the IRTE index */
++              /*
++               * IO-APIC RTE will be configured with virtual vector.
++               * irq handler will do the explicit EOI to the io-apic.
++               */
++              ir_entry->vector = pin;
++      } else
++#endif
++      {
++              entry->delivery_mode = apic->irq_delivery_mode;
++              entry->dest_mode = apic->irq_dest_mode;
++              entry->dest = destination;
++              entry->vector = vector;
++      }
++
++      entry->mask = 0;                                /* enable IRQ */
++      entry->trigger = trigger;
++      entry->polarity = polarity;
++
++      /* Mask level triggered irqs.
++       * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
++       */
++      if (trigger)
++              entry->mask = 1;
++      return 0; /* every path that returns yields 0; failures above panic instead */
++}
++
++static struct { /* per-IO-APIC bitmap of pins, tested in setup_ioapic_irq() on Xen */
++      DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
++} mp_ioapic_routing[MAX_IO_APICS];
++
++static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
++                           struct irq_cfg *cfg, int trigger, int polarity)
++{
++      struct IO_APIC_route_entry entry;
++      unsigned int dest;
++
++      if (!IO_APIC_IRQ(irq))
++              return;
++#ifndef CONFIG_XEN
++      /*
++       * For legacy irqs, cfg->domain starts with cpu 0 for legacy
++       * controllers like 8259. Now that IO-APIC can handle this irq, update
++       * the cfg->domain.
++       */
++      if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
++              apic->vector_allocation_domain(0, cfg->domain);
++#else
++      /*
++       * For legacy IRQs we may get here before trigger mode and polarity
++       * get obtained, but Xen refuses to set those through
++       * PHYSDEVOP_setup_gsi more than once (perhaps even at all).
++       */
++      if (irq >= legacy_pic->nr_legacy_irqs
++          || test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
++              struct physdev_setup_gsi setup_gsi = {
++                      .gsi = irq,
++                      .triggering = trigger,
++                      .polarity = polarity
++              };
++              struct physdev_map_pirq map_pirq = {
++                      .domid = DOMID_SELF,
++                      .type = MAP_PIRQ_TYPE_GSI,
++                      .index = irq,
++                      .pirq = irq
++              };
++
++              switch (HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi,
++                                            &setup_gsi)) {
++              case -EEXIST:
++                      if (irq < legacy_pic->nr_legacy_irqs) /* -EEXIST is accepted only for non-legacy irqs */
++                              break;
++                      /* fall through */
++              case 0:
++                      evtchn_register_pirq(irq);
++                      if (HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq,
++                                                &map_pirq) == 0) {
++                              /* fake (for init_IO_APIC_traps()): */
++                              cfg->vector = irq;
++                              return;
++                      }
++              }
++      }
++#endif
++
++      if (assign_irq_vector(irq, cfg, apic->target_cpus())) /* non-zero means no vector: give up */
++              return;
++
++#ifndef CONFIG_XEN
++      dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
++#else
++      dest = 0; /* meaningless */
++#endif
++
++      apic_printk(APIC_VERBOSE,KERN_DEBUG
++                  "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
++                  "IRQ %d Mode:%i Active:%i)\n",
++                  apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
++                  irq, trigger, polarity);
++
++
++      if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
++                             dest, trigger, polarity, cfg->vector, pin)) {
++              printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n", /* NOTE(review): no KERN_ level */
++                     mp_ioapics[apic_id].apicid, pin);
++              __clear_irq_vector(irq, cfg);
++              return;
++      }
++
++      ioapic_register_intr(irq, cfg, trigger);
++#ifndef CONFIG_XEN
++      if (irq < legacy_pic->nr_legacy_irqs)
++              legacy_pic->mask(irq); /* mask the irq at the legacy PIC before the entry is written */
++#endif
++
++      ioapic_write_entry(apic_id, pin, entry);
++}
++
++static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
++{
++      if (idx != -1) /* a valid MP entry index means the pin is connected */
++              return false;
++
++      apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
++                  mp_ioapics[apic_id].apicid, pin);
++      return true;
++}
++
++static void __init __io_apic_setup_irqs(unsigned int apic_id)
++{
++      int idx, node = cpu_to_node(0);
++      struct io_apic_irq_attr attr;
++      unsigned int pin, irq;
++
++      for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { /* every pin of this IO-APIC */
++              idx = find_irq_entry(apic_id, pin, mp_INT);
++              if (io_apic_pin_not_connected(idx, apic_id, pin))
++                      continue;
++
++              irq = pin_2_irq(idx, apic_id, pin);
++
++              if ((apic_id > 0) && (irq > 16)) /* secondary IO-APICs: only irqs up to 16 are set up here */
++                      continue;
++
++#ifdef CONFIG_XEN
++              if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) /* outside the pirq range */
++                      continue;
++#else
++              /*
++               * Skip the timer IRQ if there's a quirk handler
++               * installed and if it returns 1:
++               */
++              if (apic->multi_timer_check &&
++                  apic->multi_timer_check(apic_id, irq))
++                      continue;
++#endif
++
++              set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
++                                   irq_polarity(idx));
++
++              io_apic_setup_irq_pin(irq, node, &attr);
++      }
++}
++
++static void __init setup_IO_APIC_irqs(void)
++{
++      unsigned int apic_id;
++
++      apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
++
++      for (apic_id = 0; apic_id < nr_ioapics; apic_id++) /* set up the pins of each IO-APIC in turn */
++              __io_apic_setup_irqs(apic_id);
++}
++
++/*
++ * For a GSI that is not on the first IO-APIC
++ * but cannot use acpi_register_gsi(),
++ * like some special SCI in IBM x3330
++ */
++void setup_IO_APIC_irq_extra(u32 gsi)
++{
++      int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
++      struct io_apic_irq_attr attr;
++
++      /*
++       * Convert 'gsi' to 'ioapic.pin'.
++       */
++      apic_id = mp_find_ioapic(gsi);
++      if (apic_id < 0) /* no IO-APIC found for this gsi */
++              return;
++
++      pin = mp_find_ioapic_pin(apic_id, gsi);
++      idx = find_irq_entry(apic_id, pin, mp_INT);
++      if (idx == -1) /* no MP table entry for that pin */
++              return;
++
++      irq = pin_2_irq(idx, apic_id, pin);
++#ifdef CONFIG_XEN
++      if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
++              return;
++#endif
++
++      /* Only handle the non legacy irqs on secondary ioapics */
++      if (apic_id == 0 || irq < NR_IRQS_LEGACY)
++              return;
++
++      set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
++                           irq_polarity(idx));
++
++      io_apic_setup_irq_pin_once(irq, node, &attr);
++}
++
++#ifndef CONFIG_XEN
++/*
++ * Set up the timer pin, possibly with the 8259A-master behind.
++ */
++static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
++                                      int vector)
++{
++      struct IO_APIC_route_entry entry;
++
++      if (intr_remapping_enabled) /* nothing is programmed here when remapping is enabled */
++              return;
++
++      memset(&entry, 0, sizeof(entry));
++
++      /*
++       * We use logical delivery to get the timer IRQ
++       * to the first CPU.
++       */
++      entry.dest_mode = apic->irq_dest_mode;
++      entry.mask = 0;                 /* don't mask IRQ for edge */
++      entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
++      entry.delivery_mode = apic->irq_delivery_mode;
++      entry.polarity = 0;
++      entry.trigger = 0; /* 0 = edge, matching the handle_edge_irq handler below */
++      entry.vector = vector;
++
++      /*
++       * The timer IRQ doesn't have to know that behind the
++       * scene we may have a 8259A-master in AEOI mode ...
++       */
++      irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
++                                    "edge");
++
++      /*
++       * Add it to the IO-APIC irq-routing table:
++       */
++      ioapic_write_entry(apic_id, pin, entry);
++}
++
++
++__apicdebuginit(void) print_IO_APIC(void) /* dump registers and redirection table of every IO-APIC */
++{
++      int apic, i;
++      union IO_APIC_reg_00 reg_00;
++      union IO_APIC_reg_01 reg_01;
++      union IO_APIC_reg_02 reg_02;
++      union IO_APIC_reg_03 reg_03;
++      unsigned long flags;
++      struct irq_cfg *cfg;
++      unsigned int irq;
++
++      printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
++      for (i = 0; i < nr_ioapics; i++)
++              printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
++                     mp_ioapics[i].apicid, nr_ioapic_registers[i]);
++
++      /*
++       * We are a bit conservative about what we expect.  We have to
++       * know about every hardware change ASAP.
++       */
++      printk(KERN_INFO "testing the IO APIC.......................\n");
++
++      for (apic = 0; apic < nr_ioapics; apic++) {
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags); /* register reads are done under ioapic_lock */
++      reg_00.raw = io_apic_read(apic, 0);
++      reg_01.raw = io_apic_read(apic, 1);
++      if (reg_01.bits.version >= 0x10) /* reg_02 is only read for version >= 0x10 */
++              reg_02.raw = io_apic_read(apic, 2);
++      if (reg_01.bits.version >= 0x20) /* reg_03 is only read for version >= 0x20 */
++              reg_03.raw = io_apic_read(apic, 3);
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++      printk("\n");
++      printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
++      printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
++      printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
++      printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
++      printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
++
++      printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01); /* NOTE(review): siblings print .raw */
++      printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
++
++      printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
++      printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
++
++      /*
++       * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
++       * but the value of reg_02 is read as the previous read register
++       * value, so ignore it if reg_02 == reg_01.
++       */
++      if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
++              printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
++              printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
++      }
++
++      /*
++       * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
++       * or reg_03, but the value of reg_0[23] is read as the previous read
++       * register value, so ignore it if reg_03 == reg_0[12].
++       */
++      if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
++          reg_03.raw != reg_01.raw) {
++              printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
++              printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
++      }
++
++      printk(KERN_DEBUG ".... IRQ redirection table:\n");
++
++      printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
++                        " Stat Dmod Deli Vect:\n");
++
++      for (i = 0; i <= reg_01.bits.entries; i++) { /* inclusive: 'entries' is printed above as the max entry */
++              struct IO_APIC_route_entry entry;
++
++              entry = ioapic_read_entry(apic, i);
++
++              printk(KERN_DEBUG " %02x %03X ",
++                      i,
++                      entry.dest
++              );
++
++              printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
++                      entry.mask,
++                      entry.trigger,
++                      entry.irr,
++                      entry.polarity,
++                      entry.delivery_status,
++                      entry.dest_mode,
++                      entry.delivery_mode,
++                      entry.vector
++              );
++      }
++      }
++      printk(KERN_DEBUG "IRQ to pin mappings:\n");
++      for_each_active_irq(irq) {
++              struct irq_pin_list *entry;
++
++              cfg = irq_get_chip_data(irq);
++              if (!cfg)
++                      continue;
++              entry = cfg->irq_2_pin;
++              if (!entry) /* irq has no pins attached: nothing to print */
++                      continue;
++              printk(KERN_DEBUG "IRQ%d ", irq);
++              for_each_irq_pin(entry, cfg->irq_2_pin)
++                      printk("-> %d:%d", entry->apic, entry->pin);
++              printk("\n");
++      }
++
++      printk(KERN_INFO ".................................... done.\n");
++
++      return;
++}
++
++__apicdebuginit(void) print_APIC_field(int base) /* print 8 consecutive APIC registers on one line */
++{
++      int i;
++
++      printk(KERN_DEBUG);
++
++      for (i = 0; i < 8; i++)
++              printk(KERN_CONT "%08x", apic_read(base + i*0x10)); /* registers are read 0x10 apart */
++
++      printk(KERN_CONT "\n");
++}
++
++__apicdebuginit(void) print_local_APIC(void *dummy) /* dump this cpu's local APIC; dummy is unused */
++{
++      unsigned int i, v, ver, maxlvt;
++      u64 icr;
++
++      printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
++              smp_processor_id(), hard_smp_processor_id());
++      v = apic_read(APIC_ID);
++      printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, read_apic_id());
++      v = apic_read(APIC_LVR);
++      printk(KERN_INFO "... APIC VERSION: %08x\n", v);
++      ver = GET_APIC_VERSION(v); /* ver and maxlvt gate which registers are read below */
++      maxlvt = lapic_get_maxlvt();
++
++      v = apic_read(APIC_TASKPRI);
++      printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
++
++      if (APIC_INTEGRATED(ver)) {                     /* !82489DX */
++              if (!APIC_XAPIC(ver)) {
++                      v = apic_read(APIC_ARBPRI);
++                      printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
++                             v & APIC_ARBPRI_MASK);
++              }
++              v = apic_read(APIC_PROCPRI);
++              printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
++      }
++
++      /*
++       * Remote read supported only in the 82489DX and local APIC for
++       * Pentium processors.
++       */
++      if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
++              v = apic_read(APIC_RRR);
++              printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
++      }
++
++      v = apic_read(APIC_LDR);
++      printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
++      if (!x2apic_enabled()) { /* DFR is only read when x2apic is not enabled */
++              v = apic_read(APIC_DFR);
++              printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
++      }
++      v = apic_read(APIC_SPIV);
++      printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
++
++      printk(KERN_DEBUG "... APIC ISR field:\n");
++      print_APIC_field(APIC_ISR);
++      printk(KERN_DEBUG "... APIC TMR field:\n");
++      print_APIC_field(APIC_TMR);
++      printk(KERN_DEBUG "... APIC IRR field:\n");
++      print_APIC_field(APIC_IRR);
++
++      if (APIC_INTEGRATED(ver)) {             /* !82489DX */
++              if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
++                      apic_write(APIC_ESR, 0);
++
++              v = apic_read(APIC_ESR);
++              printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
++      }
++
++      icr = apic_icr_read(); /* 64-bit value: low half printed as ICR, high half as ICR2 */
++      printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
++      printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
++
++      v = apic_read(APIC_LVTT);
++      printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
++
++      if (maxlvt > 3) {                       /* PC is LVT#4. */
++              v = apic_read(APIC_LVTPC);
++              printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
++      }
++      v = apic_read(APIC_LVT0);
++      printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
++      v = apic_read(APIC_LVT1);
++      printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
++
++      if (maxlvt > 2) {                       /* ERR is LVT#3. */
++              v = apic_read(APIC_LVTERR);
++              printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
++      }
++
++      v = apic_read(APIC_TMICT);
++      printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
++      v = apic_read(APIC_TMCCT);
++      printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
++      v = apic_read(APIC_TDCR);
++      printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
++
++      if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
++              v = apic_read(APIC_EFEAT);
++              maxlvt = (v >> 16) & 0xff; /* maxlvt is reused: now the EILVT count from EFEAT bits 16..23 */
++              printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
++              v = apic_read(APIC_ECTRL);
++              printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
++              for (i = 0; i < maxlvt; i++) {
++                      v = apic_read(APIC_EILVTn(i));
++                      printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
++              }
++      }
++      printk("\n");
++}
++
++__apicdebuginit(void) print_local_APICs(int maxcpu) /* dump local APICs of online cpus numbered below maxcpu */
++{
++      int cpu;
++
++      if (!maxcpu) /* 0 disables the dump entirely */
++              return;
++
++      preempt_disable();
++      for_each_online_cpu(cpu) {
++              if (cpu >= maxcpu)
++                      break;
++              smp_call_function_single(cpu, print_local_APIC, NULL, 1); /* runs print_local_APIC on that cpu */
++      }
++      preempt_enable();
++}
++
++__apicdebuginit(void) print_PIC(void) /* dump the legacy PIC registers, if there are legacy irqs */
++{
++      unsigned int v;
++      unsigned long flags;
++
++      if (!legacy_pic->nr_legacy_irqs)
++              return;
++
++      printk(KERN_DEBUG "\nprinting PIC contents\n");
++
++      raw_spin_lock_irqsave(&i8259A_lock, flags);
++
++      v = inb(0xa1) << 8 | inb(0x21); /* 16-bit value: port 0xa1 in the high byte, 0x21 in the low byte */
++      printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
++
++      v = inb(0xa0) << 8 | inb(0x20);
++      printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
++
++      outb(0x0b,0xa0); /* after writing 0x0b the same ports yield the value reported as ISR below */
++      outb(0x0b,0x20);
++      v = inb(0xa0) << 8 | inb(0x20);
++      outb(0x0a,0xa0); /* presumably switches the ports back to IRR reads - confirm against 8259A docs */
++      outb(0x0a,0x20);
++
++      raw_spin_unlock_irqrestore(&i8259A_lock, flags);
++
++      printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
++
++      v = inb(0x4d1) << 8 | inb(0x4d0); /* read after i8259A_lock has been dropped */
++      printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
++}
++
++static int __initdata show_lapic = 1; /* cpu limit handed to print_local_APICs(); default 1 */
++static __init int setup_show_lapic(char *arg) /* parses the "show_lapic=" boot option */
++{
++      int num = -1;
++
++      if (strcmp(arg, "all") == 0) {
++              show_lapic = CONFIG_NR_CPUS; /* "all": raise the limit to CONFIG_NR_CPUS */
++      } else {
++              get_option(&arg, &num);
++              if (num >= 0) /* num stays -1 unless get_option() stored a value; negatives are ignored */
++                      show_lapic = num;
++      }
++
++      return 1;
++}
++__setup("show_lapic=", setup_show_lapic);
++
++/*
++ * Initcall that dumps interrupt controller state unless APIC output is
++ * set to quiet.  The PIC is always printed; local APIC and IO-APIC
++ * state only when an APIC is present.  Always returns 0.
++ */
++__apicdebuginit(int) print_ICs(void)
++{
++      if (apic_verbosity != APIC_QUIET) {
++              print_PIC();
++
++              /* don't print out if apic is not there */
++              if (cpu_has_apic || apic_from_smp_config()) {
++                      print_local_APICs(show_lapic);
++                      print_IO_APIC();
++              }
++      }
++
++      return 0;
++}
++
++fs_initcall(print_ICs);
++
++
++/* Where, if anywhere, the i8259 is connected in external int mode */
++static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
++
++/*
++ * Locate the IO-APIC pin the i8259 is wired to (from hardware state,
++ * falling back to the MP table), record it in ioapic_i8259, and then
++ * clear the IO-APIC.  Does nothing when there are no legacy IRQs.
++ */
++void __init enable_IO_APIC(void)
++{
++      int mpt_apic, mpt_pin;
++      int idx, line;
++
++      if (!legacy_pic->nr_legacy_irqs)
++              return;
++
++      /* See if any pin of any IO-APIC is enabled and in ExtINT mode */
++      for (idx = 0; idx < nr_ioapics; idx++) {
++              for (line = 0; line < nr_ioapic_registers[idx]; line++) {
++                      struct IO_APIC_route_entry rte = ioapic_read_entry(idx, line);
++
++                      if (rte.mask != 0 || rte.delivery_mode != dest_ExtINT)
++                              continue;
++
++                      /* This is the pin where the i8259 is connected. */
++                      ioapic_i8259.apic = idx;
++                      ioapic_i8259.pin  = line;
++                      goto found_i8259;
++              }
++      }
++ found_i8259:
++      /*
++       * Check whether the MP table reports an ExtINT pin.  If the scan
++       * above found nothing the i8259 is probably not connected to an
++       * IO-APIC, but give the mptable a chance anyway.
++       */
++      mpt_pin  = find_isa_irq_pin(0, mp_ExtINT);
++      mpt_apic = find_isa_irq_apic(0, mp_ExtINT);
++
++      /* Trust the MP table if nothing is set up in the hardware */
++      if (ioapic_i8259.pin == -1 && mpt_pin >= 0) {
++              printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
++              ioapic_i8259.pin  = mpt_pin;
++              ioapic_i8259.apic = mpt_apic;
++      }
++
++      /* Complain if the MP table and the hardware disagree */
++      if (mpt_pin >= 0 && ioapic_i8259.pin >= 0 &&
++          (ioapic_i8259.apic != mpt_apic || ioapic_i8259.pin != mpt_pin))
++              printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
++
++      /*
++       * Do not trust the IO-APIC being empty at bootup
++       */
++      clear_IO_APIC();
++}
++
++/*
++ * Not an __init, needed by the reboot code.
++ *
++ * Clears the IO-APIC and, when the i8259 is routed through an IO-APIC
++ * pin and interrupt remapping is off, reprograms that pin as an
++ * unmasked ExtINT entry so legacy interrupts can still be delivered.
++ */
++void disable_IO_APIC(void)
++{
++      int use_ioapic_wire;
++
++      /*
++       * Clear the IO-APIC before rebooting:
++       */
++      clear_IO_APIC();
++
++      if (!legacy_pic->nr_legacy_irqs)
++              return;
++
++      /*
++       * With interrupt-remapping, for now we will use virtual wire A mode,
++       * as virtual wire B is a little complex (need to configure both
++       * IOAPIC RTE as well as interrupt-remapping table entry).
++       * As this gets called during crash dump, keep this simple for now.
++       */
++      use_ioapic_wire = !intr_remapping_enabled && ioapic_i8259.pin != -1;
++
++      if (use_ioapic_wire) {
++              struct IO_APIC_route_entry rte;
++
++              /*
++               * Everything not set below stays zero: enabled (mask 0),
++               * edge trigger, high polarity, physical destination mode,
++               * vector 0, irr and delivery_status clear.
++               */
++              memset(&rte, 0, sizeof(rte));
++              rte.delivery_mode = dest_ExtINT;
++              rte.dest          = read_apic_id();
++
++              /*
++               * Add it to the IO-APIC irq-routing table:
++               */
++              ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, rte);
++      }
++
++      /*
++       * Use virtual wire A mode when interrupt remapping is enabled.
++       */
++      if (cpu_has_apic || apic_from_smp_config())
++              disconnect_bsp_APIC(use_ioapic_wire);
++}
++
++#ifdef CONFIG_X86_32
++/*
++ * function to set the IO-APIC physical IDs based on the
++ * values stored in the MPC table.
++ *
++ * For every IO-APIC this fixes up MPC IDs that are out of range or
++ * already in use, patches mp_irqs[] entries that referred to the old
++ * ID, and rewrites the hardware ID register when it differs from the
++ * final value.
++ *
++ * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
++ */
++void __init setup_ioapic_ids_from_mpc_nocheck(void)
++{
++      union IO_APIC_reg_00 reg_00;
++      physid_mask_t phys_id_present_map;
++      int apic_id;
++      int i;
++      unsigned char old_id;
++      unsigned long flags;
++
++      /*
++       * This is broken; anything with a real cpu count has to
++       * circumvent this idiocy regardless.
++       */
++      apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
++
++      /*
++       * Set the IOAPIC ID to the value stored in the MPC table.
++       */
++      for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
++
++              /* Read the register 0 value */
++              raw_spin_lock_irqsave(&ioapic_lock, flags);
++              reg_00.raw = io_apic_read(apic_id, 0);
++              raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++              /* Remember the MPC ID so mp_irqs[] can be patched below. */
++              old_id = mp_ioapics[apic_id].apicid;
++
++              /* Out-of-range MPC ID: fall back to the hardware's ID. */
++              if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
++                      printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
++                              apic_id, mp_ioapics[apic_id].apicid);
++                      printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
++                              reg_00.bits.ID);
++                      mp_ioapics[apic_id].apicid = reg_00.bits.ID;
++              }
++
++              /*
++               * Sanity check, is the ID really free? Every APIC in a
++               * system must have a unique ID or we get lots of nice
++               * 'stuck on smp_invalidate_needed IPI wait' messages.
++               */
++              if (apic->check_apicid_used(&phys_id_present_map,
++                                      mp_ioapics[apic_id].apicid)) {
++                      printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
++                              apic_id, mp_ioapics[apic_id].apicid);
++                      /* Pick the lowest ID not yet in the present map. */
++                      for (i = 0; i < get_physical_broadcast(); i++)
++                              if (!physid_isset(i, phys_id_present_map))
++                                      break;
++                      if (i >= get_physical_broadcast())
++                              panic("Max APIC ID exceeded!\n");
++                      printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
++                              i);
++                      physid_set(i, phys_id_present_map);
++                      mp_ioapics[apic_id].apicid = i;
++              } else {
++                      /* The ID is free: mark it as taken. */
++                      physid_mask_t tmp;
++                      apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
++                      apic_printk(APIC_VERBOSE, "Setting %d in the "
++                                      "phys_id_present_map\n",
++                                      mp_ioapics[apic_id].apicid);
++                      physids_or(phys_id_present_map, phys_id_present_map, tmp);
++              }
++
++              /*
++               * We need to adjust the IRQ routing table
++               * if the ID changed.
++               */
++              if (old_id != mp_ioapics[apic_id].apicid)
++                      for (i = 0; i < mp_irq_entries; i++)
++                              if (mp_irqs[i].dstapic == old_id)
++                                      mp_irqs[i].dstapic
++                                              = mp_ioapics[apic_id].apicid;
++
++              /*
++               * Update the ID register according to the right value
++               * from the MPC table if they are different.
++               */
++              if (mp_ioapics[apic_id].apicid == reg_00.bits.ID)
++                      continue;
++
++              apic_printk(APIC_VERBOSE, KERN_INFO
++                      "...changing IO-APIC physical APIC ID to %d ...",
++                      mp_ioapics[apic_id].apicid);
++
++              reg_00.bits.ID = mp_ioapics[apic_id].apicid;
++              raw_spin_lock_irqsave(&ioapic_lock, flags);
++              io_apic_write(apic_id, 0, reg_00.raw);
++              raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++              /*
++               * Sanity check: read the register back and verify that
++               * the new ID was accepted.
++               */
++              raw_spin_lock_irqsave(&ioapic_lock, flags);
++              reg_00.raw = io_apic_read(apic_id, 0);
++              raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++              if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
++                      printk("could not set ID!\n");
++              else
++                      apic_printk(APIC_VERBOSE, " ok.\n");
++      }
++}
++
++/*
++ * Set IO-APIC IDs from the MPC table, but only when the IO-APICs were
++ * not enumerated via ACPI, the boot CPU is from Intel, and the boot
++ * CPU's APIC is not an xAPIC.
++ */
++void __init setup_ioapic_ids_from_mpc(void)
++{
++      if (acpi_ioapic)
++              return;
++
++      /*
++       * Don't check I/O APIC IDs for xAPIC systems.  They have
++       * no meaning without the serial APIC bus.
++       */
++      if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
++              return;
++      if (APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
++              return;
++
++      setup_ioapic_ids_from_mpc_nocheck();
++}
++#endif
++
++/* Non-zero makes timer_irq_works() report success without testing. */
++int no_timer_check __initdata;
++
++/* "no_timer_check" boot parameter handler: sets no_timer_check. */
++static int __init notimercheck(char *s)
++{
++      no_timer_check = 1;
++      return 1;
++}
++__setup("no_timer_check", notimercheck);
++
++/*
++ * There is a nasty bug in some older SMP boards, their mptable lies
++ * about the timer IRQ. We do the following to work around the situation:
++ *
++ *    - timer IRQ defaults to IO-APIC IRQ
++ *    - if this function detects that timer IRQs are defunct, then we fall
++ *      back to ISA timer IRQs
++ *
++ * Returns 1 when more than four ticks were seen during a ten-tick delay
++ * (or when no_timer_check is set), 0 otherwise.
++ */
++static int __init timer_irq_works(void)
++{
++      unsigned long start = jiffies;
++      unsigned long irqflags;
++
++      if (no_timer_check)
++              return 1;
++
++      /* Let ten ticks pass with interrupts enabled... */
++      local_save_flags(irqflags);
++      local_irq_enable();
++      mdelay((10 * 1000) / HZ);
++      local_irq_restore(irqflags);
++
++      /*
++       * Expect a few ticks at least, to be sure some possible
++       * glue logic does not lock up after one or two first
++       * ticks in a non-ExtINT mode.  Also the local APIC
++       * might have cached one ExtINT interrupt.  Finally, at
++       * least one tick may be lost due to delays.
++       */
++
++      /* time_after() copes with a jiffies wrap. */
++      return time_after(jiffies, start + 4) ? 1 : 0;
++}
++
++/*
++ * In the SMP+IOAPIC case it might happen that there are an unspecified
++ * number of pending IRQ events unhandled. These cases are very rare,
++ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
++ * better to do it this way as thus we do not have to be aware of
++ * 'pending' interrupts in the IRQ path, except at this point.
++ */
++/*
++ * Edge triggered needs to resend any interrupt
++ * that was delayed but this is now handled in the device
++ * independent code.
++ */
++
++/*
++ * Starting up an edge-triggered IO-APIC interrupt is
++ * nasty - we need to make sure that we get the edge.
++ * If it is already asserted for some reason, we need
++ * to return 1 to indicate that it was pending.
++ *
++ * This is not complete - we should be able to fake
++ * an edge even if it isn't on the 8259A...
++ */
++
++static unsigned int startup_ioapic_irq(struct irq_data *data)
++{
++      unsigned long flags;
++      int irq = data->irq;
++      int pending = 0;
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      if (irq < legacy_pic->nr_legacy_irqs) {
++              /* Mask the line on the legacy PIC and note a pending edge. */
++              legacy_pic->mask(irq);
++              pending = legacy_pic->irq_pending(irq) ? 1 : 0;
++      }
++      __unmask_ioapic(data->chip_data);
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++      return pending;
++}
++
++/*
++ * Retrigger an interrupt by sending its vector as an IPI to the first
++ * CPU of the interrupt's domain.  Always returns 1.
++ */
++static int ioapic_retrigger_irq(struct irq_data *data)
++{
++      struct irq_cfg *cfg = data->chip_data;
++      unsigned long flags;
++      unsigned int target;
++
++      raw_spin_lock_irqsave(&vector_lock, flags);
++      target = cpumask_first(cfg->domain);
++      apic->send_IPI_mask(cpumask_of(target), cfg->vector);
++      raw_spin_unlock_irqrestore(&vector_lock, flags);
++
++      return 1;
++}
++
++/*
++ * Level and edge triggered IO-APIC interrupts need different handling,
++ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
++ * handled with the level-triggered descriptor, but that one has slightly
++ * more overhead. Level-triggered interrupts cannot be handled with the
++ * edge-triggered handler, without risking IRQ storms and other ugly
++ * races.
++ */
++
++#ifdef CONFIG_SMP
++/*
++ * Send IRQ_MOVE_CLEANUP_VECTOR to every online CPU in cfg->old_domain
++ * and clear cfg->move_in_progress.  If a temporary cpumask cannot be
++ * allocated, the IPIs are sent one CPU at a time instead.
++ */
++void send_cleanup_vector(struct irq_cfg *cfg)
++{
++      cpumask_var_t cleanup_mask;
++
++      if (likely(alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
++              cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
++              apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
++              free_cpumask_var(cleanup_mask);
++      } else {
++              unsigned int cpu;
++
++              for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask)
++                      apic->send_IPI_mask(cpumask_of(cpu), IRQ_MOVE_CLEANUP_VECTOR);
++      }
++      cfg->move_in_progress = 0;
++}
++
++/*
++ * Reprogram every IO-APIC pin attached to @cfg: write @dest to the
++ * pin's high register (unless the irq is remapped) and put cfg->vector
++ * into the vector field of the low register.
++ */
++static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
++{
++      struct irq_pin_list *node;
++      u8 vec = cfg->vector;
++
++      for_each_irq_pin(node, cfg->irq_2_pin) {
++              int ioapic = node->apic;
++              int lo_reg = 0x10 + node->pin*2;
++              unsigned int rte_lo;
++
++              /*
++               * With interrupt-remapping, destination information comes
++               * from interrupt-remapping table entry.
++               */
++              if (!irq_remapped(cfg))
++                      io_apic_write(ioapic, lo_reg + 1, dest);
++
++              rte_lo = io_apic_read(ioapic, lo_reg);
++              rte_lo &= ~IO_APIC_REDIR_VECTOR_MASK;
++              rte_lo |= vec;
++              io_apic_modify(ioapic, lo_reg, rte_lo);
++      }
++}
++
++/*
++ * On success, copies @mask into data->affinity, stores the result of
++ * ->cpu_mask_to_apicid_and() for it in *dest_id and returns 0.
++ * On failure returns -1 and leaves data->affinity untouched.
++ */
++int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
++                        unsigned int *dest_id)
++{
++      struct irq_cfg *cfg = data->chip_data;
++
++      /* Need at least one online CPU and a vector assignable on @mask. */
++      if (!cpumask_intersects(mask, cpu_online_mask) ||
++          assign_irq_vector(data->irq, cfg, mask))
++              return -1;
++
++      cpumask_copy(data->affinity, mask);
++      *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
++
++      return 0;
++}
++
++/*
++ * irq_set_affinity callback of ioapic_chip: pick a new vector and
++ * destination under ioapic_lock and retarget the IO-APIC pins.
++ * Returns the result of __ioapic_set_affinity().
++ */
++static int
++ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
++                  bool force)
++{
++      unsigned long flags;
++      unsigned int apicid;
++      int err;
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      err = __ioapic_set_affinity(data, mask, &apicid);
++      if (err == 0) {
++              /* Only the high 8 bits are valid. */
++              __target_IO_APIC_irq(data->irq, SET_APIC_LOGICAL_ID(apicid),
++                                   data->chip_data);
++      }
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++      return err;
++}
++
++#ifdef CONFIG_INTR_REMAP
++
++/*
++ * Migrate the IO-APIC irq in the presence of intr-remapping.
++ *
++ * For both level and edge triggered, irq migration is a simple atomic
++ * update(of vector and cpu destination) of IRTE and flush the hardware cache.
++ *
++ * For level triggered, we eliminate the io-apic RTE modification (with the
++ * updated vector information), by using a virtual vector (io-apic pin number).
++ * Real vector that is used for interrupting cpu will be coming from
++ * the interrupt-remapping table entry.
++ *
++ * Returns 0 on success, -EINVAL when @mask contains no online CPU, and
++ * -EBUSY when the IRTE cannot be read or no vector can be assigned.
++ * @force is not used.
++ */
++static int
++ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
++                     bool force)
++{
++      struct irq_cfg *cfg = data->chip_data;
++      unsigned int dest, irq = data->irq;
++      struct irte irte;
++
++      if (!cpumask_intersects(mask, cpu_online_mask))
++              return -EINVAL;
++
++      if (get_irte(irq, &irte))
++              return -EBUSY;
++
++      if (assign_irq_vector(irq, cfg, mask))
++              return -EBUSY;
++
++      dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
++
++      irte.vector = cfg->vector;
++      irte.dest_id = IRTE_DEST(dest);
++
++      /*
++       * Modify the IRTE and flush the interrupt entry cache.
++       */
++      modify_irte(irq, &irte);
++
++      /* The old vector can be released once the IRTE is updated. */
++      if (cfg->move_in_progress)
++              send_cleanup_vector(cfg);
++
++      cpumask_copy(data->affinity, mask);
++      return 0;
++}
++
++#else
++/* Stub used when CONFIG_INTR_REMAP is not set: does nothing, returns 0. */
++static inline int
++ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
++                     bool force)
++{
++      return 0;
++}
++#endif
++
++/*
++ * Handler for IRQ_MOVE_CLEANUP_VECTOR.
++ *
++ * Walks this CPU's vector_irq[] table and releases every vector that
++ * still maps to an irq which no longer uses that vector on this CPU.
++ * Vectors whose irq is still being migrated, or that are pending in
++ * the local APIC IRR, are left alone; in the latter case the cleanup
++ * IPI is re-sent to this CPU so the vector is retried later.
++ */
++asmlinkage void smp_irq_move_cleanup_interrupt(void)
++{
++      unsigned vector, me;
++
++      ack_APIC_irq();
++      exit_idle();
++      irq_enter();
++
++      me = smp_processor_id();
++      for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
++              unsigned int irq;
++              unsigned int irr;
++              struct irq_desc *desc;
++              struct irq_cfg *cfg;
++              irq = __this_cpu_read(vector_irq[vector]);
++
++              if (irq == -1)
++                      continue;
++
++              desc = irq_to_desc(irq);
++              if (!desc)
++                      continue;
++
++              /*
++               * A descriptor without chip data has no vector state to
++               * inspect; skip it rather than dereference NULL below.
++               */
++              cfg = irq_cfg(irq);
++              if (!cfg)
++                      continue;
++
++              raw_spin_lock(&desc->lock);
++
++              /*
++               * Check if the irq migration is in progress. If so, we
++               * haven't received the cleanup request yet for this irq.
++               */
++              if (cfg->move_in_progress)
++                      goto unlock;
++
++              /* The vector is still the irq's live vector on this CPU. */
++              if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
++                      goto unlock;
++
++              irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
++              /*
++               * Check if the vector that needs to be cleaned up is
++               * registered at the cpu's IRR. If so, then this is not
++               * the best time to clean it up. Let's clean it up in the
++               * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
++               * to myself.
++               *
++               * The mask is built from an unsigned constant so that bit
++               * 31 does not shift into the sign bit of a signed int.
++               */
++              if (irr & (1U << (vector % 32))) {
++                      apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
++                      goto unlock;
++              }
++              __this_cpu_write(vector_irq[vector], -1);
++unlock:
++              raw_spin_unlock(&desc->lock);
++      }
++
++      irq_exit();
++}
++
++/*
++ * If a vector move is in progress for @cfg and the interrupt arrived
++ * on the new vector on a CPU of the new domain, ask the old CPUs to
++ * release the old vector.
++ */
++static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
++{
++      unsigned me;
++
++      if (likely(!cfg->move_in_progress))
++              return;
++
++      me = smp_processor_id();
++
++      if (vector != cfg->vector)
++              return;
++      if (!cpumask_test_cpu(me, cfg->domain))
++              return;
++
++      send_cleanup_vector(cfg);
++}
++
++/*
++ * Complete a pending move, using the complement of the saved orig_ax
++ * of the current interrupt frame as the vector that fired.
++ */
++static void irq_complete_move(struct irq_cfg *cfg)
++{
++      unsigned vector = ~get_irq_regs()->orig_ax;
++
++      __irq_complete_move(cfg, vector);
++}
++
++/*
++ * Complete a pending move for @irq as if it had fired on its current
++ * vector.  Does nothing when the irq has no chip data.
++ */
++void irq_force_complete_move(int irq)
++{
++      struct irq_cfg *cfg = irq_get_chip_data(irq);
++
++      if (cfg)
++              __irq_complete_move(cfg, cfg->vector);
++}
++#else
++/* Without CONFIG_SMP there is no vector migration to complete. */
++static inline void irq_complete_move(struct irq_cfg *cfg) { }
++#endif
++
++/*
++ * irq_ack callback of ioapic_chip: complete any pending vector move,
++ * call irq_move_irq() and then acknowledge the local APIC.
++ */
++static void ack_apic_edge(struct irq_data *data)
++{
++      irq_complete_move(data->chip_data);
++      irq_move_irq(data);
++      ack_APIC_irq();
++}
++
++/* Incremented by ack_apic_level() when the TMR bit of a vector is clear. */
++atomic_t irq_mis_count;
++
++/*
++ * IO-APIC versions below 0x20 don't support EOI register.
++ * For the record, here is the information about various versions:
++ *     0Xh     82489DX
++ *     1Xh     I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
++ *     2Xh     I/O(x)APIC which is PCI 2.2 Compliant
++ *     30h-FFh Reserved
++ *
++ * Some of the Intel ICH Specs (ICH2 to ICH5) document the io-apic
++ * version as 0x2. This is an error in the documentation and these ICH
++ * chips use io-apic's of version 0x20.
++ *
++ * For IO-APIC's with EOI register, we use that to do an explicit EOI.
++ * Otherwise, we simulate the EOI message manually by changing the trigger
++ * mode to edge and then back to level, with RTE being masked during this.
++ */
++static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
++{
++      struct irq_pin_list *pin_entry;
++      unsigned long flags;
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      for_each_irq_pin(pin_entry, cfg->irq_2_pin) {
++              if (mp_ioapics[pin_entry->apic].apicver < 0x20) {
++                      /* No EOI register: toggle the trigger mode instead. */
++                      __mask_and_edge_IO_APIC_irq(pin_entry);
++                      __unmask_and_level_IO_APIC_irq(pin_entry);
++              } else if (irq_remapped(cfg)) {
++                      /*
++                       * Intr-remapping uses pin number as the virtual vector
++                       * in the RTE. Actual vector is programmed in
++                       * intr-remapping table entry. Hence for the io-apic
++                       * EOI we use the pin number.
++                       */
++                      io_apic_eoi(pin_entry->apic, pin_entry->pin);
++              } else {
++                      io_apic_eoi(pin_entry->apic, cfg->vector);
++              }
++      }
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++}
++
++/*
++ * irq_eoi callback of ioapic_chip for level-triggered interrupts.
++ * Acknowledges the local APIC, works around level interrupts that were
++ * delivered as edge, and performs a pending affinity change while the
++ * pin is masked.
++ */
++static void ack_apic_level(struct irq_data *data)
++{
++      struct irq_cfg *cfg = data->chip_data;
++      int i, do_unmask_irq = 0, irq = data->irq;
++      unsigned long v;
++
++      irq_complete_move(cfg);
++#ifdef CONFIG_GENERIC_PENDING_IRQ
++      /* If we are moving the irq we need to mask it */
++      if (unlikely(irqd_is_setaffinity_pending(data))) {
++              do_unmask_irq = 1;
++              mask_ioapic(cfg);
++      }
++#endif
++
++      /*
++       * It appears there is an erratum which affects at least version 0x11
++       * of I/O APIC (that's the 82093AA and cores integrated into various
++       * chipsets).  Under certain conditions a level-triggered interrupt is
++       * erroneously delivered as edge-triggered one but the respective IRR
++       * bit gets set nevertheless.  As a result the I/O unit expects an EOI
++       * message but it will never arrive and further interrupts are blocked
++       * from the source.  The exact reason is so far unknown, but the
++       * phenomenon was observed when two consecutive interrupt requests
++       * from a given source get delivered to the same CPU and the source is
++       * temporarily disabled in between.
++       *
++       * A workaround is to simulate an EOI message manually.  We achieve it
++       * by setting the trigger mode to edge and then to level when the edge
++       * trigger mode gets detected in the TMR of a local APIC for a
++       * level-triggered interrupt.  We mask the source for the time of the
++       * operation to prevent an edge-triggered interrupt escaping meanwhile.
++       * The idea is from Manfred Spraul.  --macro
++       *
++       * Also in the case when cpu goes offline, fixup_irqs() will forward
++       * any unhandled interrupt on the offlined cpu to the new cpu
++       * destination that is handling the corresponding interrupt. This
++       * interrupt forwarding is done via IPI's. Hence, in this case also
++       * level-triggered io-apic interrupt will be seen as an edge
++       * interrupt in the IRR. And we can't rely on the cpu's EOI
++       * to be broadcasted to the IO-APIC's which will clear the remoteIRR
++       * corresponding to the level-triggered interrupt. Hence on IO-APIC's
++       * supporting EOI register, we do an explicit EOI to clear the
++       * remote IRR and on IO-APIC's which don't have an EOI register,
++       * we use the above logic (mask+edge followed by unmask+level) from
++       * Manfred Spraul to clear the remote IRR.
++       */
++      i = cfg->vector;
++      v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
++
++      /*
++       * We must acknowledge the irq before we move it or the acknowledge will
++       * not propagate properly.
++       */
++      ack_APIC_irq();
++
++      /*
++       * Tail end of clearing remote IRR bit (either by delivering the EOI
++       * message via io-apic EOI register write or simulating it using
++       * mask+edge followed by unmask+level logic) manually when the
++       * level triggered interrupt is seen as the edge triggered interrupt
++       * at the cpu.
++       */
++      if (!(v & (1 << (i & 0x1f)))) {
++              atomic_inc(&irq_mis_count);
++
++              eoi_ioapic_irq(irq, cfg);
++      }
++
++      /* Now we can move and re-enable the irq */
++      if (unlikely(do_unmask_irq)) {
++              /* Only migrate the irq if the ack has been received.
++               *
++               * On rare occasions the broadcast level triggered ack gets
++               * delayed going to ioapics, and if we reprogram the
++               * vector while Remote IRR is still set the irq will never
++               * fire again.
++               *
++               * To prevent this scenario we read the Remote IRR bit
++               * of the ioapic.  This has two effects.
++               * - On any sane system the read of the ioapic will
++               *   flush writes (and acks) going to the ioapic from
++               *   this cpu.
++               * - We get to see if the ACK has actually been delivered.
++               *
++               * Based on failed experiments of reprogramming the
++               * ioapic entry from outside of irq context starting
++               * with masking the ioapic entry and then polling until
++               * Remote IRR was clear before reprogramming the
++               * ioapic I don't trust the Remote IRR bit to be
++               * completely accurate.
++               *
++               * However there appears to be no other way to plug
++               * this race, so if the Remote IRR bit is not
++               * accurate and is causing problems then it is a hardware bug
++               * and you can go talk to the chipset vendor about it.
++               */
++              if (!io_apic_level_ack_pending(cfg))
++                      irq_move_masked_irq(data);
++              unmask_ioapic(cfg);
++      }
++}
++
++#ifdef CONFIG_INTR_REMAP
++/* irq_ack callback of ir_ioapic_chip: only acknowledges the local APIC. */
++static void ir_ack_apic_edge(struct irq_data *data)
++{
++      ack_APIC_irq();
++}
++
++/*
++ * irq_eoi callback of ir_ioapic_chip: acknowledges the local APIC and
++ * then always issues an IO-APIC EOI for the irq's pins.
++ */
++static void ir_ack_apic_level(struct irq_data *data)
++{
++      ack_APIC_irq();
++      eoi_ioapic_irq(data->irq, data->chip_data);
++}
++#endif /* CONFIG_INTR_REMAP */
++
++/*
++ * irq_chip for IO-APIC interrupts without interrupt remapping.
++ * The affinity callback is only provided on SMP builds.
++ */
++static struct irq_chip ioapic_chip __read_mostly = {
++      .name                   = "IO-APIC",
++      .irq_startup            = startup_ioapic_irq,
++      .irq_mask               = mask_ioapic_irq,
++      .irq_unmask             = unmask_ioapic_irq,
++      .irq_ack                = ack_apic_edge,
++      .irq_eoi                = ack_apic_level,
++#ifdef CONFIG_SMP
++      .irq_set_affinity       = ioapic_set_affinity,
++#endif
++      .irq_retrigger          = ioapic_retrigger_irq,
++};
++
++/*
++ * irq_chip for IO-APIC interrupts behind interrupt remapping.  The
++ * ack/eoi (and, on SMP, affinity) callbacks are only set when
++ * CONFIG_INTR_REMAP is enabled.
++ */
++static struct irq_chip ir_ioapic_chip __read_mostly = {
++      .name                   = "IR-IO-APIC",
++      .irq_startup            = startup_ioapic_irq,
++      .irq_mask               = mask_ioapic_irq,
++      .irq_unmask             = unmask_ioapic_irq,
++#ifdef CONFIG_INTR_REMAP
++      .irq_ack                = ir_ack_apic_edge,
++      .irq_eoi                = ir_ack_apic_level,
++#ifdef CONFIG_SMP
++      .irq_set_affinity       = ir_ioapic_set_affinity,
++#endif
++#endif
++      .irq_retrigger          = ioapic_retrigger_irq,
++};
++#endif /* !CONFIG_XEN */
++
++/*
++ * For every active irq that is an IO-APIC irq with chip data but no
++ * vector assigned, fall back to the legacy PIC when the irq is in the
++ * legacy range, and to no_irq_chip otherwise.
++ */
++static inline void init_IO_APIC_traps(void)
++{
++      struct irq_cfg *cfg;
++      unsigned int irq;
++
++      /*
++       * NOTE! The local APIC isn't very good at handling
++       * multiple interrupts at the same interrupt level.
++       * As the interrupt level is determined by taking the
++       * vector number and shifting that right by 4, we
++       * want to spread these out a bit so that they don't
++       * all fall in the same interrupt level.
++       *
++       * Also, we've got to be careful not to trash gate
++       * 0x80, because int 0x80 is hm, kind of importantish. ;)
++       */
++      for_each_active_irq(irq) {
++#ifdef CONFIG_XEN
++              if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
++                      continue;
++#endif
++              cfg = irq_get_chip_data(irq);
++              if (!IO_APIC_IRQ(irq) || !cfg || cfg->vector)
++                      continue;
++
++              /*
++               * Hmm.. We don't have an entry for this,
++               * so default to an old-fashioned 8259
++               * interrupt if we can..
++               */
++              if (irq < legacy_pic->nr_legacy_irqs) {
++                      legacy_pic->make_irq(irq);
++              } else {
++                      /* Strange. Oh, well.. */
++                      irq_set_chip(irq, &no_irq_chip);
++              }
++      }
++}
++
++#ifndef CONFIG_XEN
++/*
++ * The local APIC irq-chip implementation:
++ */
++
++/* irq_mask callback of lapic_chip: set the mask bit in LVT0. */
++static void mask_lapic_irq(struct irq_data *data)
++{
++      apic_write(APIC_LVT0, apic_read(APIC_LVT0) | APIC_LVT_MASKED);
++}
++
++/* irq_unmask callback of lapic_chip: clear the mask bit in LVT0. */
++static void unmask_lapic_irq(struct irq_data *data)
++{
++      apic_write(APIC_LVT0, apic_read(APIC_LVT0) & ~APIC_LVT_MASKED);
++}
++
++/* irq_ack callback of lapic_chip: acknowledge the local APIC. */
++static void ack_lapic_irq(struct irq_data *data)
++{
++      ack_APIC_irq();
++}
++
++/* irq_chip installed by lapic_register_intr(); masks via LVT0. */
++static struct irq_chip lapic_chip __read_mostly = {
++      .name           = "local-APIC",
++      .irq_mask       = mask_lapic_irq,
++      .irq_unmask     = unmask_lapic_irq,
++      .irq_ack        = ack_lapic_irq,
++};
++
++/*
++ * Register @irq as an edge interrupt handled through lapic_chip:
++ * clears IRQ_LEVEL and installs handle_edge_irq.
++ */
++static void lapic_register_intr(int irq)
++{
++      irq_clear_status_flags(irq, IRQ_LEVEL);
++      irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
++                                    "edge");
++}
++
++/*
++ * This looks a bit hackish but it's about the only one way of sending
++ * a few INTA cycles to 8259As and any associated glue logic.  ICR does
++ * not support the ExtINT mode, unfortunately.  We need to send these
++ * cycles as some i82489DX-based boards have glue logic that keeps the
++ * 8259A interrupt line asserted until INTA.  --macro
++ *
++ * The IO-APIC pin of ISA irq 8 is temporarily reprogrammed as an ExtINT
++ * entry while the RTC periodic interrupt is enabled; the original
++ * routing entry and RTC registers are restored afterwards.
++ */
++static inline void __init unlock_ExtINT_logic(void)
++{
++      int apic, pin, i;
++      struct IO_APIC_route_entry entry0, entry1;
++      unsigned char save_control, save_freq_select;
++
++      pin  = find_isa_irq_pin(8, mp_INT);
++      if (pin == -1) {
++              WARN_ON_ONCE(1);
++              return;
++      }
++      apic = find_isa_irq_apic(8, mp_INT);
++      if (apic == -1) {
++              WARN_ON_ONCE(1);
++              return;
++      }
++
++      /* Save the current routing entry so it can be restored at the end. */
++      entry0 = ioapic_read_entry(apic, pin);
++      clear_IO_APIC_pin(apic, pin);
++
++      memset(&entry1, 0, sizeof(entry1));
++
++      entry1.dest_mode = 0;                   /* physical delivery */
++      entry1.mask = 0;                        /* unmask IRQ now */
++      entry1.dest = hard_smp_processor_id();
++      entry1.delivery_mode = dest_ExtINT;
++      entry1.polarity = entry0.polarity;
++      entry1.trigger = 0;
++      entry1.vector = 0;
++
++      ioapic_write_entry(apic, pin, entry1);
++
++      /* Save the RTC state, then enable its periodic interrupt. */
++      save_control = CMOS_READ(RTC_CONTROL);
++      save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
++      CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
++                 RTC_FREQ_SELECT);
++      CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
++
++      /*
++       * Poll in 10ms steps; each time the periodic flag is seen set
++       * the remaining iteration budget drops by an extra 10.
++       */
++      i = 100;
++      while (i-- > 0) {
++              mdelay(10);
++              if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
++                      i -= 10;
++      }
++
++      /* Restore the RTC registers and the original routing entry. */
++      CMOS_WRITE(save_control, RTC_CONTROL);
++      CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
++      clear_IO_APIC_pin(apic, pin);
++
++      ioapic_write_entry(apic, pin, entry0);
++}
++
++/* Set to 1 by the "disable_timer_pin_1" boot parameter; read by check_timer(). */
++static int disable_timer_pin_1 __initdata;
++/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
++/* Handler for the "disable_timer_pin_1" early parameter; @arg is ignored. */
++static int __init disable_timer_pin_setup(char *arg)
++{
++      disable_timer_pin_1 = 1;
++      return 0;
++}
++early_param("disable_timer_pin_1", disable_timer_pin_setup);
++
++/* Set to 1 by check_timer() when IRQ0 is found to work via the 8259A route. */
++int timer_through_8259 __initdata;
++
++/*
++ * This code may look a bit paranoid, but it's supposed to cooperate with
++ * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
++ * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
++ * fanatically on his truly buggy board.
++ *
++ * FIXME: really need to revamp this for all platforms.
++ *
++ * The routes are tried in this order, stopping at the first one for which
++ * timer_irq_works() succeeds:
++ *   1. IRQ0 through the I/O APIC pin found for ISA IRQ 0 (apic1/pin1);
++ *   2. IRQ0 through the 8259A, using the ioapic_i8259 pin (apic2/pin2);
++ *   3. IRQ0 as a "Virtual Wire" interrupt, LVT0 in fixed mode;
++ *   4. IRQ0 as an ExtINT interrupt on LVT0.
++ * If none of them works the function panics.
++ */
++static inline void __init check_timer(void)
++{
++      struct irq_cfg *cfg = irq_get_chip_data(0);
++      int node = cpu_to_node(0);
++      int apic1, pin1, apic2, pin2;
++      unsigned long flags;
++      int no_pin1 = 0;
++
++      local_irq_save(flags);
++
++      /*
++       * get/set the timer IRQ vector:
++       */
++      legacy_pic->mask(0);
++      assign_irq_vector(0, cfg, apic->target_cpus());
++
++      /*
++       * As IRQ0 is to be enabled in the 8259A, the virtual
++       * wire has to be disabled in the local APIC.  Also
++       * timer interrupts need to be acknowledged manually in
++       * the 8259A for the i82489DX when using the NMI
++       * watchdog as that APIC treats NMIs as level-triggered.
++       * The AEOI mode will finish them in the 8259A
++       * automatically.
++       */
++      apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
++      legacy_pic->init(1);
++
++      pin1  = find_isa_irq_pin(0, mp_INT);
++      apic1 = find_isa_irq_apic(0, mp_INT);
++      pin2  = ioapic_i8259.pin;
++      apic2 = ioapic_i8259.apic;
++
++      apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
++                  "apic1=%d pin1=%d apic2=%d pin2=%d\n",
++                  cfg->vector, apic1, pin1, apic2, pin2);
++
++      /*
++       * Some BIOS writers are clueless and report the ExtINTA
++       * I/O APIC input from the cascaded 8259A as the timer
++       * interrupt input.  So just in case, if only one pin
++       * was found above, try it both directly and through the
++       * 8259A.
++       */
++      if (pin1 == -1) {
++              if (intr_remapping_enabled)
++                      panic("BIOS bug: timer not connected to IO-APIC");
++              pin1 = pin2;
++              apic1 = apic2;
++              no_pin1 = 1;
++      } else if (pin2 == -1) {
++              pin2 = pin1;
++              apic2 = apic1;
++      }
++
++      if (pin1 != -1) {
++              /*
++               * Ok, does IRQ0 through the IOAPIC work?
++               */
++              if (no_pin1) {
++                      add_pin_to_irq_node(cfg, node, apic1, pin1);
++                      setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
++              } else {
++                      /*
++                       * For an edge-triggered pin, setup_ioapic_irq() has
++                       * already left it unmasked, so we only need to unmask
++                       * here if it is level-triggered.  Do we really have a
++                       * level-triggered timer?
++                       */
++                      int idx;
++                      idx = find_irq_entry(apic1, pin1, mp_INT);
++                      if (idx != -1 && irq_trigger(idx))
++                              unmask_ioapic(cfg);
++              }
++              if (timer_irq_works()) {
++                      /*
++                       * NOTE(review): this clears pin1 on I/O APIC 0, not on
++                       * apic1 as the failure path below does -- confirm that
++                       * the hard-coded 0 is intended.
++                       */
++                      if (disable_timer_pin_1 > 0)
++                              clear_IO_APIC_pin(0, pin1);
++                      goto out;
++              }
++              if (intr_remapping_enabled)
++                      panic("timer doesn't work through Interrupt-remapped IO-APIC");
++              /* presumably timer_irq_works() returns with interrupts enabled */
++              local_irq_disable();
++              clear_IO_APIC_pin(apic1, pin1);
++              if (!no_pin1)
++                      apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
++                                  "8254 timer not connected to IO-APIC\n");
++
++              apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
++                          "(IRQ0) through the 8259A ...\n");
++              apic_printk(APIC_QUIET, KERN_INFO
++                          "..... (found apic %d pin %d) ...\n", apic2, pin2);
++              /*
++               * legacy devices should be connected to IO APIC #0
++               */
++              replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
++              setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
++              legacy_pic->unmask(0);
++              if (timer_irq_works()) {
++                      apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
++                      timer_through_8259 = 1;
++                      goto out;
++              }
++              /*
++               * Cleanup, just in case ...
++               */
++              local_irq_disable();
++              legacy_pic->mask(0);
++              clear_IO_APIC_pin(apic2, pin2);
++              apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
++      }
++
++      apic_printk(APIC_QUIET, KERN_INFO
++                  "...trying to set up timer as Virtual Wire IRQ...\n");
++
++      /* Third attempt: deliver IRQ0 through LVT0 in fixed mode. */
++      lapic_register_intr(0);
++      apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
++      legacy_pic->unmask(0);
++
++      if (timer_irq_works()) {
++              apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
++              goto out;
++      }
++      local_irq_disable();
++      legacy_pic->mask(0);
++      apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
++      apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
++
++      apic_printk(APIC_QUIET, KERN_INFO
++                  "...trying to set up timer as ExtINT IRQ...\n");
++
++      /* Last attempt: re-init the legacy PIC and use LVT0 in ExtINT mode. */
++      legacy_pic->init(0);
++      legacy_pic->make_irq(0);
++      apic_write(APIC_LVT0, APIC_DM_EXTINT);
++
++      unlock_ExtINT_logic();
++
++      if (timer_irq_works()) {
++              apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
++              goto out;
++      }
++      local_irq_disable();
++      apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
++      panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
++              "report.  Then try booting with the 'noapic' option.\n");
++out:
++      local_irq_restore(flags);
++}
++#else
++#define check_timer() ((void)0)
++#endif
++
++/*
++ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
++ * to devices.  However there may be an I/O APIC pin available for
++ * this interrupt regardless.  The pin may be left unconnected, but
++ * typically it will be reused as an ExtINT cascade interrupt for
++ * the master 8259A.  In the MPS case such a pin will normally be
++ * reported as an ExtINT interrupt in the MP table.  With ACPI
++ * there is no provision for ExtINT interrupts, and in the absence
++ * of an override it would be treated as an ordinary ISA I/O APIC
++ * interrupt, that is edge-triggered and unmasked by default.  We
++ * used to do this, but it caused problems on some systems because
++ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
++ * the same ExtINT cascade interrupt to drive the local APIC of the
++ * bootstrap processor.  Therefore we refrain from routing IRQ2 to
++ * the I/O APIC in all cases now.  No actual device should request
++ * it anyway.  --macro
++ */
++#define PIC_IRQS      (1UL << PIC_CASCADE_IR)
++
++/*
++ * Select which IRQs are routed through the I/O APIC, program the routing
++ * entries and, when legacy IRQs exist, call check_timer().
++ */
++void __init setup_IO_APIC(void)
++{
++      /*
++       * calling enable_IO_APIC() is moved to setup_local_APIC for BP
++       */
++      if (legacy_pic->nr_legacy_irqs)
++              io_apic_irqs = ~PIC_IRQS;       /* all but the cascade IRQ */
++      else
++              io_apic_irqs = ~0UL;
++
++      apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
++
++      /* Set up IO-APIC IRQ routing. */
++#ifndef CONFIG_XEN
++      x86_init.mpparse.setup_ioapic_ids();
++
++      sync_Arb_IDs();
++#endif
++      setup_IO_APIC_irqs();
++      init_IO_APIC_traps();
++
++      if (legacy_pic->nr_legacy_irqs)
++              check_timer();
++}
++
++/*
++ *      Called after all the initialization is done. If we didn't find any
++ *      APIC bugs then we can allow the modify fast path
++ */
++
++static int __init io_apic_bug_finalize(void)
++{
++      /* -1 means "still undecided": settle on "no bug". */
++      if (sis_apic_bug == -1)
++              sis_apic_bug = 0;
++
++#ifdef CONFIG_X86_XEN
++      if (is_initial_xendomain()) {
++              struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
++
++              /* Report the final verdict to the hypervisor. */
++              if (sis_apic_bug)
++                      op.u.platform_quirk.quirk_id = QUIRK_IOAPIC_BAD_REGSEL;
++              else
++                      op.u.platform_quirk.quirk_id = QUIRK_IOAPIC_GOOD_REGSEL;
++              VOID(HYPERVISOR_platform_op(&op));
++      }
++#endif
++      return 0;
++}
++
++late_initcall(io_apic_bug_finalize);
++
++#ifndef CONFIG_XEN
++/* Per-I/O-APIC save buffers for routing entries, allocated by ioapic_init_ops(). */
++static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS];
++
++/* Save every routing entry of one I/O APIC; does nothing without a buffer. */
++static void suspend_ioapic(int ioapic_id)
++{
++      struct IO_APIC_route_entry *slot = ioapic_saved_data[ioapic_id];
++      int pin;
++
++      if (slot == NULL)
++              return;
++
++      for (pin = 0; pin < nr_ioapic_registers[ioapic_id]; pin++, slot++)
++              *slot = ioapic_read_entry(ioapic_id, pin);
++}
++
++/* syscore .suspend hook: save all I/O APICs, lowest index first; returns 0. */
++static int ioapic_suspend(void)
++{
++      int idx = 0;
++
++      while (idx < nr_ioapics)
++              suspend_ioapic(idx++);
++
++      return 0;
++}
++
++/*
++ * Restore one I/O APIC from its save buffer: first rewrite the ID in
++ * register 0 if it no longer matches mp_ioapics[], then write back every
++ * saved routing entry.  Does nothing when no buffer was allocated.
++ */
++static void resume_ioapic(int ioapic_id)
++{
++      struct IO_APIC_route_entry *saved = ioapic_saved_data[ioapic_id];
++      union IO_APIC_reg_00 id_reg;
++      unsigned long flags;
++      int pin;
++
++      if (saved == NULL)
++              return;
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      id_reg.raw = io_apic_read(ioapic_id, 0);
++      if (id_reg.bits.ID != mp_ioapics[ioapic_id].apicid) {
++              id_reg.bits.ID = mp_ioapics[ioapic_id].apicid;
++              io_apic_write(ioapic_id, 0, id_reg.raw);
++      }
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++      for (pin = 0; pin < nr_ioapic_registers[ioapic_id]; pin++)
++              ioapic_write_entry(ioapic_id, pin, saved[pin]);
++}
++
++/* syscore .resume hook: restore all I/O APICs, highest index first. */
++static void ioapic_resume(void)
++{
++      int idx = nr_ioapics;
++
++      while (--idx >= 0)
++              resume_ioapic(idx);
++}
++
++/* Suspend/resume hooks registered by ioapic_init_ops(). */
++static struct syscore_ops ioapic_syscore_ops = {
++      .resume         = ioapic_resume,
++      .suspend        = ioapic_suspend,
++};
++
++/*
++ * Allocate one zeroed save buffer per I/O APIC (one route entry per
++ * register) and register the suspend/resume hooks.
++ *
++ * An allocation failure is not fatal: it is logged, the slot stays NULL,
++ * and suspend_ioapic()/resume_ioapic() skip that I/O APIC.  Always
++ * returns 0.
++ */
++static int __init ioapic_init_ops(void)
++{
++      int i;
++
++      for (i = 0; i < nr_ioapics; i++) {
++              /* kcalloc() checks the count * size multiplication for overflow */
++              ioapic_saved_data[i] = kcalloc(nr_ioapic_registers[i],
++                                             sizeof(*ioapic_saved_data[i]),
++                                             GFP_KERNEL);
++              if (!ioapic_saved_data[i])
++                      pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
++      }
++
++      register_syscore_ops(&ioapic_syscore_ops);
++
++      return 0;
++}
++
++device_initcall(ioapic_init_ops);
++
++/*
++ * Dynamic irq allocation and deallocation
++ */
++/*
++ * Allocate an irq at or above max(@from, nr_irqs_gsi) on @node and assign
++ * it a vector.  Returns the new irq number, or 0 on failure.
++ */
++unsigned int create_irq_nr(unsigned int from, int node)
++{
++      unsigned int new_irq = 0;
++      struct irq_cfg *cfg;
++      unsigned long flags;
++      int irq;
++
++      if (from < nr_irqs_gsi)
++              from = nr_irqs_gsi;
++
++      irq = alloc_irq_from(from, node);
++      if (irq < 0)
++              return 0;
++
++      cfg = alloc_irq_cfg(irq, node);
++      if (cfg == NULL) {
++              free_irq_at(irq, NULL);
++              return 0;
++      }
++
++      raw_spin_lock_irqsave(&vector_lock, flags);
++      if (__assign_irq_vector(irq, cfg, apic->target_cpus()) == 0)
++              new_irq = irq;
++      raw_spin_unlock_irqrestore(&vector_lock, flags);
++
++      if (!new_irq) {
++              /* No vector available: give the irq and its cfg back. */
++              free_irq_at(irq, cfg);
++              return 0;
++      }
++
++      irq_set_chip_data(irq, cfg);
++      irq_clear_status_flags(irq, IRQ_NOREQUEST);
++      return new_irq;
++}
++
++/*
++ * Allocate a dynamic irq starting at nr_irqs_gsi on CPU 0's node.
++ * Returns the irq number, or -1 where create_irq_nr() reported 0.
++ */
++int create_irq(void)
++{
++      unsigned int irq = create_irq_nr(nr_irqs_gsi, cpu_to_node(0));
++
++      return irq ? (int)irq : -1;
++}
++
++/*
++ * Tear down @irq: mark it non-requestable and non-probeable, free its
++ * interrupt-remapping entry if it is remapped, clear its vector under
++ * vector_lock and finally hand the irq and its cfg to free_irq_at().
++ */
++void destroy_irq(unsigned int irq)
++{
++      struct irq_cfg *cfg = irq_get_chip_data(irq);
++      unsigned long flags;
++
++      irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
++
++      if (irq_remapped(cfg))
++              free_irte(irq);
++      raw_spin_lock_irqsave(&vector_lock, flags);
++      __clear_irq_vector(irq, cfg);
++      raw_spin_unlock_irqrestore(&vector_lock, flags);
++      free_irq_at(irq, cfg);
++}
++#endif /* !CONFIG_XEN */
++
++/*
++ * MSI message composition
++ */
++#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN)
++/*
++ * Assign a vector to @irq and fill @msg with the matching MSI address/data.
++ * For a remapped irq the IRTE is programmed as well, with its source-id
++ * taken from @pdev when non-NULL and from @hpet_id otherwise.
++ * Returns 0 on success, -ENXIO if the APIC is disabled, or the error from
++ * assign_irq_vector().
++ */
++static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
++                         struct msi_msg *msg, u8 hpet_id)
++{
++      struct irq_cfg *cfg;
++      unsigned int dest;
++      bool lowest_prio;
++      int ret;
++
++      if (disable_apic)
++              return -ENXIO;
++
++      cfg = irq_cfg(irq);
++      ret = assign_irq_vector(irq, cfg, apic->target_cpus());
++      if (ret)
++              return ret;
++
++      dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
++
++      if (irq_remapped(cfg)) {
++              struct irte irte;
++              u16 sub_handle;
++              int ir_index = map_irq_to_irte_handle(irq, &sub_handle);
++
++              BUG_ON(ir_index == -1);
++
++              prepare_irte(&irte, cfg->vector, dest);
++
++              /* Set source-id of interrupt request */
++              if (pdev != NULL)
++                      set_msi_sid(&irte, pdev);
++              else
++                      set_hpet_sid(&irte, hpet_id);
++
++              modify_irte(irq, &irte);
++
++              /* The message carries the IRTE index and sub-handle. */
++              msg->address_hi = MSI_ADDR_BASE_HI;
++              msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
++                                MSI_ADDR_IR_SHV |
++                                MSI_ADDR_IR_INDEX1(ir_index) |
++                                MSI_ADDR_IR_INDEX2(ir_index);
++              msg->data = sub_handle;
++              return 0;
++      }
++
++      lowest_prio = (apic->irq_delivery_mode == dest_LowestPrio);
++
++      /* Only x2apic puts destination bits into the high address word. */
++      msg->address_hi = x2apic_enabled() ?
++                      (MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest)) :
++                      MSI_ADDR_BASE_HI;
++
++      msg->address_lo = MSI_ADDR_BASE_LO |
++                        ((apic->irq_dest_mode == 0) ?
++                              MSI_ADDR_DEST_MODE_PHYSICAL :
++                              MSI_ADDR_DEST_MODE_LOGICAL) |
++                        (lowest_prio ?
++                              MSI_ADDR_REDIRECTION_LOWPRI :
++                              MSI_ADDR_REDIRECTION_CPU) |
++                        MSI_ADDR_DEST_ID(dest);
++
++      msg->data = MSI_DATA_TRIGGER_EDGE |
++                  MSI_DATA_LEVEL_ASSERT |
++                  (lowest_prio ?
++                      MSI_DATA_DELIVERY_LOWPRI :
++                      MSI_DATA_DELIVERY_FIXED) |
++                  MSI_DATA_VECTOR(cfg->vector);
++      return 0;
++}
++
++#ifdef CONFIG_SMP
++/*
++ * .irq_set_affinity for msi_chip: pick a new vector/destination and patch
++ * them into the cached MSI message.  Returns 0, or -1 if
++ * __ioapic_set_affinity() fails.  @force is not used.
++ */
++static int
++msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
++{
++      struct irq_cfg *cfg = data->chip_data;
++      unsigned int dest;
++      struct msi_msg msg;
++
++      if (__ioapic_set_affinity(data, mask, &dest) != 0)
++              return -1;
++
++      __get_cached_msi_msg(data->msi_desc, &msg);
++
++      /* Replace only the vector and destination-ID fields. */
++      msg.data = (msg.data & ~MSI_DATA_VECTOR_MASK) |
++                 MSI_DATA_VECTOR(cfg->vector);
++      msg.address_lo = (msg.address_lo & ~MSI_ADDR_DEST_ID_MASK) |
++                       MSI_ADDR_DEST_ID(dest);
++
++      __write_msi_msg(data->msi_desc, &msg);
++
++      return 0;
++}
++#ifdef CONFIG_INTR_REMAP
++/*
++ * Migrate the MSI irq to another cpumask. This migration is
++ * done in the process context using interrupt-remapping hardware.
++ */
++static int
++ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
++                  bool force)
++{
++      struct irq_cfg *cfg = data->chip_data;
++      unsigned int irq = data->irq;
++      unsigned int dest;
++      struct irte irte;
++
++      /* Fetch the current IRTE first, then pick the new vector/destination. */
++      if (get_irte(irq, &irte) || __ioapic_set_affinity(data, mask, &dest))
++              return -1;
++
++      irte.vector = cfg->vector;
++      irte.dest_id = IRTE_DEST(dest);
++
++      /*
++       * atomically update the IRTE with the new destination and vector.
++       */
++      modify_irte(irq, &irte);
++
++      /*
++       * From here on interrupts arrive at the new destination, so the
++       * previous vector allocation can be cleaned up.
++       */
++      if (cfg->move_in_progress)
++              send_cleanup_vector(cfg);
++
++      return 0;
++}
++
++#endif
++#endif /* CONFIG_SMP */
++
++/*
++ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
++ * which implement the MSI or MSI-X Capability Structure.
++ */
++static struct irq_chip msi_chip = {
++      .name                   = "PCI-MSI",
++      .irq_ack                = ack_apic_edge,
++      .irq_mask               = mask_msi_irq,
++      .irq_unmask             = unmask_msi_irq,
++      .irq_retrigger          = ioapic_retrigger_irq,
++#ifdef CONFIG_SMP
++      .irq_set_affinity       = msi_set_affinity,
++#endif
++};
++
++/* MSI chip used by setup_msi_irq() when the irq is interrupt-remapped. */
++static struct irq_chip msi_ir_chip = {
++      .name                   = "IR-PCI-MSI",
++      .irq_mask               = mask_msi_irq,
++      .irq_unmask             = unmask_msi_irq,
++      .irq_retrigger          = ioapic_retrigger_irq,
++#ifdef CONFIG_INTR_REMAP
++      .irq_ack                = ir_ack_apic_edge,
++#ifdef CONFIG_SMP
++      .irq_set_affinity       = ir_msi_set_affinity,
++#endif
++#endif
++};
++
++/*
++ * Map the PCI dev to the corresponding remapping hardware unit
++ * and allocate 'nvec' consecutive interrupt-remapping table entries
++ * in it.
++ */
++static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
++{
++      struct intel_iommu *iommu = map_dev_to_ir(dev);
++      int index;
++
++      if (iommu == NULL) {
++              printk(KERN_ERR
++                     "Unable to map PCI %s to iommu\n", pci_name(dev));
++              return -ENOENT;
++      }
++
++      index = alloc_irte(iommu, irq, nvec);
++      if (index >= 0)
++              return index;
++
++      /* Any alloc_irte() failure is reported to the caller as -ENOSPC. */
++      printk(KERN_ERR
++             "Unable to allocate %d IRTE for PCI %s\n", nvec,
++             pci_name(dev));
++      return -ENOSPC;
++}
++
++/*
++ * Compose and write the MSI message for @irq, attach @msidesc and install
++ * the edge handler on msi_chip, or on msi_ir_chip when the irq is remapped.
++ * Returns 0, or the negative error from msi_compose_msg().
++ */
++static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
++{
++      struct irq_chip *chip;
++      struct msi_msg msg;
++      int err;
++
++      err = msi_compose_msg(dev, irq, &msg, -1);
++      if (err < 0)
++              return err;
++
++      irq_set_msi_desc(irq, msidesc);
++      write_msi_msg(irq, &msg);
++
++      if (irq_remapped(irq_get_chip_data(irq))) {
++              irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
++              chip = &msi_ir_chip;
++      } else {
++              chip = &msi_chip;
++      }
++
++      irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
++
++      dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
++
++      return 0;
++}
++
++/*
++ * Create and set up one irq for every entry on @dev's msi_list.
++ * Returns 0 on success, 1 when more than one plain-MSI vector is requested,
++ * -1 if no irq could be created, or a negative errno from a later step
++ * (in which case the irq being set up is destroyed again).
++ */
++int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++      struct intel_iommu *iommu = NULL;
++      struct msi_desc *msidesc;
++      unsigned int irq, irq_want;
++      int node, ret, index = 0;
++      int sub_handle = 0;
++
++      /* x86 doesn't support multiple MSI yet */
++      if (type == PCI_CAP_ID_MSI && nvec > 1)
++              return 1;
++
++      node = dev_to_node(&dev->dev);
++      irq_want = nr_irqs_gsi;
++
++      list_for_each_entry(msidesc, &dev->msi_list, list) {
++              irq = create_irq_nr(irq_want, node);
++              if (irq == 0)
++                      return -1;
++              irq_want = irq + 1;
++
++              if (intr_remapping_enabled) {
++                      if (!sub_handle) {
++                              /*
++                               * allocate the consecutive block of IRTE's
++                               * for 'nvec'
++                               */
++                              index = msi_alloc_irte(dev, irq, nvec);
++                              if (index < 0) {
++                                      ret = index;
++                                      goto error;
++                              }
++                      } else {
++                              iommu = map_dev_to_ir(dev);
++                              if (!iommu) {
++                                      ret = -ENOENT;
++                                      goto error;
++                              }
++                              /*
++                               * setup the mapping between the irq and the
++                               * IRTE base index, the sub_handle pointing to
++                               * the appropriate interrupt remap table entry.
++                               */
++                              set_irte_irq(irq, iommu, index, sub_handle);
++                      }
++              }
++
++              ret = setup_msi_irq(dev, msidesc, irq);
++              if (ret < 0)
++                      goto error;
++              sub_handle++;
++      }
++      return 0;
++
++error:
++      destroy_irq(irq);
++      return ret;
++}
++
++/* Tear down one MSI irq; simply forwards to destroy_irq(). */
++void native_teardown_msi_irq(unsigned int irq)
++{
++      destroy_irq(irq);
++}
++
++#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
++#ifdef CONFIG_SMP
++/*
++ * .irq_set_affinity for dmar_msi_type: pick a new vector/destination and
++ * rewrite the DMAR MSI message.  Returns 0, or -1 if
++ * __ioapic_set_affinity() fails.  @force is not used.
++ */
++static int
++dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
++                    bool force)
++{
++      struct irq_cfg *cfg = data->chip_data;
++      unsigned int irq = data->irq;
++      unsigned int dest;
++      struct msi_msg msg;
++
++      if (__ioapic_set_affinity(data, mask, &dest) != 0)
++              return -1;
++
++      dmar_msi_read(irq, &msg);
++
++      /* Replace the vector and destination fields; address_hi is rebuilt. */
++      msg.data = (msg.data & ~MSI_DATA_VECTOR_MASK) |
++                 MSI_DATA_VECTOR(cfg->vector);
++      msg.address_lo = (msg.address_lo & ~MSI_ADDR_DEST_ID_MASK) |
++                       MSI_ADDR_DEST_ID(dest);
++      msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
++
++      dmar_msi_write(irq, &msg);
++
++      return 0;
++}
++
++#endif /* CONFIG_SMP */
++
++/* irq_chip installed by arch_setup_dmar_msi(). */
++static struct irq_chip dmar_msi_type = {
++      .name                   = "DMAR_MSI",
++      .irq_ack                = ack_apic_edge,
++      .irq_mask               = dmar_msi_mask,
++      .irq_unmask             = dmar_msi_unmask,
++      .irq_retrigger          = ioapic_retrigger_irq,
++#ifdef CONFIG_SMP
++      .irq_set_affinity       = dmar_msi_set_affinity,
++#endif
++};
++
++/*
++ * Compose and write the MSI message for a DMAR irq and install the edge
++ * handler on dmar_msi_type.  Returns 0, or the negative error from
++ * msi_compose_msg().
++ */
++int arch_setup_dmar_msi(unsigned int irq)
++{
++      struct msi_msg msg;
++      int err = msi_compose_msg(NULL, irq, &msg, -1);
++
++      if (err < 0)
++              return err;
++
++      dmar_msi_write(irq, &msg);
++      irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
++                                    "edge");
++      return 0;
++}
++#endif
++
++#ifdef CONFIG_HPET_TIMER
++
++#ifdef CONFIG_SMP
++/*
++ * .irq_set_affinity for hpet_msi_type: pick a new vector/destination and
++ * patch them into the HPET MSI message reached through data->handler_data.
++ * Returns 0, or -1 if __ioapic_set_affinity() fails.  @force is not used.
++ */
++static int hpet_msi_set_affinity(struct irq_data *data,
++                               const struct cpumask *mask, bool force)
++{
++      struct irq_cfg *cfg = data->chip_data;
++      unsigned int dest;
++      struct msi_msg msg;
++
++      if (__ioapic_set_affinity(data, mask, &dest) != 0)
++              return -1;
++
++      hpet_msi_read(data->handler_data, &msg);
++
++      /* Replace only the vector and destination-ID fields. */
++      msg.data = (msg.data & ~MSI_DATA_VECTOR_MASK) |
++                 MSI_DATA_VECTOR(cfg->vector);
++      msg.address_lo = (msg.address_lo & ~MSI_ADDR_DEST_ID_MASK) |
++                       MSI_ADDR_DEST_ID(dest);
++
++      hpet_msi_write(data->handler_data, &msg);
++
++      return 0;
++}
++
++#endif /* CONFIG_SMP */
++
++/* HPET MSI chip used by arch_setup_hpet_msi() when the irq is remapped. */
++static struct irq_chip ir_hpet_msi_type = {
++      .name                   = "IR-HPET_MSI",
++      .irq_mask               = hpet_msi_mask,
++      .irq_unmask             = hpet_msi_unmask,
++      .irq_retrigger          = ioapic_retrigger_irq,
++#ifdef CONFIG_INTR_REMAP
++      .irq_ack                = ir_ack_apic_edge,
++#ifdef CONFIG_SMP
++      .irq_set_affinity       = ir_msi_set_affinity,
++#endif
++#endif
++};
++
++/* Default HPET MSI chip used by arch_setup_hpet_msi(). */
++static struct irq_chip hpet_msi_type = {
++      .name                   = "HPET_MSI",
++      .irq_ack                = ack_apic_edge,
++      .irq_mask               = hpet_msi_mask,
++      .irq_unmask             = hpet_msi_unmask,
++      .irq_retrigger          = ioapic_retrigger_irq,
++#ifdef CONFIG_SMP
++      .irq_set_affinity       = hpet_msi_set_affinity,
++#endif
++};
++
++/*
++ * Set up @irq as the MSI interrupt of the HPET identified by @id.
++ * With interrupt remapping enabled an IRTE is allocated first.
++ * Returns 0 on success, -1 if the IOMMU lookup or IRTE allocation fails,
++ * or the negative error from msi_compose_msg().
++ */
++int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
++{
++      struct irq_chip *chip;
++      struct msi_msg msg;
++      int err;
++
++      if (intr_remapping_enabled) {
++              struct intel_iommu *iommu = map_hpet_to_ir(id);
++
++              if (!iommu || alloc_irte(iommu, irq, 1) < 0)
++                      return -1;
++      }
++
++      err = msi_compose_msg(NULL, irq, &msg, id);
++      if (err < 0)
++              return err;
++
++      hpet_msi_write(irq_get_handler_data(irq), &msg);
++      irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
++
++      if (irq_remapped(irq_get_chip_data(irq)))
++              chip = &ir_hpet_msi_type;
++      else
++              chip = &hpet_msi_type;
++
++      irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
++      return 0;
++}
++#endif
++
++#endif /* CONFIG_PCI_MSI */
++/*
++ * Hypertransport interrupt support
++ */
++#ifdef CONFIG_HT_IRQ
++
++#ifdef CONFIG_SMP
++
++/* Rewrite the vector and destination fields of @irq's HT message. */
++static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
++{
++      struct ht_irq_msg msg;
++
++      fetch_ht_irq_msg(irq, &msg);
++
++      msg.address_lo = (msg.address_lo &
++                        ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK)) |
++                       HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
++      msg.address_hi = (msg.address_hi & ~(HT_IRQ_HIGH_DEST_ID_MASK)) |
++                       HT_IRQ_HIGH_DEST_ID(dest);
++
++      write_ht_irq_msg(irq, &msg);
++}
++
++/*
++ * .irq_set_affinity for ht_irq_chip.  Returns 0, or -1 if
++ * __ioapic_set_affinity() fails.  @force is not used.
++ */
++static int
++ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
++{
++      struct irq_cfg *cfg = data->chip_data;
++      unsigned int dest;
++
++      if (__ioapic_set_affinity(data, mask, &dest) != 0)
++              return -1;
++
++      target_ht_irq(data->irq, dest, cfg->vector);
++
++      return 0;
++}
++
++#endif
++
++/* irq_chip installed by arch_setup_ht_irq(). */
++static struct irq_chip ht_irq_chip = {
++      .name                   = "PCI-HT",
++      .irq_ack                = ack_apic_edge,
++      .irq_mask               = mask_ht_irq,
++      .irq_unmask             = unmask_ht_irq,
++      .irq_retrigger          = ioapic_retrigger_irq,
++#ifdef CONFIG_SMP
++      .irq_set_affinity       = ht_set_affinity,
++#endif
++};
++
++/*
++ * Assign a vector to @irq, write its initial (masked) HT message and
++ * install the edge handler on ht_irq_chip.
++ * Returns 0 on success, -ENXIO if the APIC is disabled, or the error from
++ * assign_irq_vector().
++ */
++int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
++{
++      struct ht_irq_msg msg;
++      struct irq_cfg *cfg;
++      unsigned int dest;
++      int err;
++
++      if (disable_apic)
++              return -ENXIO;
++
++      cfg = irq_cfg(irq);
++      err = assign_irq_vector(irq, cfg, apic->target_cpus());
++      if (err)
++              return err;
++
++      dest = apic->cpu_mask_to_apicid_and(cfg->domain,
++                                          apic->target_cpus());
++
++      msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
++
++      /* The message starts out masked (HT_IRQ_LOW_IRQ_MASKED). */
++      msg.address_lo =
++              HT_IRQ_LOW_BASE |
++              HT_IRQ_LOW_DEST_ID(dest) |
++              HT_IRQ_LOW_VECTOR(cfg->vector) |
++              ((apic->irq_dest_mode == 0) ?
++                      HT_IRQ_LOW_DM_PHYSICAL :
++                      HT_IRQ_LOW_DM_LOGICAL) |
++              HT_IRQ_LOW_RQEOI_EDGE |
++              ((apic->irq_delivery_mode != dest_LowestPrio) ?
++                      HT_IRQ_LOW_MT_FIXED :
++                      HT_IRQ_LOW_MT_ARBITRATED) |
++              HT_IRQ_LOW_IRQ_MASKED;
++
++      write_ht_irq_msg(irq, &msg);
++
++      irq_set_chip_and_handler_name(irq, &ht_irq_chip,
++                                    handle_edge_irq, "edge");
++
++      dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
++
++      return 0;
++}
++#endif /* CONFIG_HT_IRQ */
++
++/*
++ * Allocate @irq and its cfg on @node, record the (ioapic, pin) pair from
++ * @attr and program the routing entry.
++ * Returns 0 on success, -EINVAL if no cfg could be allocated, or the
++ * error from __add_pin_to_irq_node().
++ */
++static int
++io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
++{
++      struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
++      int err;
++
++      if (cfg == NULL)
++              return -EINVAL;
++
++      err = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
++      if (err)
++              return err;
++
++      setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
++                       attr->trigger, attr->polarity);
++      return 0;
++}
++
++/*
++ * Like io_apic_setup_irq_pin(), but skips pins already marked in the
++ * per-I/O-APIC pin_programmed bitmap and marks the pin on success.
++ * Returns 0 when the pin was already programmed or has now been set up,
++ * otherwise the error from io_apic_setup_irq_pin().
++ */
++int io_apic_setup_irq_pin_once(unsigned int irq, int node,
++                             struct io_apic_irq_attr *attr)
++{
++      unsigned int ioapic_idx = attr->ioapic;
++      unsigned int pin = attr->ioapic_pin;
++      int err;
++
++      /* Avoid redundant programming */
++      if (test_bit(pin, mp_ioapic_routing[ioapic_idx].pin_programmed)) {
++              pr_debug("Pin %d-%d already programmed\n",
++                       mp_ioapics[ioapic_idx].apicid, pin);
++              return 0;
++      }
++
++      err = io_apic_setup_irq_pin(irq, node, attr);
++      if (err)
++              return err;
++
++      set_bit(pin, mp_ioapic_routing[ioapic_idx].pin_programmed);
++      return 0;
++}
++
++/* Return the number of redirection entries of I/O APIC @ioapic. */
++static int __init io_apic_get_redir_entries(int ioapic)
++{
++      union IO_APIC_reg_01 ver_reg;
++      unsigned long flags;
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      ver_reg.raw = io_apic_read(ioapic, 1);
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++      /*
++       * The register holds the highest redirection-entry index, which
++       * is one less than the number of entries.
++       */
++      return 1 + ver_reg.bits.entries;
++}
++
++#ifndef CONFIG_XEN
++/* Raise nr_irqs_gsi to gsi_top + NR_IRQS_LEGACY if it is lower, and log it. */
++static void __init probe_nr_irqs_gsi(void)
++{
++      int wanted = gsi_top + NR_IRQS_LEGACY;
++
++      if (wanted > nr_irqs_gsi)
++              nr_irqs_gsi = wanted;
++
++      printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
++}
++
++/* Accessor for nr_irqs_gsi. */
++int get_nr_irqs_gsi(void)
++{
++      return nr_irqs_gsi;
++}
++
++#ifdef CONFIG_SPARSE_IRQ
++/*
++ * Clamp nr_irqs: never more than NR_VECTORS per possible CPU id, and never
++ * more than the GSI range plus room for dynamic irqs.
++ * Returns NR_IRQS_LEGACY.
++ */
++int __init arch_probe_nr_irqs(void)
++{
++      int limit;
++
++      if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
++              nr_irqs = NR_VECTORS * nr_cpu_ids;
++
++      limit = nr_irqs_gsi + 8 * nr_cpu_ids;
++#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
++      /*
++       * for MSI and HT dyn irq
++       */
++      limit += nr_irqs_gsi * 16;
++#endif
++      if (limit < nr_irqs)
++              nr_irqs = limit;
++
++      return NR_IRQS_LEGACY;
++}
++#endif
++#endif /* CONFIG_XEN */
++
++int io_apic_set_pci_routing(struct device *dev, int irq,
++                          struct io_apic_irq_attr *irq_attr)
++{
++      int node;
++
++#ifdef CONFIG_XEN
++      if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) {
++              apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n",
++                          irq_attr->ioapic, irq);
++              return -EINVAL;
++      }
++#endif
++      if (!IO_APIC_IRQ(irq)) {
++              apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
++                          irq_attr->ioapic);
++              return -EINVAL;
++      }
++
++      node = dev ? dev_to_node(dev) : cpu_to_node(0);
++
++      return io_apic_setup_irq_pin_once(irq, node, irq_attr);
++}
++
++#ifdef CONFIG_X86_32
++#ifndef CONFIG_XEN
++static int __init io_apic_get_unique_id(int ioapic, int apic_id)
++{
++      union IO_APIC_reg_00 reg_00;
++      static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
++      physid_mask_t tmp;
++      unsigned long flags;
++      int i = 0;
++
++      /*
++       * The P4 platform supports up to 256 APIC IDs on two separate APIC
++       * buses (one for LAPICs, one for IOAPICs), where predecessors only
++       * support up to 16 on one shared APIC bus.
++       *
++       * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
++       *      advantage of new APIC bus architecture.
++       */
++
++      if (physids_empty(apic_id_map))
++              apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      reg_00.raw = io_apic_read(ioapic, 0);
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++      if (apic_id >= get_physical_broadcast()) {
++              printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
++                      "%d\n", ioapic, apic_id, reg_00.bits.ID);
++              apic_id = reg_00.bits.ID;
++      }
++
++      /*
++       * Every APIC in a system must have a unique ID or we get lots of nice
++       * 'stuck on smp_invalidate_needed IPI wait' messages.
++       */
++      if (apic->check_apicid_used(&apic_id_map, apic_id)) {
++
++              for (i = 0; i < get_physical_broadcast(); i++) {
++                      if (!apic->check_apicid_used(&apic_id_map, i))
++                              break;
++              }
++
++              if (i == get_physical_broadcast())
++                      panic("Max apic_id exceeded!\n");
++
++              printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
++                      "trying %d\n", ioapic, apic_id, i);
++
++              apic_id = i;
++      }
++
++      apic->apicid_to_cpu_present(apic_id, &tmp);
++      physids_or(apic_id_map, apic_id_map, tmp);
++
++      if (reg_00.bits.ID != apic_id) {
++              reg_00.bits.ID = apic_id;
++
++              raw_spin_lock_irqsave(&ioapic_lock, flags);
++              io_apic_write(ioapic, 0, reg_00.raw);
++              reg_00.raw = io_apic_read(ioapic, 0);
++              raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++              /* Sanity check */
++              if (reg_00.bits.ID != apic_id) {
++                      printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
++                      return -1;
++              }
++      }
++
++      apic_printk(APIC_VERBOSE, KERN_INFO
++                      "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
++
++      return apic_id;
++}
++#endif
++
++static u8 __init io_apic_unique_id(u8 id)
++{
++#ifndef CONFIG_XEN
++      if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
++          !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
++              return io_apic_get_unique_id(nr_ioapics, id);
++      else
++#endif
++              return id;
++}
++#else
++static u8 __init io_apic_unique_id(u8 id)
++{
++      int i;
++      DECLARE_BITMAP(used, 256);
++
++      bitmap_zero(used, 256);
++      for (i = 0; i < nr_ioapics; i++) {
++              struct mpc_ioapic *ia = &mp_ioapics[i];
++              __set_bit(ia->apicid, used);
++      }
++      if (!test_bit(id, used))
++              return id;
++      return find_first_zero_bit(used, 256);
++}
++#endif
++
++static int __init io_apic_get_version(int ioapic)
++{
++      union IO_APIC_reg_01    reg_01;
++      unsigned long flags;
++
++      raw_spin_lock_irqsave(&ioapic_lock, flags);
++      reg_01.raw = io_apic_read(ioapic, 1);
++      raw_spin_unlock_irqrestore(&ioapic_lock, flags);
++
++      return reg_01.bits.version;
++}
++
++int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
++{
++      int ioapic, pin, idx;
++
++      if (skip_ioapic_setup)
++              return -1;
++
++      ioapic = mp_find_ioapic(gsi);
++      if (ioapic < 0)
++              return -1;
++
++      pin = mp_find_ioapic_pin(ioapic, gsi);
++      if (pin < 0)
++              return -1;
++
++      idx = find_irq_entry(ioapic, pin, mp_INT);
++      if (idx < 0)
++              return -1;
++
++      *trigger = irq_trigger(idx);
++      *polarity = irq_polarity(idx);
++      return 0;
++}
++
++#ifndef CONFIG_XEN
++/*
++ * This function currently is only a helper for the i386 smp boot process where
++ * we need to reprogram the ioredtbls to cater for the cpus which have come online
++ * so mask in all cases should simply be apic->target_cpus()
++ */
++#ifdef CONFIG_SMP
++void __init setup_ioapic_dest(void)
++{
++      int pin, ioapic, irq, irq_entry;
++      const struct cpumask *mask;
++      struct irq_data *idata;
++
++      if (skip_ioapic_setup == 1)
++              return;
++
++      for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
++      for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
++              irq_entry = find_irq_entry(ioapic, pin, mp_INT);
++              if (irq_entry == -1)
++                      continue;
++              irq = pin_2_irq(irq_entry, ioapic, pin);
++
++              if ((ioapic > 0) && (irq > 16))
++                      continue;
++
++              idata = irq_get_irq_data(irq);
++
++              /*
++               * Honour affinities which have been set in early boot
++               */
++              if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
++                      mask = idata->affinity;
++              else
++                      mask = apic->target_cpus();
++
++              if (intr_remapping_enabled)
++                      ir_ioapic_set_affinity(idata, mask, false);
++              else
++                      ioapic_set_affinity(idata, mask, false);
++      }
++
++}
++#endif
++
++#define IOAPIC_RESOURCE_NAME_SIZE 11
++
++static struct resource *ioapic_resources;
++
++static struct resource * __init ioapic_setup_resources(int nr_ioapics)
++{
++      unsigned long n;
++      struct resource *res;
++      char *mem;
++      int i;
++
++      if (nr_ioapics <= 0)
++              return NULL;
++
++      n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
++      n *= nr_ioapics;
++
++      mem = alloc_bootmem(n);
++      res = (void *)mem;
++
++      mem += sizeof(struct resource) * nr_ioapics;
++
++      for (i = 0; i < nr_ioapics; i++) {
++              res[i].name = mem;
++              res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
++              snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
++              mem += IOAPIC_RESOURCE_NAME_SIZE;
++      }
++
++      ioapic_resources = res;
++
++      return res;
++}
++
++void __init ioapic_and_gsi_init(void)
++{
++      unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
++      struct resource *ioapic_res;
++      int i;
++
++      ioapic_res = ioapic_setup_resources(nr_ioapics);
++      for (i = 0; i < nr_ioapics; i++) {
++              if (smp_found_config) {
++                      ioapic_phys = mp_ioapics[i].apicaddr;
++#ifdef CONFIG_X86_32
++                      if (!ioapic_phys) {
++                              printk(KERN_ERR
++                                     "WARNING: bogus zero IO-APIC "
++                                     "address found in MPTABLE, "
++                                     "disabling IO/APIC support!\n");
++                              smp_found_config = 0;
++                              skip_ioapic_setup = 1;
++                              goto fake_ioapic_page;
++                      }
++#endif
++              } else {
++#ifdef CONFIG_X86_32
++fake_ioapic_page:
++#endif
++                      ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
++                      ioapic_phys = __pa(ioapic_phys);
++              }
++              set_fixmap_nocache(idx, ioapic_phys);
++              apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
++                      __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
++                      ioapic_phys);
++              idx++;
++
++              ioapic_res->start = ioapic_phys;
++              ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
++              ioapic_res++;
++      }
++
++      probe_nr_irqs_gsi();
++}
++
++void __init ioapic_insert_resources(void)
++{
++      int i;
++      struct resource *r = ioapic_resources;
++
++      if (!r) {
++              if (nr_ioapics > 0)
++                      printk(KERN_ERR
++                              "IO APIC resources couldn't be allocated.\n");
++              return;
++      }
++
++      for (i = 0; i < nr_ioapics; i++) {
++              insert_resource(&iomem_resource, r);
++              r++;
++      }
++}
++#endif /* !CONFIG_XEN */
++
++int mp_find_ioapic(u32 gsi)
++{
++      int i = 0;
++
++      if (nr_ioapics == 0)    /* nothing registered: fail without logging */
++              return -1;
++
++      /* Return the index of the IOAPIC whose GSI range contains gsi. */
++      for (i = 0; i < nr_ioapics; i++) {
++              if ((gsi >= mp_gsi_routing[i].gsi_base)
++                  && (gsi <= mp_gsi_routing[i].gsi_end))
++                      return i;
++      }
++      /* gsi is u32: print it unsigned so large bogus values stay readable */
++      printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %u\n", gsi);
++      return -1;
++}
++
++int mp_find_ioapic_pin(int ioapic, u32 gsi)
++{
++      if (WARN_ON(ioapic == -1))
++              return -1;
++      if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end))
++              return -1;
++
++      return gsi - mp_gsi_routing[ioapic].gsi_base;
++}
++
++static __init int bad_ioapic(unsigned long address)
++{
++      if (nr_ioapics >= MAX_IO_APICS) {
++              printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
++                     "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
++              return 1;
++      }
++      if (!address) {
++              printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
++                     " found in table, skipping!\n");
++              return 1;
++      }
++      return 0;
++}
++
++void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
++{
++      int idx = 0;
++      int entries;
++
++      if (bad_ioapic(address))
++              return;
++
++      idx = nr_ioapics;
++
++      mp_ioapics[idx].type = MP_IOAPIC;
++      mp_ioapics[idx].flags = MPC_APIC_USABLE;
++      mp_ioapics[idx].apicaddr = address;
++
++#ifndef CONFIG_XEN
++      set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
++#endif
++      mp_ioapics[idx].apicid = io_apic_unique_id(id);
++      mp_ioapics[idx].apicver = io_apic_get_version(idx);
++
++      /*
++       * Build basic GSI lookup table to facilitate gsi->io_apic lookups
++       * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
++       */
++      entries = io_apic_get_redir_entries(idx);
++      mp_gsi_routing[idx].gsi_base = gsi_base;
++      mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
++
++      /*
++       * The number of IO-APIC IRQ registers (== #pins):
++       */
++      nr_ioapic_registers[idx] = entries;
++
++      if (mp_gsi_routing[idx].gsi_end >= gsi_top)
++              gsi_top = mp_gsi_routing[idx].gsi_end + 1;
++
++      printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
++             "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
++             mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
++             mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end);
++
++      nr_ioapics++;
++}
++
++#ifdef CONFIG_X86_MRST
++/* Enable IOAPIC early just for system timer */
++void __init pre_init_apic_IRQ0(void)
++{
++      struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
++
++      printk(KERN_INFO "Early APIC setup for system timer0\n");
++#ifndef CONFIG_SMP
++      physid_set_mask_of_physid(boot_cpu_physical_apicid,
++                                       &phys_cpu_present_map);
++#endif
++      setup_local_APIC();
++
++      io_apic_setup_irq_pin(0, 0, &attr);
++      irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
++                                    "edge");
++}
++#endif
diff --cc arch/x86/kernel/apic/ipi-xen.c

index 0000000,0000000..a3ee607

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/apic/ipi-xen.c
@@@ -1,0 -1,0 +1,43 @@@
++#include <linux/cpumask.h>
++#include <linux/interrupt.h>
++
++#include <asm/smp.h>
++#include <asm/ipi.h>
++
++#ifdef CONFIG_SMP
++#include <xen/evtchn.h>
++
++void xen_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
++{
++      unsigned int cpu, this_cpu = smp_processor_id();
++
++      WARN_ON(!cpumask_subset(cpumask, cpu_online_mask));
++      for_each_cpu_and(cpu, cpumask, cpu_online_mask)
++              if (cpu != this_cpu)
++                      notify_remote_via_ipi(vector, cpu);
++}
++
++void xen_send_IPI_mask(const struct cpumask *cpumask, int vector)
++{
++      unsigned int cpu;
++
++      WARN_ON(!cpumask_subset(cpumask, cpu_online_mask));
++      for_each_cpu_and(cpu, cpumask, cpu_online_mask)
++              notify_remote_via_ipi(vector, cpu);
++}
++
++void xen_send_IPI_allbutself(int vector)
++{
++      xen_send_IPI_mask_allbutself(cpu_online_mask, vector);
++}
++
++void xen_send_IPI_all(int vector)
++{
++      xen_send_IPI_mask(cpu_online_mask, vector);
++}
++
++void xen_send_IPI_self(int vector)
++{
++      notify_remote_via_ipi(vector, smp_processor_id());
++}
++#endif
diff --cc arch/x86/kernel/apic/probe_32-xen.c

index 0000000,0000000..8602fa9

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_32-xen.c
@@@ -1,0 -1,0 +1,57 @@@
++/*
++ * Default generic APIC driver. This handles up to 8 CPUs.
++ *
++ * Copyright 2003 Andi Kleen, SuSE Labs.
++ * Subject to the GNU Public License, v.2
++ *
++ * Generic x86 APIC driver probe layer.
++ */
++#include <linux/threads.h>
++#include <linux/cpumask.h>
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/ctype.h>
++#include <linux/init.h>
++#include <linux/errno.h>
++#include <asm/fixmap.h>
++#include <asm/mpspec.h>
++#include <asm/apicdef.h>
++#include <asm/apic.h>
++#include <asm/setup.h>
++
++#include <linux/smp.h>
++#include <asm/ipi.h>
++
++#include <linux/interrupt.h>
++#include <asm/acpi.h>
++#include <asm/e820.h>
++
++static int xen_phys_pkg_id(int cpuid_apic, int index_msb)
++{
++      return cpuid_apic;
++}
++
++static struct apic apic_xen = {
++
++      .name                           = "default",
++
++      .irq_delivery_mode              = dest_LowestPrio,
++      /* logical delivery broadcast to all CPUs: */
++      .irq_dest_mode                  = 1,
++
++      .target_cpus                    = default_target_cpus,
++
++      .phys_pkg_id                    = xen_phys_pkg_id,
++
++#ifdef CONFIG_SMP
++      .send_IPI_mask                  = xen_send_IPI_mask,
++      .send_IPI_mask_allbutself       = xen_send_IPI_mask_allbutself,
++      .send_IPI_allbutself            = xen_send_IPI_allbutself,
++      .send_IPI_all                   = xen_send_IPI_all,
++      .send_IPI_self                  = xen_send_IPI_self,
++#endif
++};
++
++struct apic *apic = &apic_xen;
++EXPORT_SYMBOL_GPL(apic);
diff --cc arch/x86/kernel/apic/probe_32.c

index dd1b6ab,fc84c7b..029b0c1
--- 1/arch/x86/kernel/apic/probe_32.c
--- 2/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@@ -237,16 -265,16 +265,16 @@@ void __init generic_apic_probe(void
   int __init
   generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
   {
-       struct apic **drv;
+       int i;
   
-       for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
-               if (!((*drv)->mps_oem_check))
+       for (i = 0; apic_probe[i]; ++i) {
+               if (!apic_probe[i]->mps_oem_check)
                         continue;
-               if (!(*drv)->mps_oem_check(mpc, oem, productid))
+               if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
                         continue;
   
- -              if (!cmdline_apic) {
+ +              if (!cmdline_apic && apic == &apic_default) {
-                       apic = *drv;
+                       apic = apic_probe[i];
                         printk(KERN_INFO "Switched to APIC driver `%s'.\n",
                                apic->name);
                 }
@@@ -257,16 -285,16 +285,16 @@@
   
   int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
   {
-       struct apic **drv;
+       int i;
   
-       for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
-               if (!(*drv)->acpi_madt_oem_check)
+       for (i = 0; apic_probe[i]; ++i) {
+               if (!apic_probe[i]->acpi_madt_oem_check)
                         continue;
-               if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
+               if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
                         continue;
   
- -              if (!cmdline_apic) {
+ +              if (!cmdline_apic && apic == &apic_default) {
-                       apic = *drv;
+                       apic = apic_probe[i];
                         printk(KERN_INFO "Switched to APIC driver `%s'.\n",
                                apic->name);
                 }
diff --cc arch/x86/kernel/asm-offsets.c

index 4f13faf,4f13faf..2526d47
--- 1/arch/x86/kernel/asm-offsets.c
--- 2/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@@ -17,7 -17,7 +17,7 @@@
   #include <asm/bootparam.h>
   #include <asm/suspend.h>
   
--#ifdef CONFIG_XEN
++#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
   #include <xen/interface/xen.h>
   #endif
   
@@@ -55,7 -55,7 +55,7 @@@ void common(void) 
         OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
   #endif
   
--#ifdef CONFIG_XEN
++#ifdef CONFIG_PARAVIRT_XEN
         BLANK();
         OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
         OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
diff --cc arch/x86/kernel/asm-offsets_32.c

index c29d631,c29d631..b6f7b62
--- 1/arch/x86/kernel/asm-offsets_32.c
--- 2/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@@ -1,7 -1,7 +1,9 @@@
   #include <asm/ucontext.h>
   
++#ifdef CONFIG_LGUEST_GUEST
   #include <linux/lguest.h>
   #include "../../../drivers/lguest/lg.h"
++#endif
   
   /* workaround for a warning with -Wmissing-prototypes */
   void foo(void);
@@@ -55,9 -55,9 +57,19 @@@ void foo(void
         OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
         BLANK();
   
++#ifndef CONFIG_X86_NO_TSS
         /* Offset from the sysenter stack to tss.sp0 */
--      DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
++      DEFINE(SYSENTER_stack_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
                  sizeof(struct tss_struct));
++#else
++      /* sysenter stack points directly to sp0 */
++      DEFINE(SYSENTER_stack_sp0, 0);
++#endif
++
++#ifdef CONFIG_XEN
++      BLANK();
++      OFFSET(XEN_START_mfn_list, start_info, mfn_list);
++#endif
   
   #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
         BLANK();
diff --cc arch/x86/kernel/asm-offsets_64.c

index e72a119,e72a119..dbf8c97
--- 1/arch/x86/kernel/asm-offsets_64.c
--- 2/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@@ -69,8 -69,8 +69,10 @@@ int main(void
         BLANK();
   #undef ENTRY
   
++#ifndef CONFIG_X86_NO_TSS
         OFFSET(TSS_ist, tss_struct, x86_tss.ist);
         BLANK();
++#endif
   
         DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
   
diff --cc arch/x86/kernel/cpu/Makefile

index 6042981,3f0ebe4..7076ab6
--- 1/arch/x86/kernel/cpu/Makefile
--- 2/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@@ -33,6 -34,6 +34,9 @@@ obj-$(CONFIG_CPU_FREQ)                        += cpufreq
   
   obj-$(CONFIG_X86_LOCAL_APIC)          += perfctr-watchdog.o
   
++disabled-obj-$(CONFIG_XEN) := hypervisor.o mshyperv.o perfctr-watchdog.o \
++                            perf_event.o sched.o vmware.o
++
   quiet_cmd_mkcapflags = MKCAP   $@
         cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
   
diff --cc arch/x86/kernel/cpu/amd.c

index b13ed39,6f9d1f6..779d8ef
--- 1/arch/x86/kernel/cpu/amd.c
--- 2/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@@ -333,7 -333,7 +333,7 @@@ static void __cpuinit amd_detect_cmp(st
   int amd_get_nb_id(int cpu)
   {
         int id = 0;
--#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
         id = per_cpu(cpu_llc_id, cpu);
   #endif
         return id;
@@@ -432,7 -432,7 +432,7 @@@ static void __cpuinit early_init_amd(st
                     (c->x86_model == 8 && c->x86_mask >= 8))
                         set_cpu_cap(c, X86_FEATURE_K6_MTRR);
   #endif
--#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
++#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) && !defined(CONFIG_XEN)
         /* check CPU config space for extended APIC ID */
         if (cpu_has_apic && c->x86 >= 0xf) {
                 unsigned int val;
@@@ -505,18 -505,18 +505,26 @@@ static void __cpuinit init_amd(struct c
                         u64 val;
   
                         clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
++#ifndef CONFIG_XEN
                         if (!rdmsrl_amd_safe(0xc001100d, &val)) {
                                 val &= ~(1ULL << 32);
                                 wrmsrl_amd_safe(0xc001100d, val);
                         }
++#else
++                      pr_warning("Long-mode LAHF feature wrongly enabled -"
++                                 "hypervisor update needed\n");
++                      (void)&val;
++#endif
                 }
   
         }
         if (c->x86 >= 0x10)
                 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
   
++#ifndef CONFIG_XEN
         /* get apicid instead of initial apic id from cpuid */
         c->apicid = hard_smp_processor_id();
++#endif
   #else
   
         /*
@@@ -592,6 -592,6 +600,7 @@@
                 fam10h_check_enable_mmcfg();
         }
   
++#ifndef CONFIG_XEN
         if (c == &boot_cpu_data && c->x86 >= 0xf) {
                 unsigned long long tseg;
   
@@@ -611,12 -611,9 +620,10 @@@
                 }
         }
   #endif
++#endif
   
-       /*
-        * Family 0x12 and above processors have APIC timer
-        * running in deep C states.
-        */
-       if (c->x86 > 0x11)
+       /* As a rule processors have APIC timer running in deep C states */
+       if (c->x86 > 0xf && !cpu_has_amd_erratum(amd_erratum_400))
                 set_cpu_cap(c, X86_FEATURE_ARAT);
   
         /*
diff --cc arch/x86/kernel/cpu/bugs.c

index 525514c,c39576c..e6f3127
--- 1/arch/x86/kernel/cpu/bugs.c
--- 2/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@@ -17,14 -17,13 +17,15 @@@
   #include <asm/paravirt.h>
   #include <asm/alternative.h>
   
++#ifndef CONFIG_XEN
   static int __init no_halt(char *s)
   {
-       WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
         boot_cpu_data.hlt_works_ok = 0;
         return 1;
   }
   
   __setup("no-hlt", no_halt);
++#endif
   
   static int __init no_387(char *s)
   {
@@@ -80,13 -79,13 +81,16 @@@ static void __init check_fpu(void
                 : "=m" (*&fdiv_bug)
                 : "m" (*&x), "m" (*&y));
   
++#ifndef CONFIG_XEN
         boot_cpu_data.fdiv_bug = fdiv_bug;
         if (boot_cpu_data.fdiv_bug)
                 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
++#endif
   }
   
   static void __init check_hlt(void)
   {
++#ifndef CONFIG_XEN
         if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
                 return;
   
@@@ -100,6 -99,6 +104,7 @@@
         halt();
         halt();
         printk(KERN_CONT "OK.\n");
++#endif
   }
   
   /*
diff --cc arch/x86/kernel/cpu/bugs_64.c

index 04f0fe5,04f0fe5..25a2cda
--- 1/arch/x86/kernel/cpu/bugs_64.c
--- 2/arch/x86/kernel/cpu/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
@@@ -20,6 -20,6 +20,7 @@@ void __init check_bugs(void
   #endif
         alternative_instructions();
   
++#ifndef CONFIG_XEN
         /*
          * Make sure the first 2MB area is not mapped by huge pages
          * There are typically fixed size MTRRs in there and overlapping
@@@ -30,4 -30,4 +31,5 @@@
          */
         if (!direct_gbpages)
                 set_memory_4k((unsigned long)__va(0), 1);
++#endif
   }
diff --cc arch/x86/kernel/cpu/common-xen.c

index 0000000,0000000..676afdb

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/cpu/common-xen.c
@@@ -1,0 -1,0 +1,1376 @@@
++#include <linux/bootmem.h>
++#include <linux/linkage.h>
++#include <linux/bitops.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/percpu.h>
++#include <linux/string.h>
++#include <linux/delay.h>
++#include <linux/sched.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++#include <linux/smp.h>
++#include <linux/io.h>
++
++#include <asm/stackprotector.h>
++#include <asm/perf_event.h>
++#include <asm/mmu_context.h>
++#include <asm/hypervisor.h>
++#include <asm/processor.h>
++#include <asm/sections.h>
++#include <linux/topology.h>
++#include <linux/cpumask.h>
++#include <asm/pgtable.h>
++#include <asm/atomic.h>
++#include <asm/proto.h>
++#include <asm/setup.h>
++#include <asm/apic.h>
++#include <asm/desc.h>
++#include <asm/i387.h>
++#include <asm/mtrr.h>
++#include <linux/numa.h>
++#include <asm/asm.h>
++#include <asm/cpu.h>
++#include <asm/mce.h>
++#include <asm/msr.h>
++#include <asm/pat.h>
++
++#ifdef CONFIG_X86_LOCAL_APIC
++#include <asm/uv/uv.h>
++#endif
++
++#ifdef CONFIG_XEN
++#include <xen/interface/callback.h>
++#endif
++
++#include "cpu.h"
++
++/* all of these masks are initialized in setup_cpu_local_masks() */
++cpumask_var_t cpu_initialized_mask;
++#ifndef CONFIG_XEN
++cpumask_var_t cpu_callout_mask;
++cpumask_var_t cpu_callin_mask;
++
++/* representing cpus for which sibling maps can be computed */
++cpumask_var_t cpu_sibling_setup_mask;
++#endif
++
++/* correctly size the local cpu masks */
++void __init setup_cpu_local_masks(void)
++{
++      alloc_bootmem_cpumask_var(&cpu_initialized_mask);
++#ifndef CONFIG_XEN
++      alloc_bootmem_cpumask_var(&cpu_callin_mask);
++      alloc_bootmem_cpumask_var(&cpu_callout_mask);
++      alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
++#endif
++}
++
++static void __cpuinit default_init(struct cpuinfo_x86 *c)
++{
++#ifdef CONFIG_X86_64
++      cpu_detect_cache_sizes(c);
++#else
++      /* Not much we can do here... */
++      /* Check if at least it has cpuid */
++      if (c->cpuid_level == -1) {
++              /* No cpuid. It must be an ancient CPU */
++              if (c->x86 == 4)
++                      strcpy(c->x86_model_id, "486");
++              else if (c->x86 == 3)
++                      strcpy(c->x86_model_id, "386");
++      }
++#endif
++}
++
++static const struct cpu_dev __cpuinitconst default_cpu = {
++      .c_init         = default_init,
++      .c_vendor       = "Unknown",
++      .c_x86_vendor   = X86_VENDOR_UNKNOWN,
++};
++
++static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
++
++DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
++#ifdef CONFIG_X86_64
++      /*
++       * We need valid kernel segments for data and code in long mode too
++       * IRET will check the segment types  kkeil 2000/10/28
++       * Also sysret mandates a special GDT layout
++       *
++       * TLS descriptors are currently at a different place compared to i386.
++       * Hopefully nobody expects them at a fixed place (Wine?)
++       */
++      [GDT_ENTRY_KERNEL32_CS]         = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
++      [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
++      [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
++      [GDT_ENTRY_DEFAULT_USER32_CS]   = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
++      [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
++      [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
++#else
++      [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
++      [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
++      [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
++      [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
++#ifndef CONFIG_XEN
++      /*
++       * Segments used for calling PnP BIOS have byte granularity.
++       * The code segments and data segments have fixed 64k limits,
++       * the transfer segment sizes are set at run time.
++       */
++      /* 32-bit code */
++      [GDT_ENTRY_PNPBIOS_CS32]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
++      /* 16-bit code */
++      [GDT_ENTRY_PNPBIOS_CS16]        = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
++      /* 16-bit data */
++      [GDT_ENTRY_PNPBIOS_DS]          = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
++      /* 16-bit data */
++      [GDT_ENTRY_PNPBIOS_TS1]         = GDT_ENTRY_INIT(0x0092, 0, 0),
++      /* 16-bit data */
++      [GDT_ENTRY_PNPBIOS_TS2]         = GDT_ENTRY_INIT(0x0092, 0, 0),
++      /*
++       * The APM segments have byte granularity and their bases
++       * are set at run time.  All have 64k limits.
++       */
++      /* 32-bit code */
++      [GDT_ENTRY_APMBIOS_BASE]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
++      /* 16-bit code */
++      [GDT_ENTRY_APMBIOS_BASE+1]      = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
++      /* data */
++      [GDT_ENTRY_APMBIOS_BASE+2]      = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
++
++      [GDT_ENTRY_ESPFIX_SS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
++#endif
++      [GDT_ENTRY_PERCPU]              = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
++      GDT_STACK_CANARY_INIT
++#endif
++} };
++EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
++
++static int __init x86_xsave_setup(char *s)
++{
++      setup_clear_cpu_cap(X86_FEATURE_XSAVE);
++      setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
++      return 1;
++}
++__setup("noxsave", x86_xsave_setup);
++
++static int __init x86_xsaveopt_setup(char *s)
++{
++      setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
++      return 1;
++}
++__setup("noxsaveopt", x86_xsaveopt_setup);
++
++#ifdef CONFIG_X86_32
++static int cachesize_override __cpuinitdata = -1;
++
++static int __init cachesize_setup(char *str)
++{
++      get_option(&str, &cachesize_override);
++      return 1;
++}
++__setup("cachesize=", cachesize_setup);
++
++static int __init x86_fxsr_setup(char *s)
++{
++      setup_clear_cpu_cap(X86_FEATURE_FXSR);
++      setup_clear_cpu_cap(X86_FEATURE_XMM);
++      return 1;
++}
++__setup("nofxsr", x86_fxsr_setup);
++
++static int __init x86_sep_setup(char *s)
++{
++      setup_clear_cpu_cap(X86_FEATURE_SEP);
++      return 1;
++}
++__setup("nosep", x86_sep_setup);
++#endif
++
++#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
++/*
++ * Test whether the EFLAGS bits in @flag can be toggled.
++ *
++ * Saves EFLAGS, writes back a copy with @flag XORed in, reads EFLAGS
++ * again and finally restores the saved value.  Returns non-zero if any
++ * bit of @flag differs between the original and the re-read value.
++ */
++static inline int flag_is_changeable_p(u32 flag)
++{
++      u32 f1, f2;
++
++      /*
++       * Cyrix and IDT cpus allow disabling of CPUID
++       * so the code below may return different results
++       * when it is executed before and after enabling
++       * the CPUID. Add "volatile" to not allow gcc to
++       * optimize the subsequent calls to this function.
++       */
++      asm volatile ("pushfl           \n\t"
++                    "pushfl           \n\t"
++                    "popl %0          \n\t"
++                    "movl %0, %1      \n\t"
++                    "xorl %2, %0      \n\t"
++                    "pushl %0         \n\t"
++                    "popfl            \n\t"
++                    "pushfl           \n\t"
++                    "popl %0          \n\t"
++                    "popfl            \n\t"
++
++                    : "=&r" (f1), "=&r" (f2)
++                    : "ir" (flag));
++
++      /* f2 holds the original EFLAGS, f1 the value read back after the XOR */
++      return ((f1^f2) & flag) != 0;
++}
++
++/*
++ * Probe for the CPUID instruction by testing whether X86_EFLAGS_ID is
++ * changeable.  Returns non-zero if it is.
++ */
++static int __cpuinit have_cpuid_p(void)
++{
++      return flag_is_changeable_p(X86_EFLAGS_ID);
++}
++
++/* Non-zero: disable the processor serial number; cleared by "serialnumber". */
++static int disable_x86_serial_nr __cpuinitdata = 1;
++
++/*
++ * Disable the processor serial number on @c.  Does nothing if the CPU
++ * lacks X86_FEATURE_PN or if disable_x86_serial_nr is zero.  On success
++ * the PN capability is cleared and c->cpuid_level is re-read.
++ */
++static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
++{
++      unsigned long lo, hi;
++
++      if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
++              return;
++
++      /* Disable processor serial number: */
++
++      /* Read-modify-write: set bit 21 (0x200000) in the low MSR word */
++      rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
++      lo |= 0x200000;
++      wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
++
++      printk(KERN_NOTICE "CPU serial number disabled.\n");
++      clear_cpu_cap(c, X86_FEATURE_PN);
++
++      /* Disabling the serial number may affect the cpuid level */
++      c->cpuid_level = cpuid_eax(0);
++}
++
++/*
++ * "serialnumber" boot option: keep the processor serial number enabled
++ * by zeroing disable_x86_serial_nr.  The argument is unused.  Returns 1.
++ */
++static int __init x86_serial_nr_setup(char *s)
++{
++      disable_x86_serial_nr = 0;
++      return 1;
++}
++__setup("serialnumber", x86_serial_nr_setup);
++#else
++/*
++ * Stubs for every configuration other than 32-bit non-Xen: any flag is
++ * reported as changeable, CPUID is reported as present, and the serial
++ * number is left alone.
++ */
++static inline int flag_is_changeable_p(u32 flag)
++{
++      return 1;
++}
++/* Probe for the CPUID instruction */
++static inline int have_cpuid_p(void)
++{
++      return 1;
++}
++static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
++{
++}
++#endif
++
++/*
++ * Some CPU features depend on higher CPUID levels, which may not always
++ * be available due to CPUID level capping or broken virtualization
++ * software.  Add those features to this table to auto-disable them.
++ */
++struct cpuid_dependent_feature {
++      u32 feature;    /* X86_FEATURE_* bit to clear */
++      u32 level;      /* CPUID level the feature depends on */
++};
++
++/* Terminated by an entry whose .feature is 0. */
++static const struct cpuid_dependent_feature __cpuinitconst
++cpuid_dependent_features[] = {
++      { X86_FEATURE_MWAIT,            0x00000005 },
++      { X86_FEATURE_DCA,              0x00000009 },
++      { X86_FEATURE_XSAVE,            0x0000000d },
++      { 0, 0 }
++};
++
++/*
++ * Clear every feature listed in cpuid_dependent_features[] that @c
++ * claims to have but whose required CPUID level is above what @c
++ * reports.  If @warn is true, a warning is printed for each feature
++ * cleared this way.
++ */
++static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
++{
++      const struct cpuid_dependent_feature *df;
++
++      for (df = cpuid_dependent_features; df->feature; df++) {
++
++              if (!cpu_has(c, df->feature))
++                      continue;
++              /*
++               * Note: cpuid_level is set to -1 if unavailable, but
++               * extended_cpuid_level is set to 0 if unavailable
++               * and the legitimate extended levels are all negative
++               * when signed; hence the weird messing around with
++               * signs here...
++               */
++              if (!((s32)df->level < 0 ?
++                   (u32)df->level > (u32)c->extended_cpuid_level :
++                   (s32)df->level > (s32)c->cpuid_level))
++                      continue;
++
++              /* The required level is not available: drop the feature */
++              clear_cpu_cap(c, df->feature);
++              if (!warn)
++                      continue;
++
++              printk(KERN_WARNING
++                     "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
++                              x86_cap_flags[df->feature], df->level);
++      }
++}
++
++/*
++ * Naming convention should be: <Name> [(<Codename>)]
++ * This table is only used if init_<vendor>() below doesn't set it;
++ * in particular, if CPUID levels 0x80000002..4 are supported, this
++ * isn't used
++ */
++
++/*
++ * Find a model name for @c in the current vendor's c_models table.
++ * Returns the model_names[] entry of the first table row whose family
++ * equals c->x86, or NULL if c->x86_model is 16 or more, no vendor has
++ * been selected, or no row matches.
++ */
++static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
++{
++      const struct cpu_model_info *entry;
++
++      /* Range check on the model, then make sure a vendor is set */
++      if (c->x86_model >= 16 || !this_cpu)
++              return NULL;
++
++      /* The table ends at the first row with a zero family */
++      for (entry = this_cpu->c_models; entry && entry->family; entry++) {
++              if (entry->family == c->x86)
++                      return entry->model_names[c->x86_model];
++      }
++
++      return NULL;            /* Not found */
++}
++
++__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
++__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
++
++/*
++ * Load the per-cpu segment for @cpu, then the stack canary segment.
++ * On 32-bit, %fs is loaded with __KERNEL_PERCPU.  On 64-bit, %gs is
++ * loaded with 0 and the GS base is set to the CPU's
++ * irq_stack_union.gs_base: by MSR write natively, or by hypercall
++ * (BUG() on failure) under Xen.
++ */
++void __ref load_percpu_segment(int cpu)
++{
++#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
++      /* adjust_boot_vcpu_info() runs on the first call only */
++      static bool done;
++
++      if (!done) {
++              done = true;
++              adjust_boot_vcpu_info();
++      }
++#endif
++#ifdef CONFIG_X86_32
++      loadsegment(fs, __KERNEL_PERCPU);
++#else
++      loadsegment(gs, 0);
++#ifndef CONFIG_XEN
++      wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
++#else
++      if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
++                      (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)))
++              BUG();
++#endif
++#endif
++      load_stack_canary_segment();
++}
++
++/*
++ * Current gdt points %fs at the "master" per-cpu area: after this,
++ * it's on the real one.
++ */
++/*
++ * Install the GDT of @cpu through the hypervisor and reload the
++ * per-cpu segment.  BUG()s if HYPERVISOR_set_gdt() fails.
++ */
++void switch_to_new_gdt(int cpu)
++{
++      struct desc_ptr gdt_descr;
++      unsigned long va, frames[16];
++      int f;
++
++      gdt_descr.address = (long)get_cpu_gdt_table(cpu);
++      gdt_descr.size = GDT_SIZE - 1;
++
++      /*
++       * Walk the GDT one page at a time: record each page's frame
++       * number and make the page read-only before handing it over.
++       */
++      for (va = gdt_descr.address, f = 0;
++           va < gdt_descr.address + gdt_descr.size;
++           va += PAGE_SIZE, f++) {
++              frames[f] = arbitrary_virt_to_mfn(va);
++              make_page_readonly((void *)va,
++                                 XENFEAT_writable_descriptor_tables);
++      }
++      /* Second argument is a count of 8-byte descriptors */
++      if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8))
++              BUG();
++
++      /* Reload the per-cpu base */
++
++      load_percpu_segment(cpu);
++}
++
++static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
++
++/*
++ * Read the model name string from CPUID leaves 0x80000002..0x80000004
++ * into c->x86_model_id, NUL-terminate it at index 48 and strip leading
++ * spaces.  Does nothing if extended_cpuid_level is below 0x80000004.
++ */
++static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
++{
++      unsigned int *v;
++      char *p, *q;
++
++      if (c->extended_cpuid_level < 0x80000004)
++              return;
++
++      /* Each leaf fills four 32-bit words: 3 leaves * 16 bytes = 48 bytes */
++      v = (unsigned int *)c->x86_model_id;
++      cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
++      cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
++      cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
++      c->x86_model_id[48] = 0;
++
++      /*
++       * Intel chips right-justify this string for some dumb reason;
++       * undo that brain damage:
++       */
++      p = q = &c->x86_model_id[0];
++      while (*p == ' ')
++              p++;
++      if (p != q) {
++              /* Shift the text to the front of the buffer */
++              while (*p)
++                      *q++ = *p++;
++              while (q <= &c->x86_model_id[48])
++                      *q++ = '\0';    /* Zero-pad the rest */
++      }
++}
++
++/*
++ * Fill c->x86_cache_size (and, on 64-bit, c->x86_tlbsize) from the
++ * extended CPUID leaves 0x80000005 and 0x80000006 when available.
++ * On 32-bit the L2 size may be adjusted by the vendor's c_size_cache
++ * hook and replaced by the "cachesize=" override.
++ */
++void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
++{
++      unsigned int n, dummy, ebx, ecx, edx, l2size;
++
++      n = c->extended_cpuid_level;
++
++      if (n >= 0x80000005) {
++              cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
++              /* Sum of the top bytes of ECX and EDX */
++              c->x86_cache_size = (ecx>>24) + (edx>>24);
++#ifdef CONFIG_X86_64
++              /* On K8 L1 TLB is inclusive, so don't count it */
++              c->x86_tlbsize = 0;
++#endif
++      }
++
++      if (n < 0x80000006)     /* Some chips just has a large L1. */
++              return;
++
++      cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
++      l2size = ecx >> 16;
++
++#ifdef CONFIG_X86_64
++      c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
++#else
++      /* do processor-specific cache resizing */
++      if (this_cpu->c_size_cache)
++              l2size = this_cpu->c_size_cache(c, l2size);
++
++      /* Allow user to override all this if necessary. */
++      if (cachesize_override != -1)
++              l2size = cachesize_override;
++
++      if (l2size == 0)
++              return;         /* Again, no L2 cache is possible */
++#endif
++
++      /* The L2 size replaces the value computed from leaf 0x80000005 */
++      c->x86_cache_size = l2size;
++}
++
++/*
++ * Derive the HT/core topology of @c from CPUID leaf 1.
++ *
++ * Reads the sibling count from bits 23:16 of EBX into smp_num_siblings
++ * and, when it is greater than one, computes c->phys_proc_id and
++ * c->cpu_core_id via apic->phys_pkg_id().  Nothing is done without
++ * X86_FEATURE_HT or when X86_FEATURE_XTOPOLOGY is set; with
++ * X86_FEATURE_CMP_LEGACY only the one-time ID report at "out" runs.
++ * Compiles to an empty function without CONFIG_X86_HT.
++ */
++void __cpuinit detect_ht(struct cpuinfo_x86 *c)
++{
++#ifdef CONFIG_X86_HT
++      u32 eax, ebx, ecx, edx;
++      int index_msb, core_bits;
++      static bool printed;
++
++      if (!cpu_has(c, X86_FEATURE_HT))
++              return;
++
++      if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
++              goto out;
++
++      if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
++              return;
++
++      cpuid(1, &eax, &ebx, &ecx, &edx);
++
++      smp_num_siblings = (ebx & 0xff0000) >> 16;
++
++      if (smp_num_siblings == 1) {
++              printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
++              goto out;
++      }
++
++      /* Also covers a reported sibling count of 0 */
++      if (smp_num_siblings <= 1)
++              goto out;
++
++      if (smp_num_siblings > nr_cpu_ids) {
++              /* Terminate the line so later messages are not appended */
++              pr_warning("CPU: Unsupported number of siblings %d\n",
++                         smp_num_siblings);
++              smp_num_siblings = 1;
++              return;
++      }
++
++      /* Package ID: APIC ID with the sibling bits stripped */
++      index_msb = get_count_order(smp_num_siblings);
++      c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
++
++      /* From here on smp_num_siblings is threads per core */
++      smp_num_siblings = smp_num_siblings / c->x86_max_cores;
++
++      index_msb = get_count_order(smp_num_siblings);
++
++      core_bits = get_count_order(c->x86_max_cores);
++
++      c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
++                                     ((1 << core_bits) - 1);
++
++out:
++      /* Report the IDs once, and only on multi-thread/multi-core parts */
++      if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
++              printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
++                     c->phys_proc_id);
++              printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
++                     c->cpu_core_id);
++              printed = true;
++      }
++#endif
++}
++
++/*
++ * Select the vendor driver for @c by comparing c->x86_vendor_id with
++ * the c_ident strings of the registered cpu_devs[].  On a match,
++ * this_cpu and c->x86_vendor are set from that entry.  Otherwise an
++ * error is printed once and the generic default_cpu is used with
++ * X86_VENDOR_UNKNOWN.
++ */
++static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
++{
++      char *v = c->x86_vendor_id;
++      int i;
++
++      for (i = 0; i < X86_VENDOR_NUM; i++) {
++              /* The first empty slot ends the list */
++              if (!cpu_devs[i])
++                      break;
++
++              /* c_ident[1] is an optional second identifier */
++              if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
++                  (cpu_devs[i]->c_ident[1] &&
++                   !strcmp(v, cpu_devs[i]->c_ident[1]))) {
++
++                      this_cpu = cpu_devs[i];
++                      c->x86_vendor = this_cpu->c_x86_vendor;
++                      return;
++              }
++      }
++
++      printk_once(KERN_ERR
++                      "CPU: vendor_id '%s' unknown, using generic init.\n" \
++                      "CPU: Your system may be unstable.\n", v);
++
++      c->x86_vendor = X86_VENDOR_UNKNOWN;
++      this_cpu = &default_cpu;
++}
++
++/*
++ * Basic CPUID probing for @c: reads the maximum level and the vendor
++ * string from leaf 0, then (if leaf 1 exists) the family, model and
++ * stepping, plus the clflush size / cache alignment when bit 19 of the
++ * leaf 1 EDX word is set.  The family defaults to 4 when leaf 1 is
++ * not available.
++ */
++void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
++{
++      /* Get vendor name */
++      /* Register order is EBX, ECX, EDX; the string order is EBX, EDX, ECX */
++      cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
++            (unsigned int *)&c->x86_vendor_id[0],
++            (unsigned int *)&c->x86_vendor_id[8],
++            (unsigned int *)&c->x86_vendor_id[4]);
++
++      c->x86 = 4;
++      /* Intel-defined flags: level 0x00000001 */
++      if (c->cpuid_level >= 0x00000001) {
++              u32 junk, tfms, cap0, misc;
++
++              cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
++              c->x86 = (tfms >> 8) & 0xf;
++              c->x86_model = (tfms >> 4) & 0xf;
++              c->x86_mask = tfms & 0xf;
++
++              /* Family 0xf: add the extended family field (bits 27:20) */
++              if (c->x86 == 0xf)
++                      c->x86 += (tfms >> 20) & 0xff;
++              /* Family >= 6: extended model (bits 19:16) is the high nibble */
++              if (c->x86 >= 0x6)
++                      c->x86_model += ((tfms >> 16) & 0xf) << 4;
++
++              if (cap0 & (1<<19)) {
++                      c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
++                      c->x86_cache_alignment = c->x86_clflush_size;
++              }
++      }
++}
++
++/*
++ * Fill the capability words and related fields of @c from CPUID.
++ *
++ * Sets x86_capability[0] and [4] from leaf 1, [9] from leaf 7
++ * subleaf 0, [1] and [6] from leaf 0x80000001, the address widths from
++ * leaf 0x80000008 (or 36 physical bits on 32-bit with PAE/PSE36),
++ * x86_power from leaf 0x80000007, and finally calls
++ * init_scattered_cpuid_features().  c->extended_cpuid_level is always
++ * updated from leaf 0x80000000.
++ */
++void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
++{
++      u32 tfms, xlvl;
++      u32 ebx;
++
++      /* Intel-defined flags: level 0x00000001 */
++      if (c->cpuid_level >= 0x00000001) {
++              u32 capability, excap;
++
++              cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
++              c->x86_capability[0] = capability;
++              c->x86_capability[4] = excap;
++      }
++
++      /* Additional Intel-defined flags: level 0x00000007 */
++      if (c->cpuid_level >= 0x00000007) {
++              u32 eax, ebx, ecx, edx;
++
++              cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
++
++              /*
++               * EAX of leaf 7, subleaf 0 is the highest supported
++               * subleaf index, not a subleaf count, so it is 0 when
++               * only subleaf 0 exists.  EBX is valid regardless and
++               * must not be gated on EAX.
++               */
++              c->x86_capability[9] = ebx;
++      }
++
++      /* AMD-defined flags: level 0x80000001 */
++      xlvl = cpuid_eax(0x80000000);
++      c->extended_cpuid_level = xlvl;
++
++      /* Only trust the extended level if it looks like 0x8000xxxx */
++      if ((xlvl & 0xffff0000) == 0x80000000) {
++              if (xlvl >= 0x80000001) {
++                      c->x86_capability[1] = cpuid_edx(0x80000001);
++                      c->x86_capability[6] = cpuid_ecx(0x80000001);
++              }
++      }
++
++      if (c->extended_cpuid_level >= 0x80000008) {
++              u32 eax = cpuid_eax(0x80000008);
++
++              c->x86_virt_bits = (eax >> 8) & 0xff;
++              c->x86_phys_bits = eax & 0xff;
++      }
++#ifdef CONFIG_X86_32
++      else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
++              c->x86_phys_bits = 36;
++#endif
++
++      if (c->extended_cpuid_level >= 0x80000007)
++              c->x86_power = cpuid_edx(0x80000007);
++
++      init_scattered_cpuid_features(c);
++}
++
++/*
++ * 32-bit only: identify a CPU that has no CPUID instruction.  Sets the
++ * family to 4 or 3 depending on whether the AC flag is changeable, then
++ * runs each registered vendor's c_identify hook until one of them fills
++ * in c->x86_vendor_id.  Empty on 64-bit.
++ */
++static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
++{
++#ifdef CONFIG_X86_32
++      int i;
++
++      /*
++       * First of all, decide if this is a 486 or higher
++       * It's a 486 if we can modify the AC flag
++       */
++      if (flag_is_changeable_p(X86_EFLAGS_AC))
++              c->x86 = 4;
++      else
++              c->x86 = 3;
++
++      for (i = 0; i < X86_VENDOR_NUM; i++)
++              if (cpu_devs[i] && cpu_devs[i]->c_identify) {
++                      /* Reset the id so a hook that sets it can be detected */
++                      c->x86_vendor_id[0] = 0;
++                      cpu_devs[i]->c_identify(c);
++                      if (c->x86_vendor_id[0]) {
++                              get_cpu_vendor(c);
++                              break;
++                      }
++              }
++#endif
++}
++
++/*
++ * Do minimum CPU detection early.
++ * Fields really needed: vendor, cpuid_level, family, model, mask,
++ * cache alignment.
++ * The others are not touched to avoid unwanted side effects.
++ *
++ * WARNING: this function is only called on the BP.  Don't add code here
++ * that is supposed to run on all CPUs.
++ */
++static void __init early_identify_cpu(struct cpuinfo_x86 *c)
++{
++      /* Defaults, used until CPUID says otherwise */
++#ifdef CONFIG_X86_64
++      c->x86_clflush_size = 64;
++      c->x86_phys_bits = 36;
++      c->x86_virt_bits = 48;
++#else
++      c->x86_clflush_size = 32;
++      c->x86_phys_bits = 32;
++      c->x86_virt_bits = 32;
++#endif
++      c->x86_cache_alignment = c->x86_clflush_size;
++
++      memset(&c->x86_capability, 0, sizeof c->x86_capability);
++      c->extended_cpuid_level = 0;
++
++      if (!have_cpuid_p())
++              identify_cpu_without_cpuid(c);
++
++      /* cyrix could have cpuid enabled via c_identify()*/
++      if (!have_cpuid_p())
++              return;
++
++      cpu_detect(c);
++
++      get_cpu_vendor(c);
++
++      get_cpu_cap(c);
++#ifdef CONFIG_XEN
++      /* No XSAVE: apply the same feature clearing as the "noxsave" option */
++      if (!cpu_has_xsave)
++              x86_xsave_setup(NULL);
++#endif
++
++      if (this_cpu->c_early_init)
++              this_cpu->c_early_init(c);
++
++#ifdef CONFIG_SMP
++      c->cpu_index = 0;
++#endif
++      /* Silently drop features whose CPUID level is missing */
++      filter_cpuid_features(c, false);
++}
++
++/*
++ * Register the vendor drivers and do the early boot-CPU detection.
++ * Copies the cpu_dev pointers found between __x86_cpu_dev_start and
++ * __x86_cpu_dev_end into cpu_devs[] (at most X86_VENDOR_NUM of them),
++ * optionally listing them with CONFIG_PROCESSOR_SELECT, then calls
++ * early_identify_cpu() on boot_cpu_data.
++ */
++void __init early_cpu_init(void)
++{
++      const struct cpu_dev *const *cdev;
++      int count = 0;
++
++#ifdef CONFIG_PROCESSOR_SELECT
++      printk(KERN_INFO "KERNEL supported cpus:\n");
++#endif
++
++      for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
++              const struct cpu_dev *cpudev = *cdev;
++
++              /* cpu_devs[] is full: ignore any remaining entries */
++              if (count >= X86_VENDOR_NUM)
++                      break;
++              cpu_devs[count] = cpudev;
++              count++;
++
++#ifdef CONFIG_PROCESSOR_SELECT
++              {
++                      unsigned int j;
++
++                      /* Print each non-NULL identifier of this vendor */
++                      for (j = 0; j < 2; j++) {
++                              if (!cpudev->c_ident[j])
++                                      continue;
++                              printk(KERN_INFO "  %s %s\n", cpudev->c_vendor,
++                                      cpudev->c_ident[j]);
++                      }
++              }
++#endif
++      }
++      early_identify_cpu(&boot_cpu_data);
++}
++
++/*
++ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
++ * unfortunately, that's not true in practice because of early VIA
++ * chips and (more importantly) broken virtualizers that are not easy
++ * to detect. In the latter case it doesn't even *fail* reliably, so
++ * probing for it doesn't even work. Disable it completely on 32-bit
++ * unless we can find a reliable way to detect all the broken cases.
++ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
++ */
++static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
++{
++#ifdef CONFIG_X86_32
++      /* 32-bit: never advertise NOPL */
++      clear_cpu_cap(c, X86_FEATURE_NOPL);
++#else
++      /* 64-bit: always advertise NOPL */
++      set_cpu_cap(c, X86_FEATURE_NOPL);
++#endif
++}
++
++/*
++ * Vendor-independent identification of @c: CPUID detection, vendor
++ * selection, capability words, the APIC IDs (non-Xen only), the
++ * default model name and the NOPL flag.  Returns early, with only
++ * extended_cpuid_level reset, if the CPU has no usable CPUID.
++ */
++static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
++{
++      c->extended_cpuid_level = 0;
++
++      if (!have_cpuid_p())
++              identify_cpu_without_cpuid(c);
++
++      /* cyrix could have cpuid enabled via c_identify()*/
++      if (!have_cpuid_p())
++              return;
++
++      cpu_detect(c);
++
++      get_cpu_vendor(c);
++
++      get_cpu_cap(c);
++
++#ifndef CONFIG_XEN
++      if (c->cpuid_level >= 0x00000001) {
++              /* Initial APIC ID: bits 31:24 of CPUID.1 EBX */
++              c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
++#ifdef CONFIG_X86_32
++# ifdef CONFIG_X86_HT
++              c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
++# else
++              c->apicid = c->initial_apicid;
++# endif
++#endif
++
++#ifdef CONFIG_X86_HT
++              c->phys_proc_id = c->initial_apicid;
++#endif
++      }
++#endif
++
++      get_model_name(c); /* Default name */
++
++      detect_nopl(c);
++}
++
++/*
++ * This does the hard work of actually picking apart the CPU stuff...
++ */
++static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
++{
++      int i;
++
++      /* Reset @c to "unknown" defaults before probing */
++      c->loops_per_jiffy = loops_per_jiffy;
++      c->x86_cache_size = -1;
++      c->x86_vendor = X86_VENDOR_UNKNOWN;
++      c->x86_model = c->x86_mask = 0; /* So far unknown... */
++      c->x86_vendor_id[0] = '\0'; /* Unset */
++      c->x86_model_id[0] = '\0';  /* Unset */
++#ifndef CONFIG_XEN
++      c->x86_max_cores = 1;
++      c->x86_coreid_bits = 0;
++#endif
++#ifdef CONFIG_X86_64
++      c->x86_clflush_size = 64;
++      c->x86_phys_bits = 36;
++      c->x86_virt_bits = 48;
++#else
++      c->cpuid_level = -1;    /* CPUID not detected */
++      c->x86_clflush_size = 32;
++      c->x86_phys_bits = 32;
++      c->x86_virt_bits = 32;
++#endif
++      c->x86_cache_alignment = c->x86_clflush_size;
++      memset(&c->x86_capability, 0, sizeof c->x86_capability);
++      /* Carry SYSCALL32 over from the boot CPU across the memset above */
++      if (boot_cpu_has(X86_FEATURE_SYSCALL32))
++              set_cpu_cap(c, X86_FEATURE_SYSCALL32);
++
++      generic_identify(c);
++
++      if (this_cpu->c_identify)
++              this_cpu->c_identify(c);
++
++      /* Clear/Set all flags overridden by options, after probe */
++      for (i = 0; i < NCAPINTS; i++) {
++              c->x86_capability[i] &= ~cpu_caps_cleared[i];
++              c->x86_capability[i] |= cpu_caps_set[i];
++      }
++
++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
++      c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
++#endif
++
++      /*
++       * Vendor-specific initialization.  In this section we
++       * canonicalize the feature flags, meaning if there are
++       * features a certain CPU supports which CPUID doesn't
++       * tell us, CPUID claiming incorrect flags, or other bugs,
++       * we handle them here.
++       *
++       * At the end of this section, c->x86_capability better
++       * indicate the features this CPU genuinely supports!
++       */
++      if (this_cpu->c_init)
++              this_cpu->c_init(c);
++
++      /* Disable the PN if appropriate */
++      squash_the_stupid_serial_number(c);
++
++      /*
++       * The vendor-specific functions might have changed features.
++       * Now we do "generic changes."
++       */
++
++      /* Filter out anything that depends on CPUID levels we don't have */
++      filter_cpuid_features(c, true);
++
++      /* If the model name is still unset, do table lookup. */
++      if (!c->x86_model_id[0]) {
++              const char *p;
++              p = table_lookup_model(c);
++              if (p)
++                      strcpy(c->x86_model_id, p);
++              else
++                      /* Last resort... */
++                      sprintf(c->x86_model_id, "%02x/%02x",
++                              c->x86, c->x86_model);
++      }
++
++#ifdef CONFIG_X86_64
++      detect_ht(c);
++#endif
++
++      init_hypervisor(c);
++
++      /*
++       * Clear/Set all flags overridden by options, need do it
++       * before following smp all cpus cap AND.
++       */
++      for (i = 0; i < NCAPINTS; i++) {
++              c->x86_capability[i] &= ~cpu_caps_cleared[i];
++              c->x86_capability[i] |= cpu_caps_set[i];
++      }
++
++      /*
++       * On SMP, boot_cpu_data holds the common feature set between
++       * all CPUs; so make sure that we indicate which features are
++       * common between the CPUs.  The first time this routine gets
++       * executed, c == &boot_cpu_data.
++       */
++      if (c != &boot_cpu_data) {
++              /* AND the already accumulated flags with these */
++              for (i = 0; i < NCAPINTS; i++)
++                      boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
++      }
++
++      /* Init Machine Check Exception if available. */
++      mcheck_cpu_init(c);
++
++      select_idle_routine(c);
++
++#ifdef CONFIG_NUMA
++      numa_add_cpu(smp_processor_id());
++#endif
++}
++
++#ifdef CONFIG_X86_64
++/*
++ * Choose how vgetcpu obtains the CPU number: RDTSCP when the boot CPU
++ * advertises it, LSL otherwise.
++ */
++static void vgetcpu_set_mode(void)
++{
++      vgetcpu_mode = cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP) ?
++                     VGETCPU_RDTSCP : VGETCPU_LSL;
++}
++#endif
++
++/*
++ * Full identification of the boot CPU (boot_cpu_data), followed by the
++ * C1E mask setup and, depending on the word size, either the
++ * sysenter/SEP setup (32-bit) or the vgetcpu mode selection (64-bit).
++ */
++void __init identify_boot_cpu(void)
++{
++      identify_cpu(&boot_cpu_data);
++      init_c1e_mask();
++#ifdef CONFIG_X86_32
++      sysenter_setup();
++      enable_sep_cpu();
++#else
++      vgetcpu_set_mode();
++#endif
++}
++
++#ifdef CONFIG_XEN
++/* Empty stub, only built with CONFIG_XEN. */
++void set_perf_event_pending(void) {}
++#endif
++
++/*
++ * Identify a non-boot CPU.  @c must not be boot_cpu_data (BUG_ON).
++ * After identify_cpu(), SEP is enabled on 32-bit and mtrr_ap_init()
++ * is called.
++ */
++void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
++{
++      BUG_ON(c == &boot_cpu_data);
++      identify_cpu(c);
++#ifdef CONFIG_X86_32
++      enable_sep_cpu();
++#endif
++      mtrr_ap_init();
++}
++
++/* A range of MSR indices; print_cpu_msr() walks min <= index < max. */
++struct msr_range {
++      unsigned        min;
++      unsigned        max;
++};
++
++/* MSR ranges dumped by print_cpu_msr() when "show_msr=" is given. */
++static const struct msr_range msr_range_array[] __cpuinitconst = {
++      { 0x00000000, 0x00000418},
++      { 0xc0000000, 0xc000040b},
++      { 0xc0010000, 0xc0010142},
++      { 0xc0011000, 0xc001103b},
++};
++
++/*
++ * Dump every readable MSR in the msr_range_array[] ranges.  Each range
++ * is walked from .min up to, but not including, .max; MSRs for which
++ * rdmsrl_amd_safe() returns non-zero are silently skipped.
++ */
++static void __cpuinit print_cpu_msr(void)
++{
++      const struct msr_range *range;
++      unsigned msr;
++      u64 val;
++      int i;
++
++      for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
++              range = &msr_range_array[i];
++
++              for (msr = range->min; msr < range->max; msr++) {
++                      if (!rdmsrl_amd_safe(msr, &val))
++                              printk(KERN_INFO " MSR%08x: %016llx\n",
++                                     msr, val);
++              }
++      }
++}
++
++/* Value of "show_msr="; 0 (the default) disables the MSR dump. */
++static int show_msr __cpuinitdata;
++
++/*
++ * "show_msr=" boot option: store the parsed number in show_msr if it
++ * is positive; other values leave show_msr unchanged.  Returns 1.
++ */
++static __init int setup_show_msr(char *arg)
++{
++      int num;
++
++      get_option(&arg, &num);
++
++      if (num > 0)
++              show_msr = num;
++      return 1;
++}
++__setup("show_msr=", setup_show_msr);
++
++/*
++ * "noclflush" boot option: clear the CLFLSH feature bit.
++ * The option argument is unused.  Returns 1.
++ */
++static __init int setup_noclflush(char *arg)
++{
++      setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
++      return 1;
++}
++__setup("noclflush", setup_noclflush);
++
++/*
++ * Print a one-line description of @c as a continuation of the current
++ * log line: vendor (unless already part of the model string), model
++ * name or "<family>86", and the stepping.  Then dumps the MSRs if
++ * "show_msr=" asks for it for this CPU.
++ */
++void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
++{
++      const char *vendor = NULL;
++
++      if (c->x86_vendor < X86_VENDOR_NUM) {
++              vendor = this_cpu->c_vendor;
++      } else {
++              /* Unknown vendor: fall back to the raw CPUID vendor string */
++              if (c->cpuid_level >= 0)
++                      vendor = c->x86_vendor_id;
++      }
++
++      /* Skip the vendor if the model string already contains it */
++      if (vendor && !strstr(c->x86_model_id, vendor))
++              printk(KERN_CONT "%s ", vendor);
++
++      if (c->x86_model_id[0])
++              printk(KERN_CONT "%s", c->x86_model_id);
++      else
++              printk(KERN_CONT "%d86", c->x86);
++
++      if (c->x86_mask || c->cpuid_level >= 0)
++              printk(KERN_CONT " stepping %02x\n", c->x86_mask);
++      else
++              printk(KERN_CONT "\n");
++
++#ifdef CONFIG_SMP
++      /* On SMP, show_msr is the number of CPUs to dump (indices below it) */
++      if (c->cpu_index < show_msr)
++              print_cpu_msr();
++#else
++      if (show_msr)
++              print_cpu_msr();
++#endif
++}
++
++/*
++ * "clearcpuid=" boot option: clear the CPU capability bit whose number
++ * is given as the argument.
++ *
++ * Returns 1 if a number in the range 0 .. NCAPINTS*32-1 was parsed and
++ * the bit was cleared, 0 otherwise.  Negative numbers are rejected so
++ * that they are never used as a bit index.
++ */
++static __init int setup_disablecpuid(char *arg)
++{
++      int bit;
++
++      if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS*32)
++              setup_clear_cpu_cap(bit);
++      else
++              return 0;
++
++      return 1;
++}
++__setup("clearcpuid=", setup_disablecpuid);
++
++#ifdef CONFIG_X86_64
++#ifndef CONFIG_X86_NO_IDT
++struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
++#endif
++
++DEFINE_PER_CPU_FIRST(union irq_stack_union,
++                   irq_stack_union) __aligned(PAGE_SIZE);
++
++/*
++ * With CONFIG_XEN, call xen_pt_switch() on init_level4_pgt; otherwise
++ * this is an empty function.
++ */
++void xen_switch_pt(void)
++{
++#ifdef CONFIG_XEN
++      xen_pt_switch(init_level4_pgt);
++#endif
++}
++
++/*
++ * The following four percpu variables are hot.  Align current_task to
++ * cacheline size such that all four fall in the same cacheline.
++ */
++DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
++      &init_task;
++EXPORT_PER_CPU_SYMBOL(current_task);
++
++DEFINE_PER_CPU(unsigned long, kernel_stack) =
++      (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
++EXPORT_PER_CPU_SYMBOL(kernel_stack);
++
++DEFINE_PER_CPU(char *, irq_stack_ptr) =
++      init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
++
++DEFINE_PER_CPU(unsigned int, irq_count) = -1;
++
++#ifndef CONFIG_X86_NO_TSS
++/*
++ * Special IST stacks which the CPU switches to when it calls
++ * an IST-marked descriptor entry. Up to 7 stacks (hardware
++ * limit), all of them are 4K, except the debug stack which
++ * is 8K.
++ */
++static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
++        [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
++        [DEBUG_STACK - 1]                     = DEBUG_STKSZ
++};
++
++static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
++      [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
++#endif
++
++/*
++ * Set up the 64-bit system call entry points for this CPU.
++ *
++ * Natively this programs the STAR, LSTAR, CSTAR and SYSCALL_MASK MSRs.
++ * Under Xen no MSR is written; without CONFIG_IA32_EMULATION the
++ * 32-bit syscall callback is registered with the hypervisor to point at
++ * ignore_sysret, and a warning is printed if that fails.  With
++ * CONFIG_IA32_EMULATION, syscall32_cpu_init() is called instead.
++ */
++void __cpuinit syscall_init(void)
++{
++#ifndef CONFIG_XEN
++      /*
++       * LSTAR and STAR live in a bit strange symbiosis.
++       * They both write to the same internal register. STAR allows to
++       * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
++       */
++      wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
++      wrmsrl(MSR_LSTAR, system_call);
++      wrmsrl(MSR_CSTAR, ignore_sysret);
++#endif
++
++#ifdef CONFIG_IA32_EMULATION
++      syscall32_cpu_init();
++#elif defined(CONFIG_XEN)
++      static const struct callback_register __cpuinitconst cstar = {
++              .type = CALLBACKTYPE_syscall32,
++              .address = (unsigned long)ignore_sysret
++      };
++
++      if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
++              printk(KERN_WARNING "Unable to register CSTAR callback\n");
++#endif
++
++#ifndef CONFIG_XEN
++      /* Flags to clear on syscall */
++      wrmsrl(MSR_SYSCALL_MASK,
++             X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
++#endif
++}
++
++unsigned long kernel_eflags;
++
++#ifndef CONFIG_X86_NO_TSS
++/*
++ * Copies of the original ist values from the tss are only accessed during
++ * debugging, no special alignment required.
++ */
++DEFINE_PER_CPU(struct orig_ist, orig_ist);
++#endif
++
++#else /* CONFIG_X86_64 */
++
++DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
++EXPORT_PER_CPU_SYMBOL(current_task);
++
++#ifdef CONFIG_CC_STACKPROTECTOR
++DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
++#endif
++
++/*
++ * Prepare @regs for an idle thread: zero the whole structure, then set
++ * %fs to __KERNEL_PERCPU and %gs to __KERNEL_STACK_CANARY so both are
++ * initialized properly.  Returns @regs.
++ */
++struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
++{
++      memset(regs, 0, sizeof(*regs));
++
++      regs->gs = __KERNEL_STACK_CANARY;
++      regs->fs = __KERNEL_PERCPU;
++
++      return regs;
++}
++#endif        /* CONFIG_X86_64 */
++
++/*
++ * Zero the six usable debug registers: db0-db3, db6 and db7.
++ */
++static void clear_all_debug_regs(void)
++{
++      int reg;
++
++      for (reg = 0; reg < 8; reg++) {
++              /* Ignore db4, db5 */
++              if (reg != 4 && reg != 5)
++                      set_debugreg(0, reg);
++      }
++}
++
++#ifdef CONFIG_KGDB
++/*
++ * Restore debug regs if using kgdbwait and you have a kernel debugger
++ * connection established: calls the arch correct_hw_break() hook when
++ * a debugger is connected and the hook is provided.
++ */
++static void dbg_restore_debug_regs(void)
++{
++      if (likely(!kgdb_connected || !arch_kgdb_ops.correct_hw_break))
++              return;
++
++      arch_kgdb_ops.correct_hw_break();
++}
++#else /* ! CONFIG_KGDB */
++#define dbg_restore_debug_regs()
++#endif /* ! CONFIG_KGDB */
++
++/*
++ * cpu_init() initializes state that is per-CPU. Some data is already
++ * initialized (naturally) in the bootstrap process, such as the GDT
++ * and IDT. We reload them nevertheless, this function acts as a
++ * 'CPU state barrier', nothing should get across.
++ * A lot of state is already set up in PDA init for 64 bit
++ */
++#ifdef CONFIG_X86_64
++
++/*
++ * 64-bit variant.  Panics if this CPU was already marked initialized.
++ * The TSS/IST setup is compiled out with CONFIG_X86_NO_TSS and the IDT
++ * load with CONFIG_X86_NO_IDT.
++ */
++void __cpuinit cpu_init(void)
++{
++#ifndef CONFIG_X86_NO_TSS
++      struct orig_ist *oist;
++      struct tss_struct *t;
++      unsigned long v;
++      int i;
++#endif
++      struct task_struct *me;
++      int cpu;
++
++      cpu = stack_smp_processor_id();
++      /* CPU 0 is initialised in head64.c */
++      if (cpu != 0)
++              xen_switch_pt();
++#ifndef CONFIG_X86_NO_TSS
++      t = &per_cpu(init_tss, cpu);
++      oist = &per_cpu(orig_ist, cpu);
++#endif
++
++#ifdef CONFIG_NUMA
++      /* Secondary CPU still on node 0: use the early cpu-to-node mapping */
++      if (cpu != 0 && percpu_read(numa_node) == 0 &&
++          early_cpu_to_node(cpu) != NUMA_NO_NODE)
++              set_numa_node(early_cpu_to_node(cpu));
++#endif
++
++      me = current;
++
++      if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
++              panic("CPU#%d already initialized!\n", cpu);
++
++      pr_debug("Initializing CPU#%d\n", cpu);
++
++      clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
++
++      /*
++       * Initialize the per-CPU GDT with the boot GDT,
++       * and set up the GDT descriptor:
++       */
++
++      switch_to_new_gdt(cpu);
++      loadsegment(fs, 0);
++
++#ifndef CONFIG_X86_NO_IDT
++      load_idt((const struct desc_ptr *)&idt_descr);
++#endif
++
++      /* Clear the task's TLS descriptors (8 bytes each) */
++      memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
++      syscall_init();
++
++      wrmsrl(MSR_FS_BASE, 0);
++      wrmsrl(MSR_KERNEL_GS_BASE, 0);
++      barrier();
++
++      x86_configure_nx();
++#ifdef CONFIG_X86_LOCAL_APIC
++      if (cpu != 0)
++              enable_x2apic();
++#endif
++
++#ifndef CONFIG_X86_NO_TSS
++      /*
++       * set up and load the per-CPU TSS
++       */
++      if (!oist->ist[0]) {
++              char *estacks = per_cpu(exception_stacks, cpu);
++
++              /* Each IST entry is the end address of its stack */
++              for (v = 0; v < N_EXCEPTION_STACKS; v++) {
++                      estacks += exception_stack_sizes[v];
++                      oist->ist[v] = t->x86_tss.ist[v] =
++                                      (unsigned long)estacks;
++              }
++      }
++
++      t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
++
++      /*
++       * <= is required because the CPU will access up to
++       * 8 bits beyond the end of the IO permission bitmap.
++       */
++      for (i = 0; i <= IO_BITMAP_LONGS; i++)
++              t->io_bitmap[i] = ~0UL;
++#endif
++
++      /* This task runs on init_mm as a lazy-TLB user and must have no mm */
++      atomic_inc(&init_mm.mm_count);
++      me->active_mm = &init_mm;
++      BUG_ON(me->mm);
++      enter_lazy_tlb(&init_mm, me);
++
++      load_sp0(t, &current->thread);
++#ifndef CONFIG_X86_NO_TSS
++      set_tss_desc(cpu, t);
++      load_TR_desc();
++#endif
++      load_LDT(&init_mm.context);
++
++      clear_all_debug_regs();
++      dbg_restore_debug_regs();
++
++      fpu_init();
++      xsave_init();
++
++      /* Record the kernel's EFLAGS in kernel_eflags */
++#ifndef CONFIG_XEN
++      raw_local_save_flags(kernel_eflags);
++#else
++      asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
++      if (raw_irqs_disabled())
++              kernel_eflags &= ~X86_EFLAGS_IF;
++#endif
++
++#ifdef CONFIG_X86_LOCAL_APIC
++      if (is_uv_system())
++              uv_cpu_init();
++#endif
++}
++
++#else
++
++/*
++ * 32-bit variant.  If this CPU was already marked initialized, a
++ * warning is printed and the CPU loops forever with interrupts
++ * enabled.
++ */
++void __cpuinit cpu_init(void)
++{
++      int cpu = smp_processor_id();
++      struct task_struct *curr = current;
++#ifndef CONFIG_X86_NO_TSS
++      struct tss_struct *t = &per_cpu(init_tss, cpu);
++#endif
++      struct thread_struct *thread = &curr->thread;
++
++      if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
++              printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
++              for (;;)
++                      local_irq_enable();
++      }
++
++      printk(KERN_INFO "Initializing CPU#%d\n", cpu);
++
++      if (cpu_has_vme || cpu_has_de)
++              clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
++
++      switch_to_new_gdt(cpu);
++
++      /*
++       * Set up and load the per-CPU TSS and LDT
++       */
++      /* This task runs on init_mm as a lazy-TLB user and must have no mm */
++      atomic_inc(&init_mm.mm_count);
++      curr->active_mm = &init_mm;
++      BUG_ON(curr->mm);
++      enter_lazy_tlb(&init_mm, curr);
++
++      /* NOTE(review): "t" is only declared without CONFIG_X86_NO_TSS */
++      load_sp0(t, thread);
++
++      load_LDT(&init_mm.context);
++
++#ifndef CONFIG_X86_NO_TSS
++      t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
++#endif
++
++#ifdef CONFIG_DOUBLEFAULT
++      /* Set up doublefault TSS pointer in the GDT */
++      __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
++#endif
++
++      clear_all_debug_regs();
++      dbg_restore_debug_regs();
++
++      fpu_init();
++      xsave_init();
++}
++#endif
diff --cc arch/x86/kernel/cpu/intel.c

index 1edf5ba,df86bc8..9bd1a4d
--- 1/arch/x86/kernel/cpu/intel.c
--- 2/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@@ -36,10 -36,10 +36,15 @@@ static void __cpuinit early_init_intel(
                 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
   
                 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
++#ifndef CONFIG_XEN
                         misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
                         wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
                         c->cpuid_level = cpuid_eax(0);
                         get_cpu_cap(c);
++#else
++                      pr_warning("CPUID levels are restricted -"
++                                 " update hypervisor\n");
++#endif
                 }
         }
   
@@@ -55,6 -55,6 +60,9 @@@
          * need the microcode to have already been loaded... so if it is
          * not, recommend a BIOS update and disable large pages.
          */
++#ifdef CONFIG_XEN
++      if (cpu_has(c, X86_FEATURE_PSE))
++#endif
         if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) {
                 u32 ucode, junk;
   
@@@ -91,8 -91,8 +99,10 @@@
         if (c->x86_power & (1 << 8)) {
                 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
++#ifndef CONFIG_XEN
                 if (!check_tsc_unstable())
                         sched_clock_stable = 1;
++#endif
         }
   
         /*
@@@ -238,9 -227,9 +237,13 @@@ static void __cpuinit intel_workarounds
                 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
                 if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
                         printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
++#ifndef CONFIG_XEN
                         printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
                         lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
                         wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
++#else
++                      pr_warning("CPU: Hypervisor update needed\n");
++#endif
                 }
         }
   
@@@ -285,6 -274,6 +288,7 @@@ static void __cpuinit intel_workarounds
   }
   #endif
   
++#ifndef CONFIG_XEN
   static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
   {
   #ifdef CONFIG_NUMA
@@@ -357,6 -346,6 +361,7 @@@ static void __cpuinit detect_vmx_virtca
                         set_cpu_cap(c, X86_FEATURE_VPID);
         }
   }
++#endif
   
   static void __cpuinit init_intel(struct cpuinfo_x86 *c)
   {
@@@ -440,6 -431,6 +447,7 @@@
                 set_cpu_cap(c, X86_FEATURE_P3);
   #endif
   
++#ifndef CONFIG_XEN
         if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
                 /*
                  * let's use the legacy cpuid vector 0x1 and 0x4 for topology
@@@ -456,6 -447,6 +464,7 @@@
   
         if (cpu_has(c, X86_FEATURE_VMX))
                 detect_vmx_virtcap(c);
++#endif
   }
   
   #ifdef CONFIG_X86_32
diff --cc arch/x86/kernel/cpu/intel_cacheinfo.c

index c105c53,1ce1af2..61af0f2
--- 1/arch/x86/kernel/cpu/intel_cacheinfo.c
--- 2/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@@ -290,8 -290,8 +290,9 @@@ amd_cpuid4(int leaf, union _cpuid4_leaf
         eax->split.type = types[leaf];
         eax->split.level = levels[leaf];
         eax->split.num_threads_sharing = 0;
++#ifndef CONFIG_XEN
         eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
--
++#endif
   
         if (assoc == 0xffff)
                 eax->split.is_fully_associative = 1;
@@@ -309,7 -309,7 +310,7 @@@ struct _cache_attr 
                          unsigned int);
   };
   
--#ifdef CONFIG_AMD_NB
++#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN)
   
   /*
    * L3 cache descriptors
@@@ -602,8 -614,8 +615,8 @@@ unsigned int __cpuinit init_intel_cache
         unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
         unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
         unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
--      unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
   #ifdef CONFIG_X86_HT
++      unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
         unsigned int cpu = c->cpu_index;
   #endif
   
@@@ -637,16 -649,16 +650,20 @@@
                                         break;
                                 case 2:
                                         new_l2 = this_leaf.size/1024;
++#ifdef CONFIG_X86_HT
                                         num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
                                         index_msb = get_count_order(num_threads_sharing);
                                         l2_id = c->apicid >> index_msb;
++#endif
                                         break;
                                 case 3:
                                         new_l3 = this_leaf.size/1024;
++#ifdef CONFIG_X86_HT
                                         num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
                                         index_msb = get_count_order(
                                                         num_threads_sharing);
                                         l3_id = c->apicid >> index_msb;
++#endif
                                         break;
                                 default:
                                         break;
@@@ -747,7 -759,7 +764,7 @@@
   static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
   #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
   
--#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
   static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
   {
         struct _cpuid4_info     *this_leaf, *sibling_leaf;
@@@ -988,7 -1000,7 +1005,7 @@@ static struct attribute *default_attrs[
         NULL
   };
   
--#ifdef CONFIG_AMD_NB
++#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN)
   static struct attribute ** __cpuinit amd_l3_attrs(void)
   {
         static struct attribute **attrs;
@@@ -1134,7 -1146,7 +1151,7 @@@ static int __cpuinit cache_add_dev(stru
                 this_leaf = CPUID4_INFO_IDX(cpu, i);
   
                 ktype_cache.default_attrs = default_attrs;
--#ifdef CONFIG_AMD_NB
++#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN)
                 if (this_leaf->l3)
                         ktype_cache.default_attrs = amd_l3_attrs();
   #endif
diff --cc arch/x86/kernel/cpu/mcheck/Makefile

index bb34b03,bb34b03..21e0a8a
--- 1/arch/x86/kernel/cpu/mcheck/Makefile
--- 2/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@@ -3,6 -3,6 +3,7 @@@ obj-y                            =  mce.o mce-severity.
   obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
   obj-$(CONFIG_X86_MCE_INTEL)   += mce_intel.o
   obj-$(CONFIG_X86_MCE_AMD)     += mce_amd.o
++obj-$(CONFIG_X86_XEN_MCE)     += mce_dom0.o
   obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
   obj-$(CONFIG_X86_MCE_INJECT)  += mce-inject.o
   
diff --cc arch/x86/kernel/cpu/mcheck/mce-inject.c

index 0ed633c,0ed633c..589e381
--- 1/arch/x86/kernel/cpu/mcheck/mce-inject.c
--- 2/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@@ -145,7 -145,7 +145,7 @@@ static void raise_mce(struct mce *m
         if (context == MCJ_CTX_RANDOM)
                 return;
   
--#ifdef CONFIG_X86_LOCAL_APIC
++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
         if (m->inject_flags & MCJ_NMI_BROADCAST) {
                 unsigned long start;
                 int cpu;
diff --cc arch/x86/kernel/cpu/mcheck/mce.c

index ff1ae9b,3385ea2..70e4c87
--- 1/arch/x86/kernel/cpu/mcheck/mce.c
--- 2/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@@ -122,10 -136,10 +136,12 @@@ void mce_setup(struct mce *m
         m->time = get_seconds();
         m->cpuvendor = boot_cpu_data.x86_vendor;
         m->cpuid = cpuid_eax(1);
++#ifndef CONFIG_XEN
   #ifdef CONFIG_SMP
         m->socketid = cpu_data(m->extcpu).phys_proc_id;
   #endif
         m->apicid = cpu_data(m->extcpu).initial_apicid;
++#endif
         rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
   }
   
@@@ -469,7 -477,7 +479,9 @@@ static inline void mce_get_rip(struct m
    */
   asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
   {
++#ifndef CONFIG_XEN
         ack_APIC_irq();
++#endif
         exit_idle();
         irq_enter();
         mce_notify_irq();
@@@ -492,7 -500,7 +504,7 @@@ static void mce_report_event(struct pt_
                 return;
         }
   
--#ifdef CONFIG_X86_LOCAL_APIC
++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
         /*
          * Without APIC do not notify. The event will be picked
          * up eventually.
@@@ -1138,8 -1147,8 +1151,15 @@@ void mce_log_therm_throt_event(__u64 st
    * Periodic polling timer for "silent" machine check errors.  If the
    * poller finds an MCE, poll 2x faster.  When the poller finds no more
    * errors, poll 2x slower (up to check_interval seconds).
++ *
++ * Polling is disabled in Dom0 because all CMCI/polling handling
++ * is done by Xen for Intel CPUs.
    */
++#if defined (CONFIG_X86_XEN_MCE)
++static int check_interval = 0; /* disable polling */
++#else
   static int check_interval = 5 * 60; /* 5 minutes */
++#endif
   
   static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
   static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@@ -1304,6 -1313,6 +1324,7 @@@ static int __cpuinit __mcheck_cpu_apply
   
         /* This should be disabled by the BIOS, but isn't always */
         if (c->x86_vendor == X86_VENDOR_AMD) {
++#ifndef CONFIG_XEN
                 if (c->x86 == 15 && banks > 4) {
                         /*
                          * disable GART TBL walk error reporting, which
@@@ -1312,6 -1321,6 +1333,7 @@@
                          */
                         clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
                 }
++#endif
                 if (c->x86 <= 17 && mce_bootlog < 0) {
                         /*
                          * Lots of broken BIOS around that don't clear them
@@@ -1379,6 -1388,6 +1401,7 @@@ static void __cpuinit __mcheck_cpu_anci
   
   static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
   {
++#ifndef CONFIG_X86_64_XEN
         switch (c->x86_vendor) {
         case X86_VENDOR_INTEL:
                 mce_intel_feature_init(c);
@@@ -1389,6 -1398,6 +1412,7 @@@
         default:
                 break;
         }
++#endif
   }
   
   static void __mcheck_cpu_init_timer(void)
@@@ -2134,6 -2145,6 +2160,16 @@@ static __init int mcheck_init_device(vo
         register_hotcpu_notifier(&mce_cpu_notifier);
         misc_register(&mce_log_device);
   
++#ifdef CONFIG_X86_XEN_MCE
++      if (is_initial_xendomain()) {
++              /* Register vIRQ handler for MCE LOG processing */
++              extern int bind_virq_for_mce(void);
++
++              printk(KERN_DEBUG "MCE: bind virq for DOM0 logging\n");
++              bind_virq_for_mce();
++      }
++#endif
++
         return err;
   }
   
diff --cc arch/x86/kernel/cpu/mcheck/mce_dom0.c

index 0000000,0000000..aecee08

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_dom0.c
@@@ -1,0 -1,0 +1,187 @@@
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/slab.h>
++#include <xen/interface/xen.h>
++#include <xen/evtchn.h>
++#include <xen/interface/vcpu.h>
++#include <asm/hypercall.h>
++#include <asm/mce.h>
++
++static xen_mc_logical_cpu_t *g_physinfo;
++static unsigned int ncpus;
++
++/*
++ * Convert one Xen machine-check telemetry record into native struct mce
++ * entries and hand each bank entry to mce_log().
++ *
++ * @mi: telemetry buffer previously filled in by a XEN_MC_fetch hypercall.
++ *
++ * The global entry supplies the MCG status, APIC id and socket id; the APIC
++ * id is then matched against the g_physinfo[] table to obtain the CPU number
++ * and vendor.  Every MC_TYPE_BANK entry found from the first bank entry
++ * onwards is logged as a separate struct mce.
++ *
++ * Returns 0 on success, or -1 if the record has no global entry or its APIC
++ * id is not present in g_physinfo[].
++ */
++static int convert_log(struct mc_info *mi)
++{
++      struct mcinfo_common *mic = NULL;
++      struct mcinfo_global *mc_global;
++      struct mcinfo_bank *mc_bank;
++      struct mce m;
++      unsigned int i;
++
++      x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
++      if (mic == NULL) {
++              pr_err("DOM0_MCE_LOG: global data is NULL\n");
++              return -1;
++      }
++
++      mce_setup(&m);
++      mc_global = (struct mcinfo_global *)mic;
++      m.mcgstatus = mc_global->mc_gstatus;
++      m.apicid = mc_global->mc_apicid;
++
++      for (i = 0; i < ncpus; i++)
++              if (g_physinfo[i].mc_apicid == m.apicid)
++                      break;
++      /*
++       * No match leaves i == ncpus; indexing g_physinfo[] with it would
++       * read past the end of the table, so give up on this record instead.
++       */
++      if (WARN_ON_ONCE(i == ncpus))
++              return -1;
++
++      m.socketid = mc_global->mc_socketid;
++      m.cpu = m.extcpu = g_physinfo[i].mc_cpunr;
++      m.cpuvendor = (__u8)g_physinfo[i].mc_vendor;
++
++      x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK);
++      /* Walk the entries until the list ends (NULL or zero-sized entry). */
++      for (; mic != NULL && mic->size != 0; mic = x86_mcinfo_next(mic)) {
++              if (mic->type != MC_TYPE_BANK)
++                      continue;
++
++              mc_bank = (struct mcinfo_bank *)mic;
++              m.misc = mc_bank->mc_misc;
++              m.status = mc_bank->mc_status;
++              m.addr = mc_bank->mc_addr;
++              m.tsc = mc_bank->mc_tsc;
++              m.bank = mc_bank->mc_bank;
++              /* Argument order must follow the format: CPU first, then bank. */
++              printk(KERN_DEBUG "[CPU%d, BANK%d, addr %llx, state %llx]\n",
++                     m.cpu, m.bank, m.addr, m.status);
++              /* log this record */
++              mce_log(&m);
++      }
++
++      return 0;
++}
++
++static struct mc_info *g_mi;
++
++/*
++ * Dom0 MCE vIRQ handler: logs physical machine-check error info.
++ *
++ * Repeatedly issues XEN_MC_fetch into the g_mi buffer, first with
++ * XEN_MC_URGENT and then with XEN_MC_NONURGENT.  Each fetched record is
++ * passed to convert_log() and then acknowledged with XEN_MC_ACK.  A queue is
++ * treated as drained when the hypercall fails or reports XEN_MC_NODATA or
++ * XEN_MC_FETCHFAILED.  A convert_log() failure ends the handler at once,
++ * without acknowledging that record.  Always returns IRQ_HANDLED; neither
++ * 'irq' nor 'dev_id' is used.
++ */
++
++static irqreturn_t mce_dom0_interrupt(int irq, void *dev_id)
++{
++      xen_mc_t mc_op;
++      int result = 0;
++
++      printk(KERN_DEBUG "MCE_DOM0_LOG: enter dom0 mce vIRQ handler\n");
++      mc_op.cmd = XEN_MC_fetch;
++      mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
++      set_xen_guest_handle(mc_op.u.mc_fetch.data, g_mi);
++      /* Phase 1: drain the urgent queue, one record per iteration. */
++urgent:
++      mc_op.u.mc_fetch.flags = XEN_MC_URGENT;
++      result = HYPERVISOR_mca(&mc_op);
++      if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
++                      mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
++      {
++              printk(KERN_DEBUG "MCE_DOM0_LOG: No more urgent data\n");
++              goto nonurgent;
++      }
++      else
++      {
++              result = convert_log(g_mi);
++              if (result) {
++                      pr_err("MCE_DOM0_LOG: Log conversion failed\n");
++                      goto end;
++              }
++              /* After fetching the telem from DOM0, we need to dec the telem's
++               * refcnt and release the entry. The telem is reserved and inc
++               * refcnt when filling the telem.
++               */
++              mc_op.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK;
++              /* The result of the ACK hypercall is not checked. */
++              result = HYPERVISOR_mca(&mc_op);
++
++              goto urgent;
++      }
++      /* Phase 2: same loop for the non-urgent queue. */
++nonurgent:
++      mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT;
++      result = HYPERVISOR_mca(&mc_op);
++      if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
++                      mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
++      {
++              printk(KERN_DEBUG "MCE_DOM0_LOG: No more nonurgent data\n");
++              goto end;
++      }
++      else
++      {
++              result = convert_log(g_mi);
++              if (result) {
++                      pr_err("MCE_DOM0_LOG: Log conversion failed\n");
++                      goto end;
++              }
++              /* After fetching the telem from DOM0, we need to dec the telem's
++               * refcnt and release the entry. The telem is reserved and inc
++               * refcnt when filling the telem.
++               */
++              mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
++              /* The result of the ACK hypercall is not checked. */
++              result = HYPERVISOR_mca(&mc_op);
++
++              goto nonurgent;
++      }
++end:
++      return IRQ_HANDLED;
++}
++
++/*
++ * Set up Dom0 machine-check logging.
++ *
++ * Allocates the telemetry buffer (g_mi), asks Xen for the number of physical
++ * CPUs and then for the per-CPU info table (g_physinfo), binds VIRQ_MCA to
++ * mce_dom0_interrupt(), and finally runs the handler once by hand to pick up
++ * records that were queued before the vIRQ was bound.
++ *
++ * Returns 0 on success, -ENOMEM on allocation failure, or the error returned
++ * by the hypercall / vIRQ binding.  On any failure both buffers are released
++ * and the globals are reset.
++ */
++int __init bind_virq_for_mce(void)
++{
++      int ret;
++      xen_mc_t mc_op;
++
++      g_mi = kmalloc(sizeof(*g_mi), GFP_KERNEL);
++      if (!g_mi)
++              return -ENOMEM;
++
++      /* fetch physical CPU count */
++      mc_op.cmd = XEN_MC_physcpuinfo;
++      mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
++      set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, NULL);
++      ret = HYPERVISOR_mca(&mc_op);
++      if (ret) {
++              pr_err("MCE: Failed to get physical CPU count\n");
++              goto err_free;
++      }
++
++      /* fetch CPU physical info for later reference */
++      ncpus = mc_op.u.mc_physcpuinfo.ncpus;
++      /* kcalloc() rejects a count whose byte size would overflow. */
++      g_physinfo = kcalloc(ncpus, sizeof(*g_physinfo), GFP_KERNEL);
++      if (!g_physinfo) {
++              ret = -ENOMEM;
++              goto err_free;
++      }
++      set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
++      ret = HYPERVISOR_mca(&mc_op);
++      if (ret) {
++              pr_err("MCE: Failed to get physical CPUs' info\n");
++              goto err_free;
++      }
++
++      ret = bind_virq_to_irqhandler(VIRQ_MCA, 0,
++                                    mce_dom0_interrupt, 0, "mce", NULL);
++      if (ret < 0) {
++              pr_err("MCE: Failed to bind vIRQ for Dom0\n");
++              goto err_free;
++      }
++
++      /* Log the machine checks left over from the previous reset. */
++      mce_dom0_interrupt(VIRQ_MCA, NULL);
++
++      return 0;
++
++err_free:
++      /* kfree(NULL) is a no-op, so this is safe at every failure point. */
++      kfree(g_physinfo);
++      g_physinfo = NULL;
++      ncpus = 0;
++      kfree(g_mi);
++      g_mi = NULL;
++      return ret;
++}
++
diff --cc arch/x86/kernel/cpu/mtrr/Makefile

index ad9e5ed,ad9e5ed..b854116
--- 1/arch/x86/kernel/cpu/mtrr/Makefile
--- 2/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@@ -1,3 -1,3 +1,4 @@@
   obj-y         := main.o if.o generic.o cleanup.o
   obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
   
++obj-$(CONFIG_XEN) := main.o if.o
diff --cc arch/x86/kernel/cpu/mtrr/main-xen.c

index 0000000,0000000..f58433f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
@@@ -1,0 -1,0 +1,324 @@@
++#define DEBUG
++
++#include <linux/uaccess.h>
++#include <linux/module.h>
++#include <linux/mutex.h>
++#include <linux/init.h>
++
++#include <asm/mtrr.h>
++#include "mtrr.h"
++
++static DEFINE_MUTEX(mtrr_mutex);
++
++/*
++ * Read memory-type register 'reg' through the XENPF_read_memtype platform
++ * op and report its base (mfn), size (nr_mfns) and type.  If the hypercall
++ * fails, all three outputs are reported as zero.
++ */
++void generic_get_mtrr(unsigned int reg, unsigned long *base,
++                    unsigned long *size, mtrr_type * type)
++{
++      struct xen_platform_op req;
++      int failed;
++
++      req.cmd = XENPF_read_memtype;
++      req.u.read_memtype.reg = reg;
++
++      failed = HYPERVISOR_platform_op(&req);
++      if (unlikely(failed))
++              memset(&req.u.read_memtype, 0, sizeof(req.u.read_memtype));
++
++      *size = req.u.read_memtype.nr_mfns;
++      *base = req.u.read_memtype.mfn;
++      *type = req.u.read_memtype.type;
++}
++
++/* The only ops table in this file: just a hypercall-backed 'get'. */
++const struct mtrr_ops generic_mtrr_ops = {
++      .use_intel_if      = 1,
++      .get               = generic_get_mtrr,
++};
++
++const struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
++/* Number of registers readable via XENPF_read_memtype; set by set_num_var_ranges(). */
++unsigned int num_var_ranges;
++/* Per-register use count: bumped in mtrr_add_page(), dropped in mtrr_del_page(). */
++unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
++
++/* Masked MSR_K8_TOP_MEM2 value, filled in by mtrr_bp_init(); 0 when unused. */
++static u64 tom2;
++
++/*
++ * Count the memory-type registers by probing XENPF_read_memtype with
++ * increasing register numbers until the hypercall reports an error.
++ */
++static void __init set_num_var_ranges(void)
++{
++      struct xen_platform_op probe;
++
++      num_var_ranges = 0;
++      while (1) {
++              probe.cmd = XENPF_read_memtype;
++              probe.u.read_memtype.reg = num_var_ranges;
++              if (HYPERVISOR_platform_op(&probe))
++                      break;
++              num_var_ranges++;
++      }
++}
++
++/* Reset the use count of every register counted in num_var_ranges. */
++static void __init init_table(void)
++{
++      int reg;
++      const int count = num_var_ranges;
++
++      for (reg = 0; reg != count && reg < count; reg++)
++              mtrr_usage_table[reg] = 0;
++}
++
++/*
++ * Ask Xen (XENPF_add_memtype) to set the memory type of 'size' pages
++ * starting at page frame 'base'.
++ *
++ * When 'increment' is true the use count of the register chosen by the
++ * hypervisor is bumped.  Returns that register number on success or the
++ * (negative) hypercall error; a positive error value is treated as a bug.
++ */
++int mtrr_add_page(unsigned long base, unsigned long size,
++                unsigned int type, bool increment)
++{
++      struct xen_platform_op req;
++      int failure;
++
++      mutex_lock(&mtrr_mutex);
++
++      req.cmd = XENPF_add_memtype;
++      req.u.add_memtype.mfn     = base;
++      req.u.add_memtype.nr_mfns = size;
++      req.u.add_memtype.type    = type;
++
++      failure = HYPERVISOR_platform_op(&req);
++      if (!failure) {
++              if (increment)
++                      ++mtrr_usage_table[req.u.add_memtype.reg];
++
++              mutex_unlock(&mtrr_mutex);
++
++              return req.u.add_memtype.reg;
++      }
++
++      mutex_unlock(&mtrr_mutex);
++      BUG_ON(failure > 0);
++      return failure;
++}
++
++/*
++ * Verify that both 'base' and 'size' are page aligned.  Returns 0 if they
++ * are; otherwise logs a warning with a stack dump and returns -1.
++ */
++static int mtrr_check(unsigned long base, unsigned long size)
++{
++      const unsigned long offset_bits = PAGE_SIZE - 1;
++
++      /* Any low bit set in either value means it is not page aligned. */
++      if (!((base | size) & offset_bits))
++              return 0;
++
++      pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
++      pr_debug("mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
++      dump_stack();
++      return -1;
++}
++
++/*
++ * Byte-granular front end to mtrr_add_page(): rejects unaligned requests
++ * with -EINVAL, otherwise converts 'base' and 'size' to page units.
++ */
++int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
++           bool increment)
++{
++      unsigned long first_page, nr_pages;
++
++      if (mtrr_check(base, size) != 0)
++              return -EINVAL;
++
++      first_page = base >> PAGE_SHIFT;
++      nr_pages = size >> PAGE_SHIFT;
++      return mtrr_add_page(first_page, nr_pages, type, increment);
++}
++EXPORT_SYMBOL(mtrr_add);
++
++/*
++ * Drop one reference on a memory-type register and, when the count reaches
++ * zero, ask Xen (XENPF_del_memtype) to remove it.
++ *
++ * @reg:  register number, or a negative value to search for the register
++ *        whose base and size match exactly.
++ * @base: first page frame of the region (only used when searching).
++ * @size: number of pages in the region (only used when searching).
++ *
++ * Returns the register number on success, -EINVAL if no register matches,
++ * the register number is out of range or its use count is already zero, or
++ * the (negative) hypercall error.
++ */
++int mtrr_del_page(int reg, unsigned long base, unsigned long size)
++{
++      unsigned i;
++      mtrr_type ltype;
++      unsigned long lbase, lsize;
++      int error = -EINVAL;
++      struct xen_platform_op op;
++
++      mutex_lock(&mtrr_mutex);
++
++      if (reg < 0) {
++              /*  Search for existing MTRR  */
++              for (i = 0; i < num_var_ranges; ++i) {
++                      mtrr_if->get(i, &lbase, &lsize, &ltype);
++                      if (lbase == base && lsize == size) {
++                              reg = i;
++                              break;
++                      }
++              }
++              if (reg < 0) {
++                      pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
++                               base, size);
++                      goto out;
++              }
++      }
++      /* Never index mtrr_usage_table[] with a caller-supplied bogus value. */
++      if (reg >= MTRR_MAX_VAR_RANGES) {
++              pr_warning("mtrr: register: %d too big\n", reg);
++              goto out;
++      }
++      if (mtrr_usage_table[reg] < 1) {
++              pr_warning("mtrr: reg: %d has count=0\n", reg);
++              goto out;
++      }
++      /* Only the last user actually removes the register in the hypervisor. */
++      if (--mtrr_usage_table[reg] < 1) {
++              op.cmd = XENPF_del_memtype;
++              op.u.del_memtype.handle = 0;
++              op.u.del_memtype.reg    = reg;
++              error = HYPERVISOR_platform_op(&op);
++              if (error) {
++                      BUG_ON(error > 0);
++                      goto out;
++              }
++      }
++      error = reg;
++ out:
++      mutex_unlock(&mtrr_mutex);
++      return error;
++}
++
++/*
++ * Byte-granular front end to mtrr_del_page(): rejects unaligned requests
++ * with -EINVAL, otherwise converts 'base' and 'size' to page units.
++ */
++int mtrr_del(int reg, unsigned long base, unsigned long size)
++{
++      unsigned long first_page, nr_pages;
++
++      if (mtrr_check(base, size) != 0)
++              return -EINVAL;
++
++      first_page = base >> PAGE_SHIFT;
++      nr_pages = size >> PAGE_SHIFT;
++      return mtrr_del_page(reg, first_page, nr_pages);
++}
++EXPORT_SYMBOL(mtrr_del);
++
++/*
++ * Returns the effective MTRR type for the region
++ * Error returns:
++ * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
++ * - 0xFF - when MTRR is not enabled
++ *
++ * 'start' is inclusive and 'end' exclusive, both byte addresses.  Outside
++ * the initial Xen domain the answer is always MTRR_TYPE_WRBACK.  The
++ * register contents are read with one XENPF_read_memtype hypercall per
++ * register.
++ */
++u8 mtrr_type_lookup(u64 start, u64 end)
++{
++      int i, error;
++      u64 start_mfn, end_mfn, base_mfn, top_mfn;
++      u8 prev_match, curr_match;
++      struct xen_platform_op op;
++
++      if (!is_initial_xendomain())
++              return MTRR_TYPE_WRBACK;
++
++      if (!num_var_ranges)
++              return 0xFF;
++
++      start_mfn = start >> PAGE_SHIFT;
++      /* Make end inclusive end, instead of exclusive */
++      /* NOTE(review): 'end' itself is decremented here and reused for the tom2 test below. */
++      end_mfn = --end >> PAGE_SHIFT;
++
++      /* Look in fixed ranges. Just return the type as per start */
++      /* The real lookup is compiled out, so frames below 0x100 always report uncachable. */
++      if (start_mfn < 0x100) {
++#if 0//todo
++              op.cmd = XENPF_read_memtype;
++              op.u.read_memtype.reg = ???;
++              error = HYPERVISOR_platform_op(&op);
++              if (!error)
++                      return op.u.read_memtype.type;
++#endif
++              return MTRR_TYPE_UNCACHABLE;
++      }
++
++      /*
++       * Look in variable ranges
++       * Look for multiple ranges matching this address and pick type
++       * as per MTRR precedence
++       */
++      prev_match = 0xFF;
++      for (i = 0; i < num_var_ranges; ++i) {
++              op.cmd = XENPF_read_memtype;
++              op.u.read_memtype.reg = i;
++              error = HYPERVISOR_platform_op(&op);
++
++              /* Skip registers that cannot be read or are empty. */
++              if (error || !op.u.read_memtype.nr_mfns)
++                      continue;
++
++              base_mfn = op.u.read_memtype.mfn;
++              top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1;
++
++              /* No overlap with the requested range at all. */
++              if (base_mfn > end_mfn || start_mfn > top_mfn) {
++                      continue;
++              }
++
++              /* Partial overlap: the range straddles this register's boundary. */
++              if (base_mfn > start_mfn || end_mfn > top_mfn) {
++                      return 0xFE;
++              }
++
++              curr_match = op.u.read_memtype.type;
++              /* First covering register: remember its type and keep looking. */
++              if (prev_match == 0xFF) {
++                      prev_match = curr_match;
++                      continue;
++              }
++
++              /* Uncachable wins over every other type. */
++              if (prev_match == MTRR_TYPE_UNCACHABLE ||
++                  curr_match == MTRR_TYPE_UNCACHABLE) {
++                      return MTRR_TYPE_UNCACHABLE;
++              }
++
++              /* Write-back combined with write-through yields write-through. */
++              if ((prev_match == MTRR_TYPE_WRBACK &&
++                   curr_match == MTRR_TYPE_WRTHROUGH) ||
++                  (prev_match == MTRR_TYPE_WRTHROUGH &&
++                   curr_match == MTRR_TYPE_WRBACK)) {
++                      prev_match = MTRR_TYPE_WRTHROUGH;
++                      curr_match = MTRR_TYPE_WRTHROUGH;
++              }
++
++              /* Any other mismatch between overlapping registers reports uncachable. */
++              if (prev_match != curr_match) {
++                      return MTRR_TYPE_UNCACHABLE;
++              }
++      }
++
++      /* With tom2 set, a range at or above 4GB and below tom2 reports write-back. */
++      if (tom2) {
++              if (start >= (1ULL<<32) && (end < tom2))
++                      return MTRR_TYPE_WRBACK;
++      }
++
++      if (prev_match != 0xFF)
++              return prev_match;
++
++      /* Reading the default type is compiled out, so fall back to uncachable. */
++#if 0//todo
++      op.cmd = XENPF_read_def_memtype;
++      error = HYPERVISOR_platform_op(&op);
++      if (!error)
++              return op.u.read_def_memtype.type;
++#endif
++      return MTRR_TYPE_UNCACHABLE;
++}
++
++/*
++ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
++ * for memory >4GB. Check for that here.
++ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
++ * apply to are wrong, but so far we don't know of any such case in the wild.
++ */
++#define Tom2Enabled (1U << 21)
++#define Tom2ForceMemTypeWB (1U << 22)
++
++/*
++ * Returns 1 when the boot CPU is an AMD family 0xf..0x11 part whose SYSCFG
++ * MSR has both Tom2Enabled and Tom2ForceMemTypeWB set, 0 otherwise
++ * (including when SYSCFG cannot be read).
++ */
++int __init amd_special_default_mtrr(void)
++{
++      const u32 force_wb = Tom2Enabled | Tom2ForceMemTypeWB;
++      u32 lo, hi;
++
++      /*
++       * The checks short-circuit in order: vendor, family, then the MSR
++       * read, which may fail if a hypervisor doesn't pass SYSCFG through.
++       */
++      if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
++          boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x11 &&
++          rdmsr_safe(MSR_K8_SYSCFG, &lo, &hi) >= 0) {
++              /*
++               * Memory between 4GB and top of mem is forced WB by this
++               * magic bit.  Reserved before K8RevF, but should be zero
++               * there.
++               */
++              if ((lo & force_wb) == force_wb)
++                      return 1;
++      }
++      return 0;
++}
++
++/*
++ * Boot-CPU setup: when amd_special_default_mtrr() reports the forced-WB
++ * configuration, cache the masked TOP_MEM2 MSR value in tom2.
++ */
++void __init mtrr_bp_init(void)
++{
++      if (!amd_special_default_mtrr())
++              return;
++
++      /* TOP_MEM2 */
++      rdmsrl(MSR_K8_TOP_MEM2, tom2);
++      tom2 &= 0xffffff8000000ULL;
++}
++
++/* No-op in this file: the body is empty. */
++void mtrr_ap_init(void)
++{
++}
++
++/*
++ * subsys initcall: in the initial Xen domain, on a CPU that advertises any
++ * of the MTRR-style features, count the available registers and clear the
++ * usage table.  Returns -ENODEV otherwise.
++ */
++static int __init mtrr_init(void)
++{
++      struct cpuinfo_x86 *c = &boot_cpu_data;
++      int has_mtrr_like;
++
++      if (!is_initial_xendomain())
++              return -ENODEV;
++
++      has_mtrr_like = cpu_has(c, X86_FEATURE_MTRR) ||
++                      cpu_has(c, X86_FEATURE_K6_MTRR) ||
++                      cpu_has(c, X86_FEATURE_CYRIX_ARR) ||
++                      cpu_has(c, X86_FEATURE_CENTAUR_MCR);
++      if (!has_mtrr_like)
++              return -ENODEV;
++
++      set_num_var_ranges();
++      init_table();
++
++      return 0;
++}
++
++subsys_initcall(mtrr_init);
diff --cc arch/x86/kernel/cpu/proc.c

index 62ac8cb,62ac8cb..440ec21
--- 1/arch/x86/kernel/cpu/proc.c
--- 2/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@@ -10,7 -10,7 +10,7 @@@
   static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
                               unsigned int cpu)
   {
--#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
         if (c->x86_max_cores * smp_num_siblings > 1) {
                 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
                 seq_printf(m, "siblings\t: %d\n",
@@@ -32,18 -32,18 +32,22 @@@ static void show_cpuinfo_misc(struct se
          */
         int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
         seq_printf(m,
++#ifndef CONFIG_XEN
                    "fdiv_bug\t: %s\n"
                    "hlt_bug\t\t: %s\n"
                    "f00f_bug\t: %s\n"
                    "coma_bug\t: %s\n"
++#endif
                    "fpu\t\t: %s\n"
                    "fpu_exception\t: %s\n"
                    "cpuid level\t: %d\n"
                    "wp\t\t: %s\n",
++#ifndef CONFIG_XEN
                    c->fdiv_bug ? "yes" : "no",
                    c->hlt_works_ok ? "no" : "yes",
                    c->f00f_bug ? "yes" : "no",
                    c->coma_bug ? "yes" : "no",
++#endif
                    c->hard_math ? "yes" : "no",
                    fpu_exception ? "yes" : "no",
                    c->cpuid_level,
diff --cc arch/x86/kernel/cpu/scattered.c

index c7f64e6,c7f64e6..706fed6
--- 1/arch/x86/kernel/cpu/scattered.c
--- 2/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@@ -40,6 -40,6 +40,7 @@@ void __cpuinit init_scattered_cpuid_fea
                 { X86_FEATURE_EPB,              CR_ECX, 3, 0x00000006, 0 },
                 { X86_FEATURE_XSAVEOPT,         CR_EAX, 0, 0x0000000d, 1 },
                 { X86_FEATURE_CPB,              CR_EDX, 9, 0x80000007, 0 },
++#ifndef CONFIG_XEN
                 { X86_FEATURE_NPT,              CR_EDX, 0, 0x8000000a, 0 },
                 { X86_FEATURE_LBRV,             CR_EDX, 1, 0x8000000a, 0 },
                 { X86_FEATURE_SVML,             CR_EDX, 2, 0x8000000a, 0 },
@@@ -50,6 -50,6 +51,7 @@@
                 { X86_FEATURE_DECODEASSISTS,    CR_EDX, 7, 0x8000000a, 0 },
                 { X86_FEATURE_PAUSEFILTER,      CR_EDX,10, 0x8000000a, 0 },
                 { X86_FEATURE_PFTHRESHOLD,      CR_EDX,12, 0x8000000a, 0 },
++#endif
                 { 0, 0, 0, 0, 0 }
         };
   
diff --cc arch/x86/kernel/cpu/topology.c

index 4397e98,4397e98..dc581ec
--- 1/arch/x86/kernel/cpu/topology.c
--- 2/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@@ -28,7 -28,7 +28,7 @@@
    */
   void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
   {
--#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
         unsigned int eax, ebx, ecx, edx, sub_index;
         unsigned int ht_mask_width, core_plus_mask_width;
         unsigned int core_select_mask, core_level_siblings;
diff --cc arch/x86/kernel/dumpstack.c
Simple merge
diff --cc arch/x86/kernel/dumpstack_64.c

index 3ee362b,e71c98d..520ce38
--- 1/arch/x86/kernel/dumpstack_64.c
--- 2/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@@ -21,6 -20,6 +21,7 @@@
   #define N_EXCEPTION_STACKS_END \
                 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
   
++#ifndef CONFIG_X86_NO_TSS
   static char x86_stack_ids[][8] = {
                 [ DEBUG_STACK-1                 ]       = "#DB",
                 [ NMI_STACK-1                   ]       = "NMI",
@@@ -32,10 -31,10 +33,12 @@@
                   N_EXCEPTION_STACKS_END        ]       = "#DB[?]"
   #endif
   };
++#endif
   
   static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
                                          unsigned *usedp, char **idp)
   {
++#ifndef CONFIG_X86_NO_TSS
         unsigned k;
   
         /*
@@@ -95,6 -94,6 +98,7 @@@
                 }
   #endif
         }
++#endif /* CONFIG_X86_NO_TSS */
         return NULL;
   }
   
diff --cc arch/x86/kernel/e820-xen.c

index 0000000,0000000..0d783f9

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/e820-xen.c
@@@ -1,0 -1,0 +1,1289 @@@
++/*
++ * Handle the memory map.
++ * The functions here do the job until bootmem takes over.
++ *
++ *  Getting sanitize_e820_map() in sync with i386 version by applying change:
++ *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
++ *     Alex Achenbach <xela@slit.de>, December 2002.
++ *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
++ *
++ */
++#include <linux/kernel.h>
++#include <linux/types.h>
++#include <linux/init.h>
++#include <linux/crash_dump.h>
++#include <linux/bootmem.h>
++#include <linux/pfn.h>
++#include <linux/suspend.h>
++#include <linux/acpi.h>
++#include <linux/firmware-map.h>
++#include <linux/memblock.h>
++
++#include <asm/e820.h>
++#include <asm/proto.h>
++#include <asm/setup.h>
++#include <xen/interface/memory.h>
++
++/*
++ * The e820 map is the map that gets modified e.g. with command line parameters
++ * and that is also registered with modifications in the kernel resource tree
++ * with the iomem_resource as parent.
++ *
++ * The e820_saved is directly saved after the BIOS-provided memory map is
++ * copied. It doesn't get modified afterwards. It's registered for the
++ * /sys/firmware/memmap interface.
++ *
++ * That memory map is not modified and is used as base for kexec. The kexec'd
++ * kernel should get the same memory map as the firmware provides. Then the
++ * user can e.g. boot the original kernel with mem=1G while still booting the
++ * next kernel with full memory.
++ *
++ * With CONFIG_XEN there is no separate e820_saved object: the name is a
++ * macro alias for the file-local machine_e820, so every use of e820_saved
++ * in this file operates on machine_e820 instead.
++ */
++struct e820map e820;
++#ifndef CONFIG_XEN
++struct e820map e820_saved;
++#else
++static struct e820map machine_e820;
++#define e820_saved machine_e820
++#endif
++
++/*
++ * For PCI or other memory-mapped resources.  The initializer looks like a
++ * recognisable placeholder only; e820_setup_gap() assigns the real value
++ * (the start of the gap it selects).
++ */
++unsigned long pci_mem_start = 0xaeedbabe;
++#ifdef CONFIG_PCI
++EXPORT_SYMBOL(pci_mem_start);
++#endif
++
++/*
++ * Report whether at least one map entry of the requested type intersects
++ * the range <start,end>.  A type of 0 matches entries of every type.
++ *
++ * With CONFIG_XEN the machine map is consulted, and only in the initial
++ * domain; any other domain always gets 0.
++ */
++int
++e820_any_mapped(u64 start, u64 end, unsigned type)
++{
++      int idx;
++
++#ifndef CONFIG_XEN
++      for (idx = 0; idx < e820.nr_map; idx++) {
++              struct e820entry *ent = &e820.map[idx];
++#else
++      if (!is_initial_xendomain())
++              return 0;
++      for (idx = 0; idx < machine_e820.nr_map; ++idx) {
++              const struct e820entry *ent = &machine_e820.map[idx];
++#endif
++
++              /* a hit needs a matching type and a non-empty intersection */
++              if ((!type || ent->type == type) &&
++                  ent->addr < end && ent->addr + ent->size > start)
++                      return 1;
++      }
++      return 0;
++}
++EXPORT_SYMBOL_GPL(e820_any_mapped);
++
++/*
++ * Report whether the whole of <start,end> is covered by map entries of
++ * the requested type (a type of 0 accepts any entry).
++ *
++ * Coverage is tracked by moving start past each matching entry in table
++ * order, so the answer is only correct for a table that is sorted and
++ * free of overlaps.
++ *
++ * With CONFIG_XEN the machine map is consulted, and only in the initial
++ * domain; any other domain always gets 0.
++ */
++int __init e820_all_mapped(u64 start, u64 end, unsigned type)
++{
++      int idx;
++
++#ifndef CONFIG_XEN
++      for (idx = 0; idx < e820.nr_map; idx++) {
++              struct e820entry *ent = &e820.map[idx];
++#else
++      if (!is_initial_xendomain())
++              return 0;
++      for (idx = 0; idx < machine_e820.nr_map; ++idx) {
++              const struct e820entry *ent = &machine_e820.map[idx];
++#endif
++              u64 ent_end = ent->addr + ent->size;
++
++              /* wrong type, or no intersection with what is still open */
++              if ((type && ent->type != type) ||
++                  ent->addr >= end || ent_end <= start)
++                      continue;
++
++              /*
++               * the entry reaches back to (or before) start, so
++               * everything up to its end is accounted for
++               */
++              if (ent->addr <= start)
++                      start = ent_end;
++
++              /* nothing left uncovered: full coverage */
++              if (start >= end)
++                      return 1;
++      }
++      return 0;
++}
++
++/*
++ * Append one region to the given e820 map.  When the map has no free
++ * slot left, the region is dropped and an error is logged.
++ */
++static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
++                                       int type)
++{
++      int slot = e820x->nr_map;
++      struct e820entry *ent;
++
++      if (slot >= ARRAY_SIZE(e820x->map)) {
++              printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
++              return;
++      }
++
++      ent = &e820x->map[slot];
++      ent->addr = start;
++      ent->size = size;
++      ent->type = type;
++      e820x->nr_map++;
++}
++
++/* Append one region to the global e820 map; see __e820_add_region(). */
++void __init e820_add_region(u64 start, u64 size, int type)
++{
++      __e820_add_region(&e820, start, size, type);
++}
++
++/*
++ * Continue the current log line with a human-readable tag for an e820
++ * region type.  Types without a dedicated tag are printed numerically.
++ */
++static void __init e820_print_type(u32 type)
++{
++      if (type == E820_RAM || type == E820_RESERVED_KERN)
++              printk(KERN_CONT "(usable)");
++      else if (type == E820_RESERVED)
++              printk(KERN_CONT "(reserved)");
++      else if (type == E820_ACPI)
++              printk(KERN_CONT "(ACPI data)");
++      else if (type == E820_NVS)
++              printk(KERN_CONT "(ACPI NVS)");
++      else if (type == E820_UNUSABLE)
++              printk(KERN_CONT "(unusable)");
++      else
++              printk(KERN_CONT "type %u", type);
++}
++
++/*
++ * Log every entry of the given map, one line each: the "who" label, the
++ * start address, the end address (start + size) and the region type.
++ */
++static void __init _e820_print_map(const struct e820map *e820, const char *who)
++{
++      int idx;
++
++      for (idx = 0; idx < e820->nr_map; idx++) {
++              const struct e820entry *ent = &e820->map[idx];
++              unsigned long long first = ent->addr;
++              unsigned long long past = ent->addr + ent->size;
++
++              printk(KERN_INFO " %s: %016Lx - %016Lx ", who, first, past);
++              e820_print_type(ent->type);
++              printk(KERN_CONT "\n");
++      }
++}
++
++/*
++ * Sanitize the BIOS e820 map.
++ *
++ * Some e820 responses include overlapping entries. The following
++ * replaces the original e820 map with a new one, removing overlaps,
++ * and resolving conflicting memory types in favor of highest
++ * numbered type.
++ *
++ * The input parameter biosmap points to an array of 'struct
++ * e820entry' which on entry has elements in the range [0, *pnr_map)
++ * valid, and which has space for up to max_nr_map entries.
++ * On return, the resulting sanitized e820 map entries will have
++ * overwritten the originals in the same location, starting at biosmap.
++ *
++ * The integer pointed to by pnr_map must be valid on entry (the
++ * current number of valid entries located at biosmap) and will
++ * be updated on return, with the new number of valid entries
++ * (something no more than max_nr_map.)
++ *
++ * The return value from sanitize_e820_map() is zero if it
++ * successfully 'sanitized' the map entries passed in, and is -1
++ * if it did nothing, which can happen if either of (1) it was
++ * passed fewer than two map entries (except that with CONFIG_XEN
++ * exactly one entry is accepted as is and zero is returned), or
++ * (2) any of the input map entries
++ * were invalid (start + size < start, meaning that the size was
++ * so big the described memory range wrapped around through zero.)
++ *
++ *    Visually we're performing the following
++ *    (1,2,3,4 = memory types)...
++ *
++ *    Sample memory map (w/overlaps):
++ *       ____22__________________
++ *       ______________________4_
++ *       ____1111________________
++ *       _44_____________________
++ *       11111111________________
++ *       ____________________33__
++ *       ___________44___________
++ *       __________33333_________
++ *       ______________22________
++ *       ___________________2222_
++ *       _________111111111______
++ *       _____________________11_
++ *       _________________4______
++ *
++ *    Sanitized equivalent (no overlap):
++ *       1_______________________
++ *       _44_____________________
++ *       ___1____________________
++ *       ____22__________________
++ *       ______11________________
++ *       _________1______________
++ *       __________3_____________
++ *       ___________44___________
++ *       _____________33_________
++ *       _______________2________
++ *       ________________1_______
++ *       _________________4______
++ *       ___________________2____
++ *       ____________________33__
++ *       ______________________4_
++ */
++
++int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
++                           u32 *pnr_map)
++{
++      struct change_member {
++              struct e820entry *pbios; /* pointer to original bios entry */
++              unsigned long long addr; /* address for this change point */
++      };
++      static struct change_member change_point_list[2*E820_X_MAX] __initdata;
++      static struct change_member *change_point[2*E820_X_MAX] __initdata;
++      static struct e820entry *overlap_list[E820_X_MAX] __initdata;
++      static struct e820entry new_bios[E820_X_MAX] __initdata;
++      struct change_member *change_tmp;
++      unsigned long current_type, last_type;
++      unsigned long long last_addr;
++      int chgidx, still_changing;
++      int overlap_entries;
++      int new_bios_entry;
++      int old_nr, new_nr, chg_nr;
++      int i;
++
++      /*
++       * if there's only one memory region, don't bother: with
++       * CONFIG_XEN a single entry is left untouched and reported as
++       * success (0); otherwise fewer than two entries yields -1
++       */
++#ifdef CONFIG_XEN
++      if (*pnr_map == 1)
++              return 0;
++#endif
++      if (*pnr_map < 2)
++              return -1;
++
++      old_nr = *pnr_map;
++      BUG_ON(old_nr > max_nr_map);
++
++      /* bail out if we find any unreasonable addresses in bios map */
++      for (i = 0; i < old_nr; i++)
++              if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
++                      return -1;
++
++      /* create pointers for initial change-point information (for sorting) */
++      for (i = 0; i < 2 * old_nr; i++)
++              change_point[i] = &change_point_list[i];
++
++      /* record all known change-points (starting and ending addresses),
++         omitting those that are for empty memory regions */
++      chgidx = 0;
++      for (i = 0; i < old_nr; i++)    {
++              if (biosmap[i].size != 0) {
++                      change_point[chgidx]->addr = biosmap[i].addr;
++                      change_point[chgidx++]->pbios = &biosmap[i];
++                      change_point[chgidx]->addr = biosmap[i].addr +
++                              biosmap[i].size;
++                      change_point[chgidx++]->pbios = &biosmap[i];
++              }
++      }
++      chg_nr = chgidx;
++
++      /*
++       * sort change-point list by memory addresses (low -> high);
++       * adjacent elements are swapped until a full pass makes no swap
++       */
++      still_changing = 1;
++      while (still_changing)  {
++              still_changing = 0;
++              for (i = 1; i < chg_nr; i++)  {
++                      unsigned long long curaddr, lastaddr;
++                      unsigned long long curpbaddr, lastpbaddr;
++
++                      curaddr = change_point[i]->addr;
++                      lastaddr = change_point[i - 1]->addr;
++                      curpbaddr = change_point[i]->pbios->addr;
++                      lastpbaddr = change_point[i - 1]->pbios->addr;
++
++                      /*
++                       * swap entries, when:
++                       *
++                       * curaddr < lastaddr or
++                       * curaddr == lastaddr and curaddr == curpbaddr and
++                       * lastaddr != lastpbaddr
++                       *
++                       * i.e. at equal addresses the start point of one
++                       * region is ordered before the end point of
++                       * another
++                       */
++                      if (curaddr < lastaddr ||
++                          (curaddr == lastaddr && curaddr == curpbaddr &&
++                           lastaddr != lastpbaddr)) {
++                              change_tmp = change_point[i];
++                              change_point[i] = change_point[i-1];
++                              change_point[i-1] = change_tmp;
++                              still_changing = 1;
++                      }
++              }
++      }
++
++      /* create a new bios memory map, removing overlaps */
++      overlap_entries = 0;     /* number of entries in the overlap table */
++      new_bios_entry = 0;      /* index for creating new bios map entries */
++      last_type = 0;           /* start with undefined memory type */
++      last_addr = 0;           /* start with 0 as last starting address */
++
++      /* loop through change-points, determining effect on the new bios map */
++      for (chgidx = 0; chgidx < chg_nr; chgidx++) {
++              /*
++               * keep track of all overlapping bios entries; a
++               * change-point whose address equals its region's start
++               * opens that region, any other one closes it
++               */
++              if (change_point[chgidx]->addr ==
++                  change_point[chgidx]->pbios->addr) {
++                      /*
++                       * add map entry to overlap list (> 1 entry
++                       * implies an overlap)
++                       */
++                      overlap_list[overlap_entries++] =
++                              change_point[chgidx]->pbios;
++              } else {
++                      /*
++                       * remove entry from list (order independent,
++                       * so swap with last)
++                       */
++                      for (i = 0; i < overlap_entries; i++) {
++                              if (overlap_list[i] ==
++                                  change_point[chgidx]->pbios)
++                                      overlap_list[i] =
++                                              overlap_list[overlap_entries-1];
++                      }
++                      overlap_entries--;
++              }
++              /*
++               * if there are overlapping entries, decide which
++               * "type" to use (larger value takes precedence --
++               * 1=usable, 2,3,4,4+=unusable)
++               */
++              current_type = 0;
++              for (i = 0; i < overlap_entries; i++)
++                      if (overlap_list[i]->type > current_type)
++                              current_type = overlap_list[i]->type;
++              /*
++               * continue building up new bios map based on this
++               * information
++               */
++              if (current_type != last_type)  {
++                      if (last_type != 0)      {
++                              new_bios[new_bios_entry].size =
++                                      change_point[chgidx]->addr - last_addr;
++                              /*
++                               * move forward only if the new size
++                               * was non-zero
++                               */
++                              if (new_bios[new_bios_entry].size != 0)
++                                      /*
++                                       * no more space left for new
++                                       * bios entries ?
++                                       */
++                                      if (++new_bios_entry >= max_nr_map)
++                                              break;
++                      }
++                      if (current_type != 0)  {
++                              new_bios[new_bios_entry].addr =
++                                      change_point[chgidx]->addr;
++                              new_bios[new_bios_entry].type = current_type;
++                              last_addr = change_point[chgidx]->addr;
++                      }
++                      last_type = current_type;
++              }
++      }
++      /* retain count for new bios entries */
++      new_nr = new_bios_entry;
++
++      /* copy new bios mapping into original location */
++      memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
++      *pnr_map = new_nr;
++
++      return 0;
++}
++
++/*
++ * Feed nr_map entries, starting at biosmap, into the e820 map one by one.
++ * Stops with -1 at the first entry whose end address wraps around (the
++ * entries before it stay added); returns 0 once every entry was added.
++ */
++static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
++{
++      for (; nr_map; biosmap++, nr_map--) {
++              u64 base = biosmap->addr;
++              u64 len = biosmap->size;
++
++              /* Overflow in 64 bits? Ignore the memory map. */
++              if (base + len < base)
++                      return -1;
++
++              e820_add_region(base, len, biosmap->type);
++      }
++      return 0;
++}
++
++/*
++ * Copy the BIOS e820 map into a safe place, sanity-checking it on the
++ * way.
++ *
++ * A modern system's setup code hands over a memory map that can be used
++ * to set up memory properly; a failure return here tells the caller that
++ * no usable map was supplied.
++ *
++ * Without CONFIG_XEN a map of fewer than two entries is rejected with -1.
++ * With CONFIG_XEN at least one entry is required, and anything less is
++ * treated as a bug.
++ */
++static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
++{
++#ifdef CONFIG_XEN
++      BUG_ON(nr_map < 1);
++#else
++      /* Only one memory region (or negative)? Ignore it */
++      if (nr_map < 2)
++              return -1;
++#endif
++
++      return __append_e820_map(biosmap, nr_map);
++}
++
++/*
++ * Change to new_type whatever part of [start, start + size) is currently
++ * of old_type in *e820x, splitting entries that the range covers only
++ * partly.  Pieces created by a split are appended with
++ * __e820_add_region(), so the map may be left unsorted.
++ *
++ * size is clamped so that start + size cannot wrap.  Passing identical
++ * old_type and new_type is treated as a bug.  Returns the number of
++ * bytes whose type was actually changed.
++ */
++static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
++                                      u64 size, unsigned old_type,
++                                      unsigned new_type)
++{
++      u64 end;
++      unsigned int i;
++      u64 real_updated_size = 0;
++
++      BUG_ON(old_type == new_type);
++
++      if (size > (ULLONG_MAX - start))
++              size = ULLONG_MAX - start;
++
++      end = start + size;
++      printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
++                     (unsigned long long) start,
++                     (unsigned long long) end);
++      e820_print_type(old_type);
++      printk(KERN_CONT " ==> ");
++      e820_print_type(new_type);
++      printk(KERN_CONT "\n");
++
++      for (i = 0; i < e820x->nr_map; i++) {
++              struct e820entry *ei = &e820x->map[i];
++              u64 final_start, final_end;
++              u64 ei_end;
++
++              if (ei->type != old_type)
++                      continue;
++
++              ei_end = ei->addr + ei->size;
++              /* totally covered by new range? then just retype in place */
++              if (ei->addr >= start && ei_end <= end) {
++                      ei->type = new_type;
++                      real_updated_size += ei->size;
++                      continue;
++              }
++
++              /*
++               * new range is totally covered? split into the head
++               * (kept in this entry), the retyped middle and the tail
++               * (both appended)
++               */
++              if (ei->addr < start && ei_end > end) {
++                      __e820_add_region(e820x, start, size, new_type);
++                      __e820_add_region(e820x, end, ei_end - end, ei->type);
++                      ei->size = start - ei->addr;
++                      real_updated_size += size;
++                      continue;
++              }
++
++              /* partially covered (or not intersecting at all) */
++              final_start = max(start, ei->addr);
++              final_end = min(end, ei_end);
++              if (final_start >= final_end)
++                      continue;
++
++              __e820_add_region(e820x, final_start, final_end - final_start,
++                                new_type);
++
++              real_updated_size += final_end - final_start;
++
++              /*
++               * left range could be head or tail, so need to update
++               * size at first; the start address only moves when it
++               * is the head that was taken away
++               */
++              ei->size -= final_end - final_start;
++              if (ei->addr < final_start)
++                      continue;
++              ei->addr = final_end;
++      }
++      return real_updated_size;
++}
++
++/*
++ * Retype part of the global e820 map; see __e820_update_range() for the
++ * semantics and the return value.
++ */
++u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
++                           unsigned new_type)
++{
++      return __e820_update_range(&e820, start, size, old_type, new_type);
++}
++
++/*
++ * Retype part of the saved map; see __e820_update_range() for the
++ * semantics and the return value.
++ *
++ * With CONFIG_XEN, e820_saved is an alias for machine_e820.  In the
++ * initial domain start is first converted with phys_to_machine()
++ * (presumably because machine_e820 is expressed in machine addresses --
++ * confirm); in any other domain start is used unconverted.
++ */
++static u64 __init e820_update_range_saved(u64 start, u64 size,
++                                        unsigned old_type, unsigned new_type)
++{
++#ifdef CONFIG_XEN
++      if (is_initial_xendomain())
++              return __e820_update_range(&machine_e820,
++                                         phys_to_machine(start), size,
++                                         old_type, new_type);
++#endif
++      return __e820_update_range(&e820_saved, start, size, old_type,
++                                   new_type);
++}
++
++/*
++ * make e820 not cover the range
++ *
++ * Removes [start, start + size) from the global e820 map.  When
++ * checktype is non-zero only entries of old_type are affected, otherwise
++ * entries of every type are.  size is clamped so that start + size
++ * cannot wrap.  Entries lying wholly inside the range are zeroed in
++ * place rather than squeezed out of the array.  Returns the number of
++ * bytes removed.
++ */
++u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
++                           int checktype)
++{
++      int i;
++      u64 end;
++      u64 real_removed_size = 0;
++
++      if (size > (ULLONG_MAX - start))
++              size = ULLONG_MAX - start;
++
++      end = start + size;
++      printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
++                     (unsigned long long) start,
++                     (unsigned long long) end);
++      if (checktype)
++              e820_print_type(old_type);
++      printk(KERN_CONT "\n");
++
++      for (i = 0; i < e820.nr_map; i++) {
++              struct e820entry *ei = &e820.map[i];
++              u64 final_start, final_end;
++              u64 ei_end;
++
++              if (checktype && ei->type != old_type)
++                      continue;
++
++              ei_end = ei->addr + ei->size;
++              /* totally covered? then blank the entry */
++              if (ei->addr >= start && ei_end <= end) {
++                      real_removed_size += ei->size;
++                      memset(ei, 0, sizeof(struct e820entry));
++                      continue;
++              }
++
++              /*
++               * new range is totally covered? keep the head in this
++               * entry and append the tail as a new region
++               */
++              if (ei->addr < start && ei_end > end) {
++                      e820_add_region(end, ei_end - end, ei->type);
++                      ei->size = start - ei->addr;
++                      real_removed_size += size;
++                      continue;
++              }
++
++              /* partially covered (or not intersecting at all) */
++              final_start = max(start, ei->addr);
++              final_end = min(end, ei_end);
++              if (final_start >= final_end)
++                      continue;
++              real_removed_size += final_end - final_start;
++
++              /*
++               * left range could be head or tail, so need to update
++               * size at first; the start address only moves when it
++               * is the head that was cut off
++               */
++              ei->size -= final_end - final_start;
++              if (ei->addr < final_start)
++                      continue;
++              ei->addr = final_end;
++      }
++      return real_removed_size;
++}
++
++/*
++ * Re-sanitize the global e820 map in place.  Only when sanitizing
++ * reports success is the new entry count adopted and the map logged.
++ */
++void __init update_e820(void)
++{
++      u32 count = e820.nr_map;
++
++      if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &count) == 0) {
++              e820.nr_map = count;
++              printk(KERN_INFO "modified physical RAM map:\n");
++              _e820_print_map(&e820, "modified");
++      }
++}
++/*
++ * Re-sanitize the saved map in place, adopting the new entry count only
++ * when sanitizing reports success.  Nothing is logged.
++ */
++static void __init update_e820_saved(void)
++{
++      u32 count = e820_saved.nr_map;
++
++      if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map),
++                            &count) == 0)
++              e820_saved.nr_map = count;
++}
++
++#ifdef CONFIG_XEN
++#define e820 machine_e820 /* in effect until the #undef after e820_setup_gap() */
++#endif
++
++#define MAX_GAP_END 0x100000000ull
++/*
++ * Search for a gap in the e820 memory space from start_addr to end_addr.
++ *
++ * The map is walked from its last entry towards its first.  On entry
++ * *gapsize is the smallest gap worth reporting; whenever a gap of at
++ * least that size is found, *gapsize and *gapstart are overwritten with
++ * its size and start.  Both are left untouched if no such gap exists.
++ *
++ * An end_addr of 0, or one at or above MAX_GAP_END, limits the search to
++ * MAX_GAP_END.  On x86-64 only, a start_addr at or above MAX_GAP_END
++ * instead searches up to end_addr or, when that is 0, up to
++ * 1UL << boot_cpu_data.x86_phys_bits.
++ *
++ * Returns 1 if a gap was recorded, 0 otherwise.
++ */
++__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
++              unsigned long start_addr, unsigned long long end_addr)
++{
++      unsigned long long last;
++      int i = e820.nr_map;
++      int found = 0;
++
++      last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
++#ifdef CONFIG_X86_64
++      if (start_addr >= MAX_GAP_END)
++              last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
++#endif
++
++      while (--i >= 0) {
++              unsigned long long start = e820.map[i].addr;
++              unsigned long long end = start + e820.map[i].size;
++
++              if (end < start_addr)
++                      continue;
++
++              /*
++               * Since "last" is at most 4GB, we know we'll
++               * fit in 32 bits if this condition is true
++               *
++               * NOTE(review): that bound only holds for searches
++               * below MAX_GAP_END; the x86-64 branch above can set
++               * "last" higher, where unsigned long is presumably
++               * 64 bits wide -- confirm.
++               */
++              if (last > end) {
++                      unsigned long gap = last - end;
++
++                      if (gap >= *gapsize) {
++                              *gapsize = gap;
++                              *gapstart = end;
++                              found = 1;
++                      }
++              }
++              if (start < last)
++                      last = start;
++      }
++      return found;
++}
++
++/*
++ * Search for the biggest gap in the low 32 bits of the e820
++ * memory space.  We pass this space to PCI to assign MMIO resources
++ * for hotplug or unconfigured devices in.
++ * Hopefully the BIOS left enough space.
++ *
++ * The search starts from a default gap start of 0x10000000 and a
++ * minimum gap size of 0x400000; if nothing at least that large is
++ * found, the default start is what ends up in pci_mem_start.  On
++ * x86-64 a failed search is retried above MAX_GAP_END.
++ */
++__init void e820_setup_gap(void)
++{
++      unsigned long gapstart, gapsize;
++      int found;
++
++      gapstart = 0x10000000;
++      gapsize = 0x400000;
++      found  = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
++
++#ifdef CONFIG_X86_64
++      if (!found) {
++              printk(KERN_ERR
++      "PCI: Warning: Cannot find a gap in the 32bit address range\n"
++      "PCI: Unassigned devices with 32bit resource registers may break!\n");
++              found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
++              WARN_ON(!found);
++      }
++#endif
++
++      /*
++       * e820_reserve_resources_late() already protects stolen RAM
++       */
++      pci_mem_start = gapstart;
++
++      printk(KERN_INFO
++             "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
++             pci_mem_start, gapstart, gapsize);
++}
++
++#undef e820
++
++#ifndef CONFIG_XEN
++/**
++ * struct boot_params is limited in size, so boot_params.e820_map only
++ * carries the first 128 E820 memory entries.  Any further ones arrive in
++ * a SETUP_E820_EXT node of the struct setup_data linked list; this
++ * function appends such a node's entries to the e820 map, re-sanitizes
++ * the map and logs the result.
++ */
++void __init parse_e820_ext(struct setup_data *sdata)
++{
++      struct e820entry *extmap = (struct e820entry *)(sdata->data);
++      int entries = sdata->len / sizeof(*extmap);
++
++      __append_e820_map(extmap, entries);
++      sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
++
++      printk(KERN_INFO "extended physical RAM map:\n");
++      _e820_print_map(&e820, "extended");
++}
++
++#if defined(CONFIG_X86_64) || \
++      (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
++/**
++ * Find the ranges of physical addresses that do not correspond to
++ * e820 RAM areas and mark the corresponding pages as nosave for
++ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
++ *
++ * This function requires the e820 map to be sorted and without any
++ * overlapping entries and assumes the first e820 area to be RAM.
++ *
++ * Two kinds of range are registered: the hole between the end of the
++ * previous entry and the start of the current one, and the current entry
++ * itself when its type is neither E820_RAM nor E820_RESERVED_KERN.  The
++ * walk stops once an entry ends at or beyond limit_pfn.
++ */
++void __init e820_mark_nosave_regions(unsigned long limit_pfn)
++{
++      int i;
++      unsigned long pfn;
++
++      pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
++      for (i = 1; i < e820.nr_map; i++) {
++              struct e820entry *ei = &e820.map[i];
++
++              if (pfn < PFN_UP(ei->addr))
++                      register_nosave_region(pfn, PFN_UP(ei->addr));
++
++              pfn = PFN_DOWN(ei->addr + ei->size);
++              if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
++                      register_nosave_region(PFN_UP(ei->addr), pfn);
++
++              if (pfn >= limit_pfn)
++                      break;
++      }
++}
++#endif
++
++#ifdef CONFIG_HIBERNATION
++/**
++ * Register every E820_NVS region of the e820 map with
++ * suspend_nvs_register(), so that ACPI NVS memory can be saved and
++ * restored across hibernation and the subsequent resume.  Always
++ * returns 0.
++ */
++static int __init e820_mark_nvs_memory(void)
++{
++      int idx;
++
++      for (idx = 0; idx < e820.nr_map; idx++) {
++              struct e820entry *ent = &e820.map[idx];
++
++              if (ent->type != E820_NVS)
++                      continue;
++              suspend_nvs_register(ent->addr, ent->size);
++      }
++
++      return 0;
++}
++core_initcall(e820_mark_nvs_memory);
++#endif
++#endif
++
++/*
++ * Pre-allocate sizet bytes, aligned to align, from free memory at or
++ * above startt, and reserve them in memblock and in e820_saved (where
++ * the block is retyped from E820_RAM to E820_RESERVED).
++ *
++ * The block is placed at the top of the first range returned by
++ * memblock_x86_find_in_range_size() that is at least sizet bytes long,
++ * after that range has been clipped to MAXMEM on 32-bit and to
++ * xen_start_info->nr_pages with CONFIG_XEN.
++ *
++ * In the initial Xen domain, sizet is first rounded up to
++ * PAGE_SIZE << get_order(sizet), align is raised to at least PAGE_SIZE,
++ * and the block is handed to xen_create_contiguous_region() or
++ * early_create_contiguous_region() before it is reserved; which of the
++ * two, and through which virtual mapping, depends on where the block
++ * lies relative to max_pfn_mapped and the initial page-table mapping.
++ *
++ * Returns the address of the reserved block, or 0 on failure.
++ */
++u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
++{
++      u64 size = 0;
++      u64 addr;
++      u64 start;
++#ifdef CONFIG_XEN
++      unsigned int order = get_order(sizet);
++
++      if (is_initial_xendomain()) {
++              sizet = PAGE_SIZE << order;
++              if (align < PAGE_SIZE)
++                      align = PAGE_SIZE;
++      }
++#endif
++      for (start = startt; ; start += size) {
++              start = memblock_x86_find_in_range_size(start, &size, align);
++              if (start == MEMBLOCK_ERROR)
++                      return 0;
++              if (size >= sizet)
++                      break;
++      }
++
++#ifdef CONFIG_X86_32
++      if (start >= MAXMEM)
++              return 0;
++      if (start + size > MAXMEM)
++              size = MAXMEM - start;
++#endif
++#ifdef CONFIG_XEN
++      if ((start >> PAGE_SHIFT) >= xen_start_info->nr_pages)
++              return 0;
++      if (PFN_UP(start + size) > xen_start_info->nr_pages)
++              size = ((u64)xen_start_info->nr_pages << PAGE_SHIFT) - start;
++#endif
++
++      addr = round_down(start + size - sizet, align);
++      if (addr < start)
++              return 0;
++#ifdef CONFIG_XEN
++      if (is_initial_xendomain()) {
++              int rc;
++              unsigned long max_initmap_pfn;
++
++              max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base))
++                                             + xen_start_info->nr_pt_frames
++                                             + 1 + (1 << (19 - PAGE_SHIFT)),
++                                      1UL << (22 - PAGE_SHIFT));
++#ifdef CONFIG_X86_32
++              if ((addr >> PAGE_SHIFT)
++                  < max(max_initmap_pfn, max_pfn_mapped))
++                      rc = xen_create_contiguous_region((unsigned long)
++                                                        __va(addr),
++                                                        order, 32);
++#else
++              if ((addr >> PAGE_SHIFT) < max_pfn_mapped)
++                      rc = xen_create_contiguous_region((unsigned long)
++                                                        __va(addr),
++                                                        order, 32);
++              else if ((addr >> PAGE_SHIFT) < max_initmap_pfn)
++                      rc = xen_create_contiguous_region(__START_KERNEL_map
++                                                        + addr,
++                                                        order, 32);
++#endif
++              else
++                      rc = early_create_contiguous_region(addr >> PAGE_SHIFT,
++                                                          order, 32);
++              if (rc)
++                      return 0;
++      }
++#endif
++      memblock_x86_reserve_range(addr, addr + sizet, "new next");
++      e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
++      printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
++      update_e820_saved();
++
++      return addr;
++}
++
++/*
++ * Ceiling applied by e820_end_pfn() to the page frame number it returns.
++ * Every variant is fully parenthesized so that the macro can be used
++ * inside a larger expression without operator-precedence surprises.
++ */
++#ifdef CONFIG_X86_32
++# ifdef CONFIG_X86_PAE
++#  define MAX_ARCH_PFN                (1ULL<<(40-PAGE_SHIFT))
++# else
++#  define MAX_ARCH_PFN                (1ULL<<(32-PAGE_SHIFT))
++# endif
++#else /* CONFIG_X86_32 */
++# define MAX_ARCH_PFN (MAXMEM>>PAGE_SHIFT)
++#endif
++
++/*
++ * Find the highest page frame number available among the e820 entries of
++ * the given type.  Entries starting at or above limit_pfn are ignored;
++ * the first entry that crosses limit_pfn ends the search with limit_pfn
++ * as the answer.  The result is finally capped at MAX_ARCH_PFN and
++ * logged.
++ */
++static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
++{
++      unsigned long max_arch_pfn = MAX_ARCH_PFN;
++      unsigned long highest = 0;
++      int idx;
++
++      for (idx = 0; idx < e820.nr_map; idx++) {
++              struct e820entry *ent = &e820.map[idx];
++              unsigned long first_pfn;
++              unsigned long past_pfn;
++
++              if (ent->type != type)
++                      continue;
++
++              first_pfn = ent->addr >> PAGE_SHIFT;
++              if (first_pfn >= limit_pfn)
++                      continue;
++
++              past_pfn = (ent->addr + ent->size) >> PAGE_SHIFT;
++              if (past_pfn > limit_pfn) {
++                      /* the entry straddles the limit: nothing can beat it */
++                      highest = limit_pfn;
++                      break;
++              }
++              if (highest < past_pfn)
++                      highest = past_pfn;
++      }
++
++      if (highest > max_arch_pfn)
++              highest = max_arch_pfn;
++
++      printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
++                       highest, max_arch_pfn);
++      return highest;
++}
++/* Highest E820_RAM page frame number, limited only by MAX_ARCH_PFN. */
++unsigned long __init e820_end_of_ram_pfn(void)
++{
++      unsigned long top_pfn = e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
++
++      return top_pfn;
++}
++
++/* Highest E820_RAM page frame number below the 2^32-byte (4GB) boundary. */
++unsigned long __init e820_end_of_low_ram_pfn(void)
++{
++      const unsigned long four_gb_pfn = 1UL<<(32 - PAGE_SHIFT);
++
++      return e820_end_pfn(four_gb_pfn, E820_RAM);
++}
++
++/*
++ * Report a fatal early-boot error: print @msg on the early console and
++ * then panic with the same text.
++ *
++ * @msg is always passed as a "%s" argument, never as the format string
++ * itself, so a message containing '%' cannot be misinterpreted as
++ * conversion specifiers.
++ */
++static void early_panic(char *msg)
++{
++      early_printk("%s", msg);
++      panic("%s", msg);
++}
++
++static int userdef __initdata;
++
++/*
++ * Handle the "mem=" early boot parameter.
++ *
++ * "mem=nopentium" disables the 4MB page tables (non-Xen 32-bit builds only;
++ * other non-Xen builds warn and return -EINVAL).  Any other value is
++ * parsed with memparse() as a memory size: E820_RAM is trimmed with
++ * e820_remove_range(mem_size, ...), and if the map then ends below
++ * mem_size, the last entry is extended (or a new E820_RAM region is
++ * appended) so that the map reaches mem_size.
++ *
++ * Returns 0 on success, -EINVAL if @p is NULL or the size parses to 0.
++ */
++static int __init parse_memopt(char *p)
++{
++      u64 mem_size, current_end;
++      unsigned int i;
++
++      if (!p)
++              return -EINVAL;
++
++#ifndef CONFIG_XEN
++      if (!strcmp(p, "nopentium")) {
++#ifdef CONFIG_X86_32
++              setup_clear_cpu_cap(X86_FEATURE_PSE);
++              return 0;
++#else
++              printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
++              return -EINVAL;
++#endif
++      }
++#endif
++
++      /*
++       * Flag the map as user-modified so finish_e820_parsing() re-sanitizes
++       * it.  Note this is set before the size is validated below.
++       */
++      userdef = 1;
++      mem_size = memparse(p, &p);
++      /* don't remove all of memory when handling "mem={invalid}" param */
++      if (mem_size == 0)
++              return -EINVAL;
++#ifdef CONFIG_XEN
++      /*
++       * A little less than 2% of available memory are needed for page
++       * tables, p2m map, and mem_map. Hence the maximum amount of memory
++       * we can potentially balloon up to can in no case exceed about 50
++       * times of what we've been given initially. Since even with that we
++       * won't be able to boot (due to various calculations done based on
++       * the total number of pages) we further restrict this to factor 32.
++       */
++      if ((mem_size >> (PAGE_SHIFT + 5)) > xen_start_info->nr_pages) {
++              u64 size = (u64)xen_start_info->nr_pages << 5;
++
++              pr_warn("mem=%Luk is invalid for an initial"
++                      " allocation of %luk, using %Luk\n",
++                      (unsigned long long)mem_size >> 10,
++                      xen_start_info->nr_pages << (PAGE_SHIFT - 10),
++                      (unsigned long long)size << (PAGE_SHIFT - 10));
++              mem_size = size << PAGE_SHIFT;
++      }
++#endif
++      e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
++
++      /*
++       * NOTE(review): if e820.nr_map were 0 here, i would wrap around and
++       * index far outside e820.map[] -- confirm the map is never empty
++       * when "mem=" is parsed.
++       */
++      i = e820.nr_map - 1;
++      current_end = e820.map[i].addr + e820.map[i].size;
++      if (current_end < mem_size) {
++              /*
++               * The e820 map ends before our requested size so
++               * extend the final entry to the requested address.
++               */
++              if (e820.map[i].type == E820_RAM)
++                      e820.map[i].size = mem_size - e820.map[i].addr;
++              else
++                      e820_add_region(current_end, mem_size - current_end, E820_RAM);
++      }
++
++      return 0;
++}
++early_param("mem", parse_memopt);
++
++#ifndef CONFIG_XEN
++/*
++ * Handle the "memmap=" early boot parameter (non-Xen builds only).
++ *
++ * Accepted forms, as implemented below:
++ *   exactmap       - discard the current e820 map (nr_map = 0)
++ *   <size>@<start> - add an E820_RAM region
++ *   <size>#<start> - add an E820_ACPI region
++ *   <size>$<start> - add an E820_RESERVED region
++ *   <size>         - same e820_remove_range() call as "mem=<size>"
++ *
++ * Returns 0 on success; -EINVAL if @p is NULL, no size could be parsed,
++ * or unparsed characters remain after the value.
++ */
++static int __init parse_memmap_opt(char *p)
++{
++      char *oldp;
++      u64 start_at, mem_size;
++
++      if (!p)
++              return -EINVAL;
++
++      if (!strncmp(p, "exactmap", 8)) {
++#ifdef CONFIG_CRASH_DUMP
++              /*
++               * If we are doing a crash dump, we still need to know
++               * the real mem size before original memory map is
++               * reset.
++               */
++              saved_max_pfn = e820_end_of_ram_pfn();
++#endif
++              e820.nr_map = 0;
++              userdef = 1;
++              return 0;
++      }
++
++      /* memparse() leaves p unchanged when it consumed nothing */
++      oldp = p;
++      mem_size = memparse(p, &p);
++      if (p == oldp)
++              return -EINVAL;
++
++      userdef = 1;
++      if (*p == '@') {
++              start_at = memparse(p+1, &p);
++              e820_add_region(start_at, mem_size, E820_RAM);
++      } else if (*p == '#') {
++              start_at = memparse(p+1, &p);
++              e820_add_region(start_at, mem_size, E820_ACPI);
++      } else if (*p == '$') {
++              start_at = memparse(p+1, &p);
++              e820_add_region(start_at, mem_size, E820_RESERVED);
++      } else
++              e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
++
++      /* Anything left over after the value is a syntax error */
++      return *p == '\0' ? 0 : -EINVAL;
++}
++early_param("memmap", parse_memmap_opt);
++#endif
++
++/*
++ * If a boot parameter modified the e820 map (userdef is set), sanitize
++ * the map again, panic if that fails, and print the resulting map.
++ */
++void __init finish_e820_parsing(void)
++{
++      u32 count;
++
++      /* Nothing to do unless the user touched the map */
++      if (!userdef)
++              return;
++
++      count = e820.nr_map;
++      if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &count) < 0)
++              early_panic("Invalid user supplied memory map");
++      e820.nr_map = count;
++
++      printk(KERN_INFO "user-defined physical RAM map:\n");
++      _e820_print_map(&e820, "user");
++}
++
++/*
++ * Map an e820 region type to the name used for it in the resource tree
++ * and firmware map; unknown types are reported as "reserved".
++ */
++static inline const char *e820_type_to_string(int e820_type)
++{
++      if (e820_type == E820_RESERVED_KERN || e820_type == E820_RAM)
++              return "System RAM";
++      if (e820_type == E820_ACPI)
++              return "ACPI Tables";
++      if (e820_type == E820_NVS)
++              return "ACPI Non-volatile Storage";
++      if (e820_type == E820_UNUSABLE)
++              return "Unusable memory";
++
++      return "reserved";
++}
++
++#ifdef CONFIG_XEN
++#define e820 machine_e820
++#endif
++
++/*
++ * Mark e820 reserved areas as busy for the resource manager.
++ *
++ * One struct resource is allocated per map entry and remembered in
++ * e820_res so that e820_reserve_resources_late() can walk the same array.
++ * Under CONFIG_XEN, "e820" in this function is machine_e820 (see the
++ * #define just above).
++ */
++static struct resource __initdata *e820_res;
++void __init e820_reserve_resources(void)
++{
++      int i;
++      struct resource *res;
++      u64 end;
++
++      res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
++      e820_res = res;
++      for (i = 0; i < e820.nr_map; i++) {
++              end = e820.map[i].addr + e820.map[i].size - 1;
++              /*
++               * The end address does not fit in resource_size_t: leave this
++               * slot unfilled (presumably still zeroed by alloc_bootmem --
++               * TODO confirm) but keep res in step with i.
++               */
++              if (end != (resource_size_t)end) {
++                      res++;
++                      continue;
++              }
++              res->name = e820_type_to_string(e820.map[i].type);
++              res->start = e820.map[i].addr;
++              res->end = end;
++
++              res->flags = IORESOURCE_MEM;
++
++              /*
++               * don't register the region that could be conflicted with
++               * pci device BAR resource and insert them later in
++               * pcibios_resource_survey()
++               */
++              if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
++                      res->flags |= IORESOURCE_BUSY;
++                      insert_resource(&iomem_resource, res);
++              }
++              res++;
++      }
++
++      /* Export every entry of the saved map through the firmware map */
++      for (i = 0; i < e820_saved.nr_map; i++) {
++              struct e820entry *entry = &e820_saved.map[i];
++              firmware_map_add_early(entry->addr,
++                      entry->addr + entry->size - 1,
++                      e820_type_to_string(entry->type));
++      }
++}
++
++/*
++ * Padding granularity for the end of a RAM region, chosen by where the
++ * region ends: 64kB inside the first megabyte, 1MB inside the first 16MB,
++ * and 64MB for anything above that.
++ */
++static unsigned long ram_alignment(resource_size_t pos)
++{
++      const unsigned long megabytes = pos >> 20;
++
++      if (megabytes >= 16)
++              return 64*1024*1024;
++      if (megabytes >= 1)
++              return 1024*1024;
++
++      return 64*1024;
++}
++
++#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
++
++/*
++ * Second pass over the resources prepared by e820_reserve_resources():
++ * insert those that were filled in but not inserted earlier, then reserve
++ * a "RAM buffer" after each RAM region up to the boundary given by
++ * ram_alignment().
++ */
++void __init e820_reserve_resources_late(void)
++{
++      int i;
++      struct resource *res;
++
++      res = e820_res;
++      for (i = 0; i < e820.nr_map; i++) {
++              /* No parent yet and a non-zero end: filled in, not inserted */
++              if (!res->parent && res->end)
++                      insert_resource_expand_to_fit(&iomem_resource, res);
++              res++;
++      }
++
++      /*
++       * Try to bump up RAM regions to reasonable boundaries to
++       * avoid stolen RAM:
++       */
++      for (i = 0; i < e820.nr_map; i++) {
++              struct e820entry *entry = &e820.map[i];
++              u64 start, end;
++
++              if (entry->type != E820_RAM)
++                      continue;
++              /* [start, end] is the gap between the RAM end and the boundary */
++              start = entry->addr + entry->size;
++              end = round_up(start, ram_alignment(start)) - 1;
++              if (end > MAX_RESOURCE_SIZE)
++                      end = MAX_RESOURCE_SIZE;
++              /* RAM already ends on the boundary: nothing to pad */
++              if (start >= end)
++                      continue;
++              printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
++                             start, end);
++              reserve_region_with_split(&iomem_resource, start, end,
++                                        "RAM buffer");
++      }
++}
++
++#undef e820
++
++/*
++ * Build the initial e820 map from what the hypervisor reports.
++ *
++ * XENMEM_memory_map is queried first; if the hypervisor answers -ENOSYS a
++ * single RAM entry covering xen_start_info->nr_pages plus 8MB of slack is
++ * fabricated instead.  Any other failure is fatal (BUG).  The result is
++ * sanitized and appended to e820, total RAM is capped at 32 times the
++ * initial allocation, and for the initial domain the machine memory map
++ * is additionally fetched into machine_e820.
++ *
++ * Returns the string "Xen", which setup_memory_map() passes on to
++ * _e820_print_map().
++ */
++char *__init default_machine_specific_memory_setup(void)
++{
++      int rc, nr_map;
++      unsigned long long maxmem;
++      struct xen_memory_map memmap;
++      static struct e820entry __initdata map[E820MAX];
++
++      memmap.nr_entries = E820MAX;
++      set_xen_guest_handle(memmap.buffer, map);
++
++      rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
++      if (rc == -ENOSYS) {
++              memmap.nr_entries = 1;
++              map[0].addr = 0ULL;
++              map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
++              /* 8MB slack (to balance backend allocations). */
++              map[0].size += 8ULL << 20;
++              map[0].type = E820_RAM;
++              rc = 0;
++      }
++      BUG_ON(rc);
++
++      /*
++       * NOTE(review): finish_e820_parsing() passes a u32 * as the third
++       * argument of sanitize_e820_map(), while nr_map here is an int, and
++       * the return value is ignored -- confirm both are intended.
++       */
++      nr_map = memmap.nr_entries;
++      sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
++
++      if (append_e820_map(map, nr_map) < 0)
++              BUG();
++
++#ifdef CONFIG_XEN
++      /* See the comment in parse_memopt(). */
++      /* rc is reused as the loop index; maxmem sums all E820_RAM sizes */
++      for (maxmem = rc = 0; rc < e820.nr_map; ++rc)
++              if (e820.map[rc].type == E820_RAM)
++                      maxmem += e820.map[rc].size;
++      if ((maxmem >> (PAGE_SHIFT + 5)) > xen_start_info->nr_pages) {
++              unsigned long long size = (u64)xen_start_info->nr_pages << 5;
++
++              pr_warn("maxmem of %LuM is invalid for an initial"
++                      " allocation of %luM, using %LuM\n",
++                      maxmem >> 20,
++                      xen_start_info->nr_pages >> (20 - PAGE_SHIFT),
++                      size >> (20 - PAGE_SHIFT));
++              size <<= PAGE_SHIFT;
++              e820_remove_range(size, ULLONG_MAX - size, E820_RAM, 1);
++      }
++
++      if (is_initial_xendomain()) {
++              memmap.nr_entries = E820MAX;
++              set_xen_guest_handle(memmap.buffer, machine_e820.map);
++
++              if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
++                      BUG();
++              machine_e820.nr_map = memmap.nr_entries;
++      }
++#endif
++
++      return "Xen";
++}
++
++/*
++ * Run the platform memory_setup hook to populate e820 and print the
++ * resulting map.
++ *
++ * Under CONFIG_XEN the initial domain also prints machine_e820 and does
++ * NOT copy e820 into e820_saved here; every other configuration takes the
++ * memcpy() below (it is the else-branch when CONFIG_XEN is set and an
++ * unconditional statement otherwise).
++ */
++void __init setup_memory_map(void)
++{
++      char *who;
++
++      who = x86_init.resources.memory_setup();
++#ifdef CONFIG_XEN
++      if (is_initial_xendomain()) {
++              printk(KERN_INFO "Xen-provided machine memory map:\n");
++              _e820_print_map(&machine_e820, "BIOS");
++      } else
++#endif
++              memcpy(&e820_saved, &e820, sizeof(struct e820map));
++      printk(KERN_INFO "Xen-provided physical RAM map:\n");
++      _e820_print_map(&e820, who);
++}
++
++/*
++ * Register every E820_RAM and E820_RESERVED_KERN region of the e820 map
++ * with memblock, then let memblock analyze and dump the result.
++ */
++void __init memblock_x86_fill(void)
++{
++      int idx;
++
++      /*
++       * EFI may have more than 128 entries.
++       * It is safe to enable resizing, because memblock_x86_fill()
++       * is called rather late for x86.
++       */
++      memblock_can_resize = 1;
++
++      for (idx = 0; idx < e820.nr_map; idx++) {
++              struct e820entry *region = &e820.map[idx];
++              u64 region_end = region->addr + region->size;
++
++              /* Skip regions whose end does not fit in resource_size_t */
++              if (region_end != (resource_size_t)region_end)
++                      continue;
++
++              if (region->type == E820_RAM ||
++                  region->type == E820_RESERVED_KERN)
++                      memblock_add(region->addr, region->size);
++      }
++
++      memblock_analyze();
++      memblock_dump_all();
++}
++
++/*
++ * Tell the page allocator how many pages below MAX_DMA_PFN are already in
++ * use (total memory minus free memory in that range, in pages).
++ * Compiled to an empty function unless CONFIG_X86_64 is set and
++ * CONFIG_XEN is not.
++ */
++void __init memblock_find_dma_reserve(void)
++{
++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
++      u64 free_size_pfn;
++      u64 mem_size_pfn;
++      /*
++       * need to find out used area below MAX_DMA_PFN
++       * need to use memblock to get free size in [0, MAX_DMA_PFN]
++       * at first, and assume boot_mem will not take below MAX_DMA_PFN
++       */
++      mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
++      free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
++      set_dma_reserve(mem_size_pfn - free_size_pfn);
++#endif
++}
diff --cc arch/x86/kernel/early_printk-xen.c

index 0000000,0000000..e8b49af

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/early_printk-xen.c
@@@ -1,0 -1,0 +1,291 @@@
++#include <linux/console.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/string.h>
++#include <linux/screen_info.h>
++#include <linux/usb/ch9.h>
++#include <linux/pci_regs.h>
++#include <linux/pci_ids.h>
++#include <linux/errno.h>
++#include <asm/io.h>
++#include <asm/processor.h>
++#include <asm/fcntl.h>
++#include <asm/setup.h>
++#include <asm/pci-direct.h>
++#include <asm/fixmap.h>
++#include <asm/mrst.h>
++#include <asm/pgtable.h>
++#include <linux/usb/ehci_def.h>
++
++#ifndef CONFIG_XEN
++/* Simple VGA output */
++#define VGABASE               (__ISA_IO_base + 0xb8000)
++
++static int max_ypos = 25, max_xpos = 80;
++static int current_ypos = 25, current_xpos;
++
++/*
++ * Console write callback for the early VGA text console.
++ *
++ * Writes at most @n characters of @str into the text buffer at VGABASE,
++ * stopping early at a NUL byte.  When the cursor has run past the last
++ * line the screen is scrolled up by one line first.  '\n' moves to the
++ * start of the next line; '\r' is dropped (or, with CONFIG_KGDB_KDB,
++ * returns to column 0, and '\b' steps back one column).
++ *
++ * @con: unused.
++ */
++static void early_vga_write(struct console *con, const char *str, unsigned n)
++{
++      char c;
++      int  i, k, j;
++
++      /*
++       * Test the remaining count before dereferencing str, so that no
++       * byte beyond the n supplied by the caller is ever read (the buffer
++       * need not be NUL-terminated).
++       */
++      while (n-- > 0 && (c = *str++) != '\0') {
++              if (current_ypos >= max_ypos) {
++                      /* scroll 1 line up */
++                      for (k = 1, j = 0; k < max_ypos; k++, j++) {
++                              for (i = 0; i < max_xpos; i++) {
++                                      writew(readw(VGABASE+2*(max_xpos*k+i)),
++                                             VGABASE + 2*(max_xpos*j + i));
++                              }
++                      }
++                      /* blank the last line: space (0x20) with attribute 0x7 */
++                      for (i = 0; i < max_xpos; i++)
++                              writew(0x720, VGABASE + 2*(max_xpos*j + i));
++                      current_ypos = max_ypos-1;
++              }
++#ifdef CONFIG_KGDB_KDB
++              if (c == '\b') {
++                      if (current_xpos > 0)
++                              current_xpos--;
++              } else if (c == '\r') {
++                      current_xpos = 0;
++              } else
++#endif
++              if (c == '\n') {
++                      current_xpos = 0;
++                      current_ypos++;
++              } else if (c != '\r')  {
++                      writew(((0x7 << 8) | (unsigned short) c),
++                             VGABASE + 2*(max_xpos*current_ypos +
++                                              current_xpos++));
++                      /* wrap to the next line at the right-hand edge */
++                      if (current_xpos >= max_xpos) {
++                              current_xpos = 0;
++                              current_ypos++;
++                      }
++              }
++      }
++}
++
++/* Early console that writes to VGA text memory through early_vga_write(). */
++static struct console early_vga_console = {
++      .name =         "earlyvga",
++      .write =        early_vga_write,
++      .flags =        CON_PRINTBUFFER,
++      .index =        -1,
++};
++
++/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
++
++static int early_serial_base = 0x3f8;  /* ttyS0 */
++
++#define XMTRDY          0x20
++
++#define DLAB          0x80
++
++#define TXR             0       /*  Transmit register (WRITE) */
++#define RXR             0       /*  Receive register  (READ)  */
++#define IER             1       /*  Interrupt Enable          */
++#define IIR             2       /*  Interrupt ID              */
++#define FCR             2       /*  FIFO control              */
++#define LCR             3       /*  Line control              */
++#define MCR             4       /*  Modem control             */
++#define LSR             5       /*  Line Status               */
++#define MSR             6       /*  Modem Status              */
++#define DLL             0       /*  Divisor Latch Low         */
++#define DLH             1       /*  Divisor latch High        */
++
++/*
++ * Transmit one character on the early serial port.
++ *
++ * Polls the line status register until XMTRDY is set or 0xffff polls have
++ * elapsed, then writes the character regardless.  Returns 0 if the
++ * transmitter became ready in time, -1 if the poll budget ran out.
++ */
++static int early_serial_putc(unsigned char ch)
++{
++      unsigned spins_left = 0xffff;
++
++      for (;;) {
++              if (inb(early_serial_base + LSR) & XMTRDY)
++                      break;
++              if (--spins_left == 0)
++                      break;
++              cpu_relax();
++      }
++
++      outb(ch, early_serial_base + TXR);
++
++      if (spins_left == 0)
++              return -1;
++      return 0;
++}
++
++/*
++ * Console write callback for the early serial port.
++ *
++ * Sends at most @n characters of @s, stopping early at a NUL byte, and
++ * emits a '\r' before every '\n'.
++ *
++ * @con: unused.
++ */
++static void early_serial_write(struct console *con, const char *s, unsigned n)
++{
++      /*
++       * Check the remaining count before dereferencing s, so that s[n] is
++       * never read (the buffer need not be NUL-terminated).
++       */
++      while (n-- > 0 && *s) {
++              if (*s == '\n')
++                      early_serial_putc('\r');
++              early_serial_putc(*s);
++              s++;
++      }
++}
++
++#define DEFAULT_BAUD 9600
++
++/*
++ * Parse the serial part of "earlyprintk=" and program the UART.
++ *
++ * @s points just past "serial"/"ttyS".  After an optional leading comma it
++ * may name the port, either as an I/O base ("0x...") or as [ttyS]<n>
++ * where only 0 and 1 are accepted (anything else falls back to 0),
++ * optionally followed by ",<baud>".  Missing or invalid baud values fall
++ * back to DEFAULT_BAUD.
++ */
++static __init void early_serial_init(char *s)
++{
++      unsigned char c;
++      unsigned divisor;
++      unsigned baud = DEFAULT_BAUD;
++      char *e;
++
++      if (*s == ',')
++              ++s;
++
++      if (*s) {
++              unsigned port;
++              if (!strncmp(s, "0x", 2)) {
++                      early_serial_base = simple_strtoul(s, &e, 16);
++              } else {
++                      static const int __initconst bases[] = { 0x3f8, 0x2f8 };
++
++                      if (!strncmp(s, "ttyS", 4))
++                              s += 4;
++                      port = simple_strtoul(s, &e, 10);
++                      /* out of range or no digits at all: use the first port */
++                      if (port > 1 || s == e)
++                              port = 0;
++                      early_serial_base = bases[port];
++              }
++              /* skip to the field after the next comma, if any */
++              s += strcspn(s, ",");
++              if (*s == ',')
++                      s++;
++      }
++
++      outb(0x3, early_serial_base + LCR);     /* 8n1 */
++      outb(0, early_serial_base + IER);       /* no interrupt */
++      outb(0, early_serial_base + FCR);       /* no fifo */
++      outb(0x3, early_serial_base + MCR);     /* DTR + RTS */
++
++      if (*s) {
++              baud = simple_strtoul(s, &e, 0);
++              if (baud == 0 || s == e)
++                      baud = DEFAULT_BAUD;
++      }
++
++      /* Set DLAB to expose the divisor latches, load them, then clear it */
++      divisor = 115200 / baud;
++      c = inb(early_serial_base + LCR);
++      outb(c | DLAB, early_serial_base + LCR);
++      outb(divisor & 0xff, early_serial_base + DLL);
++      outb((divisor >> 8) & 0xff, early_serial_base + DLH);
++      outb(c & ~DLAB, early_serial_base + LCR);
++}
++
++#else /* CONFIG_XEN */
++
++/*
++ * Console write callback for Xen: hand the buffer to the hypervisor
++ * console with CONSOLEIO_write, advancing by however many bytes each call
++ * reports, until everything is written or a call returns <= 0.
++ */
++static void
++early_serial_write(struct console *con, const char *s, unsigned count)
++{
++      int written;
++
++      for (; count > 0; count -= written, s += written) {
++              written = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
++              if (written <= 0)
++                      return;
++      }
++}
++
++/* Xen variant: intentionally empty, the argument is ignored. */
++static __init void early_serial_init(char *s)
++{
++}
++
++/*
++ * No early VGA console on Xen, as we do not have convenient ISA-space
++ * mappings. Someone should fix this for domain 0. For now, use fake serial.
++ */
++#define early_vga_console early_serial_console
++
++#endif
++
++/*
++ * Early serial console; its write callback is whichever
++ * early_serial_write() variant was compiled in above.
++ */
++static struct console early_serial_console = {
++      .name =         "earlyser",
++      .write =        early_serial_write,
++      .flags =        CON_PRINTBUFFER,
++      .index =        -1,
++};
++
++/* Direct interface for emergencies */
++static struct console *early_console = &early_vga_console;
++static int __initdata early_console_initialized;
++
++/*
++ * printf-style output straight to the current early console.  The message
++ * is formatted into a 512-byte stack buffer and passed to the console's
++ * write callback.
++ */
++asmlinkage void early_printk(const char *fmt, ...)
++{
++      va_list args;
++      char text[512];
++      int len;
++
++      va_start(args, fmt);
++      len = vscnprintf(text, sizeof(text), fmt, args);
++      va_end(args);
++
++      early_console->write(early_console, text, len);
++}
++
++/*
++ * Make @con the active early console and register it.  Refused, with an
++ * error message, if the current early console's index is no longer -1.
++ * @keep_early selects whether CON_BOOT is cleared (non-zero) or set (zero)
++ * on the console before registration.
++ */
++static inline void early_console_register(struct console *con, int keep_early)
++{
++      if (early_console->index != -1) {
++              printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
++                     con->name);
++              return;
++      }
++
++      early_console = con;
++      early_console->flags = keep_early ? (early_console->flags & ~CON_BOOT)
++                                        : (early_console->flags | CON_BOOT);
++      register_console(early_console);
++}
++
++/*
++ * Handle the "earlyprintk=" boot parameter.
++ *
++ * Only the first call does any work.  The string is scanned one character
++ * at a time and a console is registered wherever a known keyword starts
++ * at the current position ("serial", "ttyS", "vga", plus "xen", "dbgp",
++ * "mrst" and "hsu" depending on configuration).  If "keep" appears
++ * anywhere in the string the console is registered without CON_BOOT.
++ * Always returns 0.
++ */
++static int __init setup_early_printk(char *buf)
++{
++      int keep;
++
++      if (!buf)
++              return 0;
++
++      if (early_console_initialized)
++              return 0;
++      early_console_initialized = 1;
++
++      keep = (strstr(buf, "keep") != NULL);
++
++      while (*buf != '\0') {
++              if (!strncmp(buf, "serial", 6)) {
++                      buf += 6;
++                      early_serial_init(buf);
++                      early_console_register(&early_serial_console, keep);
++                      /* step over ",ttyS" so the check below does not match it */
++                      if (!strncmp(buf, ",ttyS", 5))
++                              buf += 5;
++              }
++              if (!strncmp(buf, "ttyS", 4)) {
++                      early_serial_init(buf + 4);
++                      early_console_register(&early_serial_console, keep);
++              }
++              /*
++               * Both preprocessor branches below open the same block, which
++               * is closed after the common early_console_register() call.
++               * Under CONFIG_XEN early_vga_console is #defined to
++               * early_serial_console.
++               */
++#ifndef CONFIG_XEN
++              if (!strncmp(buf, "vga", 3) &&
++                  boot_params.screen_info.orig_video_isVGA == 1) {
++                      max_xpos = boot_params.screen_info.orig_video_cols;
++                      max_ypos = boot_params.screen_info.orig_video_lines;
++                      current_ypos = boot_params.screen_info.orig_y;
++#else
++              if (!strncmp(buf, "vga", 3) || !strncmp(buf, "xen", 3)) {
++#endif
++                      early_console_register(&early_vga_console, keep);
++              }
++#ifdef CONFIG_EARLY_PRINTK_DBGP
++              if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
++                      early_console_register(&early_dbgp_console, keep);
++#endif
++#ifdef CONFIG_HVC_XEN
++              if (!strncmp(buf, "xen", 3))
++                      early_console_register(&xenboot_console, keep);
++#endif
++#ifdef CONFIG_EARLY_PRINTK_MRST
++              if (!strncmp(buf, "mrst", 4)) {
++                      mrst_early_console_init();
++                      early_console_register(&early_mrst_console, keep);
++              }
++
++              if (!strncmp(buf, "hsu", 3)) {
++                      hsu_early_console_init();
++                      early_console_register(&early_hsu_console, keep);
++              }
++#endif
++              buf++;
++      }
++      return 0;
++}
++
++early_param("earlyprintk", setup_early_printk);
diff --cc arch/x86/kernel/entry_32-xen.S

index 0000000,0000000..7f433db

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/entry_32-xen.S
@@@ -1,0 -1,0 +1,1736 @@@
++/*
++ *
++ *  Copyright (C) 1991, 1992  Linus Torvalds
++ */
++
++/*
++ * entry.S contains the system-call and fault low-level handling routines.
++ * This also contains the timer-interrupt handler, as well as all interrupts
++ * and faults that can result in a task-switch.
++ *
++ * NOTE: This code handles signal-recognition, which happens every time
++ * after a timer-interrupt and after each system call.
++ *
++ * I changed all the .align's to 4 (16 byte alignment), as that's faster
++ * on a 486.
++ *
++ * Stack layout in 'syscall_exit':
++ *    ptrace needs to have all regs on the stack.
++ *    if the order here is changed, it needs to be
++ *    updated in fork.c:copy_process, signal.c:do_signal,
++ *    ptrace.c and ptrace.h
++ *
++ *     0(%esp) - %ebx
++ *     4(%esp) - %ecx
++ *     8(%esp) - %edx
++ *       C(%esp) - %esi
++ *    10(%esp) - %edi
++ *    14(%esp) - %ebp
++ *    18(%esp) - %eax
++ *    1C(%esp) - %ds
++ *    20(%esp) - %es
++ *    24(%esp) - %fs
++ *    28(%esp) - %gs          saved iff !CONFIG_X86_32_LAZY_GS
++ *    2C(%esp) - orig_eax
++ *    30(%esp) - %eip
++ *    34(%esp) - %cs
++ *    38(%esp) - %eflags
++ *    3C(%esp) - %oldesp
++ *    40(%esp) - %oldss
++ *
++ * "current" is in register %ebx during any slow entries.
++ */
++
++#include <linux/linkage.h>
++#include <asm/thread_info.h>
++#include <asm/irqflags.h>
++#include <asm/errno.h>
++#include <asm/segment.h>
++#include <asm/smp.h>
++#include <asm/page_types.h>
++#include <asm/percpu.h>
++#include <asm/dwarf2.h>
++#include <asm/processor-flags.h>
++#include <asm/ftrace.h>
++#include <asm/irq_vectors.h>
++#include <asm/cpufeature.h>
++#include <xen/interface/xen.h>
++
++/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
++#include <linux/elf-em.h>
++#define AUDIT_ARCH_I386               (EM_386|__AUDIT_ARCH_LE)
++#define __AUDIT_ARCH_LE          0x40000000
++
++#ifndef CONFIG_AUDITSYSCALL
++#define sysenter_audit        syscall_trace_entry
++#define sysexit_audit syscall_exit_work
++#endif
++
++      .section .entry.text, "ax"
++
++/*
++ * We use macros for low-level operations which need to be overridden
++ * for paravirtualization.  The following will never clobber any registers:
++ *   INTERRUPT_RETURN (aka. "iret")
++ *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
++ *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
++ *
++ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
++ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
++ * Allowing a register to be clobbered can shrink the paravirt replacement
++ * enough to patch inline, increasing performance.
++ */
++
++#define nr_syscalls ((syscall_table_size)/4)
++
++/* Pseudo-eflags. */
++NMI_MASK      = 0x80000000
++
++#ifdef CONFIG_PREEMPT
++#define preempt_stop(clobbers)        DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
++#else
++#define preempt_stop(clobbers)
++#define resume_kernel         restore_all
++#endif
++
++/*
++ * With CONFIG_TRACE_IRQFLAGS: invoke TRACE_IRQS_ON if the EFLAGS image
++ * saved at PT_EFLAGS(%esp) has the interrupt flag set.  Expands to nothing
++ * otherwise.
++ */
++.macro TRACE_IRQS_IRET
++#ifdef CONFIG_TRACE_IRQFLAGS
++      testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)     # interrupts off?
++      jz 1f
++      TRACE_IRQS_ON
++1:
++#endif
++.endm
++
++#ifdef CONFIG_VM86
++#define resume_userspace_sig  check_userspace
++#else
++#define resume_userspace_sig  resume_userspace
++#endif
++
++/*
++ * User gs save/restore
++ *
++ * %gs is used for userland TLS and kernel only uses it for stack
++ * canary which is required to be at %gs:20 by gcc.  Read the comment
++ * at the top of stackprotector.h for more info.
++ *
++ * Local labels 98 and 99 are used.
++ */
++#ifdef CONFIG_X86_32_LAZY_GS
++
++ /*
++  * Lazy-GS variants: %gs itself is never saved or loaded here.
++  * unfortunately push/pop can't be no-op -- PUSH_GS pushes a zero into
++  * the slot and POP_GS discards the slot (plus \pop extra bytes).
++  */
++.macro PUSH_GS
++      pushl_cfi $0
++.endm
++.macro POP_GS pop=0
++      addl $(4 + \pop), %esp
++      CFI_ADJUST_CFA_OFFSET -(4 + \pop)
++.endm
++.macro POP_GS_EX
++.endm
++
++ /* all the rest are no-op */
++.macro PTGS_TO_GS
++.endm
++.macro PTGS_TO_GS_EX
++.endm
++.macro GS_TO_REG reg
++.endm
++.macro REG_TO_PTGS reg
++.endm
++.macro SET_KERNEL_GS reg
++.endm
++
++#else /* CONFIG_X86_32_LAZY_GS */
++
++/* Non-lazy variants: %gs is really saved to and restored from the stack. */
++.macro PUSH_GS
++      pushl_cfi %gs
++      /*CFI_REL_OFFSET gs, 0*/
++.endm
++
++/* Pop %gs (local label 98), then drop \pop further bytes if non-zero. */
++.macro POP_GS pop=0
++98:   popl_cfi %gs
++      /*CFI_RESTORE gs*/
++  .if \pop <> 0
++      add $\pop, %esp
++      CFI_ADJUST_CFA_OFFSET -\pop
++  .endif
++.endm
++/*
++ * Exception-table entry for the pop at label 98: the fixup at 99 stores 0
++ * over the saved value and jumps back to 98.
++ */
++.macro POP_GS_EX
++.pushsection .fixup, "ax"
++99:   movl $0, (%esp)
++      jmp 98b
++.section __ex_table, "a"
++      .align 4
++      .long 98b, 99b
++.popsection
++.endm
++
++/* Load %gs from the PT_GS slot of the frame at %esp (local label 98). */
++.macro PTGS_TO_GS
++98:   mov PT_GS(%esp), %gs
++.endm
++/* Same fixup scheme as POP_GS_EX, for the load in PTGS_TO_GS. */
++.macro PTGS_TO_GS_EX
++.pushsection .fixup, "ax"
++99:   movl $0, PT_GS(%esp)
++      jmp 98b
++.section __ex_table, "a"
++      .align 4
++      .long 98b, 99b
++.popsection
++.endm
++
++/* Copy %gs into \reg. */
++.macro GS_TO_REG reg
++      movl %gs, \reg
++      /*CFI_REGISTER gs, \reg*/
++.endm
++/* Store \reg into the PT_GS slot of the frame at %esp. */
++.macro REG_TO_PTGS reg
++      movl \reg, PT_GS(%esp)
++      /*CFI_REL_OFFSET gs, PT_GS*/
++.endm
++/* Load %gs with __KERNEL_STACK_CANARY, clobbering \reg. */
++.macro SET_KERNEL_GS reg
++      movl $(__KERNEL_STACK_CANARY), \reg
++      movl \reg, %gs
++.endm
++
++#endif        /* CONFIG_X86_32_LAZY_GS */
++
++/*
++ * Push the register frame: %gs (via PUSH_GS), %fs, %es, %ds, %eax, %ebp,
++ * %edi, %esi, %edx, %ecx, %ebx -- in that order, so %ebx ends up at
++ * 0(%esp).  Afterwards %ds/%es are loaded with __USER_DS, %fs with
++ * __KERNEL_PERCPU and %gs via SET_KERNEL_GS; %edx is used as scratch for
++ * these loads (its original value has already been pushed).
++ */
++.macro SAVE_ALL
++      cld
++      PUSH_GS
++      pushl_cfi %fs
++      /*CFI_REL_OFFSET fs, 0;*/
++      pushl_cfi %es
++      /*CFI_REL_OFFSET es, 0;*/
++      pushl_cfi %ds
++      /*CFI_REL_OFFSET ds, 0;*/
++      pushl_cfi %eax
++      CFI_REL_OFFSET eax, 0
++      pushl_cfi %ebp
++      CFI_REL_OFFSET ebp, 0
++      pushl_cfi %edi
++      CFI_REL_OFFSET edi, 0
++      pushl_cfi %esi
++      CFI_REL_OFFSET esi, 0
++      pushl_cfi %edx
++      CFI_REL_OFFSET edx, 0
++      pushl_cfi %ecx
++      CFI_REL_OFFSET ecx, 0
++      pushl_cfi %ebx
++      CFI_REL_OFFSET ebx, 0
++      movl $(__USER_DS), %edx
++      movl %edx, %ds
++      movl %edx, %es
++      movl $(__KERNEL_PERCPU), %edx
++      movl %edx, %fs
++      SET_KERNEL_GS %edx
++.endm
++
++/*
++ * Pop the general-purpose registers in the reverse of the order SAVE_ALL
++ * pushed them: %ebx, %ecx, %edx, %esi, %edi, %ebp, %eax.
++ */
++.macro RESTORE_INT_REGS
++      popl_cfi %ebx
++      CFI_RESTORE ebx
++      popl_cfi %ecx
++      CFI_RESTORE ecx
++      popl_cfi %edx
++      CFI_RESTORE edx
++      popl_cfi %esi
++      CFI_RESTORE esi
++      popl_cfi %edi
++      CFI_RESTORE edi
++      popl_cfi %ebp
++      CFI_RESTORE ebp
++      popl_cfi %eax
++      CFI_RESTORE eax
++.endm
++
++/*
++ * Restore the general-purpose registers, then %ds, %es, %fs and %gs, and
++ * finally drop \pop further bytes from the stack (handled by POP_GS).
++ * Each segment pop (labels 1-3) has an exception-table entry whose fixup
++ * (labels 4-6) stores 0 over the saved selector and retries the pop.
++ */
++.macro RESTORE_REGS pop=0
++      RESTORE_INT_REGS
++1:    popl_cfi %ds
++      /*CFI_RESTORE ds;*/
++2:    popl_cfi %es
++      /*CFI_RESTORE es;*/
++3:    popl_cfi %fs
++      /*CFI_RESTORE fs;*/
++      POP_GS \pop
++.pushsection .fixup, "ax"
++4:    movl $0, (%esp)
++      jmp 1b
++5:    movl $0, (%esp)
++      jmp 2b
++6:    movl $0, (%esp)
++      jmp 3b
++.section __ex_table, "a"
++      .align 4
++      .long 1b, 4b
++      .long 2b, 5b
++      .long 3b, 6b
++.popsection
++      POP_GS_EX
++.endm
++
++/*
++ * Open a CFI signal frame with three words on the stack:
++ * CFA = %esp + 3*4, return %eip stored at CFA - 3*4.
++ */
++.macro RING0_INT_FRAME
++      CFI_STARTPROC simple
++      CFI_SIGNAL_FRAME
++      CFI_DEF_CFA esp, 3*4
++      /*CFI_OFFSET cs, -2*4;*/
++      CFI_OFFSET eip, -3*4
++.endm
++
++/*
++ * Like RING0_INT_FRAME but with one extra word on the stack:
++ * CFA = %esp + 4*4, return %eip stored at CFA - 3*4.
++ */
++.macro RING0_EC_FRAME
++      CFI_STARTPROC simple
++      CFI_SIGNAL_FRAME
++      CFI_DEF_CFA esp, 4*4
++      /*CFI_OFFSET cs, -2*4;*/
++      CFI_OFFSET eip, -3*4
++.endm
++
++/*
++ * Open a CFI signal frame describing a full saved register frame at %esp:
++ * CFA = %esp + (PT_OLDESP - PT_EBX), with %eip and the general-purpose
++ * registers located at their PT_* offsets relative to PT_OLDESP.
++ */
++.macro RING0_PTREGS_FRAME
++      CFI_STARTPROC simple
++      CFI_SIGNAL_FRAME
++      CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
++      /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
++      CFI_OFFSET eip, PT_EIP-PT_OLDESP
++      /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
++      /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
++      CFI_OFFSET eax, PT_EAX-PT_OLDESP
++      CFI_OFFSET ebp, PT_EBP-PT_OLDESP
++      CFI_OFFSET edi, PT_EDI-PT_OLDESP
++      CFI_OFFSET esi, PT_ESI-PT_OLDESP
++      CFI_OFFSET edx, PT_EDX-PT_OLDESP
++      CFI_OFFSET ecx, PT_ECX-PT_OLDESP
++      CFI_OFFSET ebx, PT_EBX-PT_OLDESP
++.endm
++
++/*
++ * ret_from_fork: call schedule_tail with %eax pushed on the stack (the
++ * push also preserves %eax across the call), load the thread_info pointer
++ * into %ebp, reset EFLAGS to 0x0202 and continue at syscall_exit.
++ */
++ENTRY(ret_from_fork)
++      CFI_STARTPROC
++      pushl_cfi %eax
++      call schedule_tail
++      GET_THREAD_INFO(%ebp)
++      popl_cfi %eax
++      pushl_cfi $0x0202               # Reset kernel eflags
++      popfl_cfi
++      jmp syscall_exit
++      CFI_ENDPROC
++END(ret_from_fork)
++
++/*
++ * Interrupt exit functions should be protected against kprobes
++ */
++      .pushsection .kprobes.text, "ax"
++/*
++ * Return to user mode is not as complex as all this looks,
++ * but we want the default path for a system call return to
++ * go as quickly as possible which is why some of this is
++ * less clear than it otherwise should be.
++ */
++
++      # userspace resumption stub bypassing syscall exit tracing
++      /*
++       * Common interrupt/exception return path.  check_userspace decides
++       * from the saved EFLAGS.VM bit and the RPL bits of the saved CS
++       * whether to take the kernel-return path (resume_kernel) or to fall
++       * through into resume_userspace.  All labels up to the CFI_ENDPROC
++       * below share the frame opened by RING0_PTREGS_FRAME.
++       */
++      ALIGN
++      RING0_PTREGS_FRAME
++ret_from_exception:
++      preempt_stop(CLBR_ANY)
++ret_from_intr:
++      GET_THREAD_INFO(%ebp)
++check_userspace:
++      movl PT_EFLAGS(%esp), %eax      # mix EFLAGS and CS
++      movb PT_CS(%esp), %al
++      andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
++      cmpl $USER_RPL, %eax
++      jb resume_kernel                # not returning to v8086 or userspace
++
++ENTRY(resume_userspace)
++      LOCKDEP_SYS_EXIT
++      DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
++                                      # setting need_resched or sigpending
++                                      # between sampling and the iret
++      TRACE_IRQS_OFF
++      movl TI_flags(%ebp), %ecx
++      andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
++                                      # int/exception return?
++      jne work_pending
++      jmp restore_all
++END(ret_from_exception)
++
++#ifdef CONFIG_PREEMPT
++      /*
++       * Kernel-mode return with preemption: call preempt_schedule_irq
++       * repeatedly while preempt_count is zero, need_resched is set and
++       * the interrupted context had interrupts enabled.  Without
++       * CONFIG_PREEMPT, resume_kernel is #defined to restore_all.
++       */
++ENTRY(resume_kernel)
++      DISABLE_INTERRUPTS(CLBR_ANY)
++      cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
++      jnz restore_all
++need_resched:
++      movl TI_flags(%ebp), %ecx       # need_resched set ?
++      testb $_TIF_NEED_RESCHED, %cl
++      jz restore_all
++      testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)    # interrupts off (exception path) ?
++      jz restore_all
++      call preempt_schedule_irq
++      jmp need_resched
++END(resume_kernel)
++#endif
++      CFI_ENDPROC
++/*
++ * End of kprobes section
++ */
++      .popsection
++
++/* SYSENTER_RETURN points to after the "sysenter" instruction in
++   the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
++
++      # sysenter call handler stub
++ENTRY(ia32_sysenter_target)
++      CFI_STARTPROC simple
++      CFI_SIGNAL_FRAME
++      CFI_DEF_CFA esp, 0
++      CFI_REGISTER esp, ebp
++      movl SYSENTER_stack_sp0(%esp),%esp
++sysenter_past_esp:
++      /*
++       * Interrupts are disabled here, but we can't trace it until
++       * enough kernel state to call TRACE_IRQS_OFF can be called - but
++       * we immediately enable interrupts at that point anyway.
++       */
++      pushl_cfi $__USER_DS
++      /*CFI_REL_OFFSET ss, 0*/
++      pushl_cfi %ebp
++      CFI_REL_OFFSET esp, 0
++      pushfl_cfi
++      orl $X86_EFLAGS_IF, (%esp)
++      pushl_cfi $__USER_CS
++      /*CFI_REL_OFFSET cs, 0*/
++      /*
++       * Push current_thread_info()->sysenter_return to the stack.
++       * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
++       * pushed above; +8 corresponds to copy_thread's esp0 setting.
++       */
++      pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
++      CFI_REL_OFFSET eip, 0
++
++      pushl_cfi %eax
++      SAVE_ALL
++      ENABLE_INTERRUPTS(CLBR_NONE)
++
++/*
++ * Load the potential sixth argument from user stack.
++ * Careful about security.
++ */
++      cmpl $__PAGE_OFFSET-3,%ebp
++      jae syscall_fault
++1:    movl (%ebp),%ebp
++      movl %ebp,PT_EBP(%esp)
++.section __ex_table,"a"
++      .align 4
++      .long 1b,syscall_fault
++.previous
++
++      GET_THREAD_INFO(%ebp)
++
++      testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
++      jnz sysenter_audit
++sysenter_do_call:
++      cmpl $(nr_syscalls), %eax
++      jae syscall_badsys
++      call *sys_call_table(,%eax,4)
++      movl %eax,PT_EAX(%esp)
++      LOCKDEP_SYS_EXIT
++      DISABLE_INTERRUPTS(CLBR_ANY)
++      TRACE_IRQS_OFF
++      movl TI_flags(%ebp), %ecx
++      testl $_TIF_ALLWORK_MASK, %ecx
++      jne sysexit_audit
++sysenter_exit:
++/* if something modifies registers it must also disable sysexit */
++      movl PT_EIP(%esp), %edx
++      movl PT_OLDESP(%esp), %ecx
++      xorl %ebp,%ebp
++#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
++      GET_VCPU_INFO
++#endif
++      TRACE_IRQS_ON
++1:    mov  PT_FS(%esp), %fs
++      PTGS_TO_GS
++      ENABLE_INTERRUPTS_SYSEXIT
++
++#ifdef CONFIG_AUDITSYSCALL
++sysenter_audit:
++      testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
++      jnz syscall_trace_entry
++      addl $4,%esp
++      CFI_ADJUST_CFA_OFFSET -4
++      /* %esi already in 8(%esp)         6th arg: 4th syscall arg */
++      /* %edx already in 4(%esp)         5th arg: 3rd syscall arg */
++      /* %ecx already in 0(%esp)         4th arg: 2nd syscall arg */
++      movl %ebx,%ecx                  /* 3rd arg: 1st syscall arg */
++      movl %eax,%edx                  /* 2nd arg: syscall number */
++      movl $AUDIT_ARCH_I386,%eax      /* 1st arg: audit arch */
++      call audit_syscall_entry
++      pushl_cfi %ebx
++      movl PT_EAX(%esp),%eax          /* reload syscall number */
++      jmp sysenter_do_call
++
++sysexit_audit:
++      testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
++      jne syscall_exit_work
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_ANY)
++      movl %eax,%edx          /* second arg, syscall return value */
++      cmpl $0,%eax            /* is it < 0? */
++      setl %al                /* 1 if so, 0 if not */
++      movzbl %al,%eax         /* zero-extend that */
++      inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
++      call audit_syscall_exit
++      DISABLE_INTERRUPTS(CLBR_ANY)
++      TRACE_IRQS_OFF
++      movl TI_flags(%ebp), %ecx
++      testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
++      jne syscall_exit_work
++      movl PT_EAX(%esp),%eax  /* reload syscall return value */
++      jmp sysenter_exit
++#endif
++
++      CFI_ENDPROC
++.pushsection .fixup,"ax"
++2:    movl $0,PT_FS(%esp)
++      jmp 1b
++.section __ex_table,"a"
++      .align 4
++      .long 1b,2b
++.popsection
++      PTGS_TO_GS_EX
++ENDPROC(ia32_sysenter_target)
++
++      # pv sysenter call handler stub
++ENTRY(ia32pv_sysenter_target)
++      RING0_INT_FRAME
++      movl $__USER_DS,16(%esp)        # SS slot
++      movl %ebp,12(%esp)              # ESP slot takes the value held in %ebp
++      movl $__USER_CS,4(%esp)         # CS slot
++      addl $4,%esp                    # drop the lowest word of the frame
++      CFI_ADJUST_CFA_OFFSET -4
++      /* Frame is SS:ESP,EFLAGS,CS:EIP; 4*4 covers the four words left after the addl. +8 is esp0 setting. */
++      pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
++/*
++ * Load the potential sixth argument from user stack.
++ * Careful about security.
++ */
++      cmpl $__PAGE_OFFSET-3,%ebp
++      jae syscall_fault               # NOTE(review): syscall_fault stores to PT_EAX(%esp) but no SAVE_ALL has run on this path - confirm
++1:    movl (%ebp),%ebp                # may fault: the __ex_table entry below routes that to syscall_fault
++.section __ex_table,"a"
++      .align 4
++      .long 1b,syscall_fault
++.previous
++      jmp system_call                 # system_call pushes orig_eax and runs SAVE_ALL
++      CFI_ENDPROC
++ENDPROC(ia32pv_sysenter_target)
++
++/*
++ * syscall stub including irq exit should be protected against kprobes
++ */
++      .pushsection .kprobes.text, "ax"
++      # system call handler stub
++ENTRY(system_call)
++      RING0_INT_FRAME                 # can't unwind into user space anyway
++      pushl_cfi %eax                  # save orig_eax
++      SAVE_ALL
++      GET_THREAD_INFO(%ebp)
++      testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
++      jnz syscall_trace_entry
++      cmpl $(nr_syscalls), %eax
++      jae syscall_badsys
++syscall_call:
++      call *sys_call_table(,%eax,4)
++      movl %eax,PT_EAX(%esp)          # store the return value
++syscall_exit:
++      LOCKDEP_SYS_EXIT
++      DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
++                                      # setting need_resched or sigpending
++                                      # between sampling and the iret
++      TRACE_IRQS_OFF
++      movl TI_flags(%ebp), %ecx
++      testl $_TIF_ALLWORK_MASK, %ecx  # current->work
++      jne syscall_exit_work
++
++restore_all:
++      TRACE_IRQS_IRET
++restore_all_notrace:
++#ifndef CONFIG_XEN
++      movl PT_EFLAGS(%esp), %eax      # mix EFLAGS, SS and CS
++      # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
++      # are returning to the kernel.
++      # See comments in process.c:copy_thread() for details.
++      movb PT_OLDSS(%esp), %ah
++      movb PT_CS(%esp), %al
++      andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
++      cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
++      CFI_REMEMBER_STATE
++      je ldt_ss                       # returning to user-space with LDT SS
++restore_nocheck:
++#else
++restore_nocheck:
++      movl PT_EFLAGS(%esp), %eax
++      testl $(X86_EFLAGS_VM|NMI_MASK), %eax
++      CFI_REMEMBER_STATE
++      jnz hypervisor_iret
++      shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
++      GET_VCPU_INFO
++      andb evtchn_upcall_mask(%esi),%al
++      andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
++      CFI_REMEMBER_STATE
++      jnz restore_all_enable_events   #        != 0 => enable event delivery
++#endif
++      RESTORE_REGS 4                  # skip orig_eax/error_code
++irq_return:
++      INTERRUPT_RETURN
++.section .fixup,"ax"
++ENTRY(iret_exc)
++      pushl $0                        # no error code
++      pushl $do_iret_error
++      jmp error_code
++.previous
++.section __ex_table,"a"
++      .align 4
++      .long irq_return,iret_exc
++.previous
++
++      CFI_RESTORE_STATE
++#ifndef CONFIG_XEN
++ldt_ss:
++      larl PT_OLDSS(%esp), %eax
++      jnz restore_nocheck
++      testl $0x00400000, %eax         # returning to 32bit stack?
++      jnz restore_nocheck             # allright, normal return
++
++#ifdef CONFIG_PARAVIRT
++      /*
++       * The kernel can't run on a non-flat stack if paravirt mode
++       * is active.  Rather than try to fixup the high bits of
++       * ESP, bypass this code entirely.  This may break DOSemu
++       * and/or Wine support in a paravirt VM, although the option
++       * is still available to implement the setting of the high
++       * 16-bits in the INTERRUPT_RETURN paravirt-op.
++       */
++      cmpl $0, pv_info+PARAVIRT_enabled
++      jne restore_nocheck
++#endif
++
++/*
++ * Setup and switch to ESPFIX stack
++ *
++ * We're returning to userspace with a 16 bit stack. The CPU will not
++ * restore the high word of ESP for us on executing iret... This is an
++ * "official" bug of all the x86-compatible CPUs, which we can work
++ * around to make dosemu and wine happy. We do this by preloading the
++ * high word of ESP with the high word of the userspace ESP while
++ * compensating for the offset by changing to the ESPFIX segment with
++ * a base address that matches for the difference.
++ */
++#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
++      mov %esp, %edx                  /* load kernel esp */
++      mov PT_OLDESP(%esp), %eax       /* load userspace esp */
++      mov %dx, %ax                    /* eax: new kernel esp */
++      sub %eax, %edx                  /* offset (low word is 0) */
++      shr $16, %edx
++      mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
++      mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
++      pushl_cfi $__ESPFIX_SS
++      pushl_cfi %eax                  /* new kernel esp */
++      /* Disable interrupts, but do not irqtrace this section: we
++       * will soon execute iret and the tracer was already set to
++       * the irqstate after the iret */
++      DISABLE_INTERRUPTS(CLBR_EAX)
++      lss (%esp), %esp                /* switch to espfix segment */
++      CFI_ADJUST_CFA_OFFSET -8
++      jmp restore_nocheck
++#else
++        ALIGN
++restore_all_enable_events:
++      TRACE_IRQS_ON
++      __ENABLE_INTERRUPTS
++scrit:        /**** START OF CRITICAL REGION ****/
++      __TEST_PENDING
++      jnz  14f                        # process more events if necessary...
++      RESTORE_REGS 4
++1:    INTERRUPT_RETURN
++.section __ex_table,"a"
++      .align 4
++      .long 1b,iret_exc
++.previous
++14:   __DISABLE_INTERRUPTS
++      TRACE_IRQS_OFF
++ecrit:  /**** END OF CRITICAL REGION ****/
++      jmp  .Ldo_upcall
++
++      CFI_RESTORE_STATE
++hypervisor_iret:
++      andl $~NMI_MASK, PT_EFLAGS(%esp)
++      RESTORE_REGS 4
++      jmp  hypercall_page + (__HYPERVISOR_iret * 32)
++#endif
++      CFI_ENDPROC
++ENDPROC(system_call)
++
++      # perform work that needs to be done immediately before resumption
++      ALIGN
++      RING0_PTREGS_FRAME              # can't unwind into user space anyway
++work_pending:
++      testb $_TIF_NEED_RESCHED, %cl
++      jz work_notifysig
++work_resched:
++      call schedule
++      LOCKDEP_SYS_EXIT
++      DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
++                                      # setting need_resched or sigpending
++                                      # between sampling and the iret
++      TRACE_IRQS_OFF
++      movl TI_flags(%ebp), %ecx
++      andl $_TIF_WORK_MASK, %ecx      # is there any work to be done other
++                                      # than syscall tracing?
++      jz restore_all
++      testb $_TIF_NEED_RESCHED, %cl
++      jnz work_resched
++
++work_notifysig:                               # deal with pending signals and
++                                      # notify-resume requests
++#ifdef CONFIG_VM86
++      testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
++      movl %esp, %eax
++      jne work_notifysig_v86          # returning to kernel-space or
++                                      # vm86-space
++      xorl %edx, %edx
++      call do_notify_resume
++      jmp resume_userspace_sig
++
++      ALIGN
++work_notifysig_v86:
++      pushl_cfi %ecx                  # save ti_flags for do_notify_resume
++      call save_v86_state             # %eax contains pt_regs pointer
++      popl_cfi %ecx
++      movl %eax, %esp
++#else
++      movl %esp, %eax
++#endif
++      xorl %edx, %edx
++      call do_notify_resume
++      jmp resume_userspace_sig
++END(work_pending)
++
++      # perform syscall entry tracing
++      ALIGN
++syscall_trace_entry:
++      movl $-ENOSYS,PT_EAX(%esp)      # preset the saved return value
++      movl %esp, %eax                 # argument: pointer to the saved registers
++      call syscall_trace_enter
++      /* What it returned is what we'll actually use.  */
++      cmpl $(nr_syscalls), %eax
++      jnae syscall_call               # below nr_syscalls: dispatch that number
++      jmp syscall_exit                # otherwise skip the call entirely
++END(syscall_trace_entry)
++
++      # perform syscall exit tracing
++      ALIGN
++syscall_exit_work:
++      testl $_TIF_WORK_SYSCALL_EXIT, %ecx     # %ecx: TI_flags loaded by the jumping code
++      jz work_pending                 # no exit-tracing flag set: handle the other work
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_ANY)     # could let syscall_trace_leave() call
++                                      # schedule() instead
++      movl %esp, %eax                 # argument: pointer to the saved registers
++      call syscall_trace_leave
++      jmp resume_userspace            # re-checks the work flags with interrupts disabled
++END(syscall_exit_work)
++      CFI_ENDPROC
++
++      RING0_INT_FRAME                 # can't unwind into user space anyway
++syscall_fault:
++      GET_THREAD_INFO(%ebp)           # resume_userspace reads TI_flags via %ebp
++      movl $-EFAULT,PT_EAX(%esp)      # saved return value becomes -EFAULT
++      jmp resume_userspace
++END(syscall_fault)
++
++syscall_badsys:
++      movl $-ENOSYS,PT_EAX(%esp)      # saved return value becomes -ENOSYS
++      jmp resume_userspace
++END(syscall_badsys)
++      CFI_ENDPROC
++/*
++ * End of kprobes section
++ */
++      .popsection
++
++/*
++ * System calls that need a pt_regs pointer.
++ */
++#define PTREGSCALL0(name) \
++      ALIGN; \
++ptregs_##name: \
++      leal 4(%esp),%eax; \
++      jmp sys_##name;
++
++#define PTREGSCALL1(name) \
++      ALIGN; \
++ptregs_##name: \
++      leal 4(%esp),%edx; \
++      movl (PT_EBX+4)(%esp),%eax; \
++      jmp sys_##name;
++
++#define PTREGSCALL2(name) \
++      ALIGN; \
++ptregs_##name: \
++      leal 4(%esp),%ecx; \
++      movl (PT_ECX+4)(%esp),%edx; \
++      movl (PT_EBX+4)(%esp),%eax; \
++      jmp sys_##name;
++
++#define PTREGSCALL3(name) \
++      ALIGN; \
++ptregs_##name: \
++      CFI_STARTPROC; \
++      leal 4(%esp),%eax; \
++      pushl_cfi %eax; \
++      movl PT_EDX(%eax),%ecx; \
++      movl PT_ECX(%eax),%edx; \
++      movl PT_EBX(%eax),%eax; \
++      call sys_##name; \
++      addl $4,%esp; \
++      CFI_ADJUST_CFA_OFFSET -4; \
++      ret; \
++      CFI_ENDPROC; \
++ENDPROC(ptregs_##name)
++
++PTREGSCALL1(iopl)
++PTREGSCALL0(fork)
++PTREGSCALL0(vfork)
++PTREGSCALL3(execve)
++PTREGSCALL2(sigaltstack)
++PTREGSCALL0(sigreturn)
++PTREGSCALL0(rt_sigreturn)
++PTREGSCALL2(vm86)
++PTREGSCALL1(vm86old)
++
++/* Clone is an oddball.  The 4th arg is in %edi, so it is passed on the stack */
++      ALIGN;
++ptregs_clone:
++      CFI_STARTPROC
++      leal 4(%esp),%eax               /* step over the return address: %eax = saved registers */
++      pushl_cfi %eax                  /* stack argument: pointer to the saved registers */
++      pushl_cfi PT_EDI(%eax)          /* stack argument: saved %edi (the 4th arg) */
++      movl PT_EDX(%eax),%ecx          /* saved %edx -> %ecx */
++      movl PT_ECX(%eax),%edx          /* saved %ecx -> %edx */
++      movl PT_EBX(%eax),%eax          /* saved %ebx -> %eax (done last, it overwrites the pointer) */
++      call sys_clone
++      addl $8,%esp                    /* discard the two stack arguments */
++      CFI_ADJUST_CFA_OFFSET -8
++      ret
++      CFI_ENDPROC
++ENDPROC(ptregs_clone)
++
++#ifndef CONFIG_XEN
++.macro FIXUP_ESPFIX_STACK
++/*
++ * Switch back from the ESPFIX stack to the normal zero-based stack
++ *
++ * We can't call C functions using the ESPFIX stack. This code reads
++ * the high word of the segment base from the GDT, switches to the
++ * normal stack and adjusts ESP with the matching offset.
++ */
++      /* fixup the stack */
++      mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
++      mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
++      shl $16, %eax                   /* move those bits up to 16..31, low word becomes 0 */
++      addl %esp, %eax                 /* the adjusted stack pointer */
++      pushl_cfi $__KERNEL_DS          /* new SS for the lss below */
++      pushl_cfi %eax                  /* new ESP for the lss below */
++      lss (%esp), %esp                /* switch to the normal stack segment */
++      CFI_ADJUST_CFA_OFFSET -8
++.endm
++.macro UNWIND_ESPFIX_STACK
++      movl %ss, %eax
++      /* see if on espfix stack */
++      cmpw $__ESPFIX_SS, %ax
++      jne 27f                         /* %ss is not __ESPFIX_SS: nothing to do */
++      movl $__KERNEL_DS, %eax         /* reload %ds and %es with __KERNEL_DS */
++      movl %eax, %ds
++      movl %eax, %es
++      /* switch to normal stack */
++      FIXUP_ESPFIX_STACK
++27:
++.endm
++
++/*
++ * Build the entry stubs and pointer table with some assembler magic.
++ * We pack 7 stubs into a single 32-byte chunk, which will fit in a
++ * single cache line on all modern x86 implementations.
++ */
++.section .init.rodata,"a"
++ENTRY(interrupt)
++.section .entry.text, "ax"
++      .p2align 5
++      .p2align CONFIG_X86_L1_CACHE_SHIFT
++ENTRY(irq_entries_start)
++      RING0_INT_FRAME
++vector=FIRST_EXTERNAL_VECTOR
++.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
++      .balign 32
++  .rept       7
++    .if vector < NR_VECTORS
++      .if vector <> FIRST_EXTERNAL_VECTOR
++      CFI_ADJUST_CFA_OFFSET -4
++      .endif
++1:    pushl_cfi $(~vector+0x80)       /* Note: always in signed byte range */
++      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
++      jmp 2f
++      .endif
++      .previous
++      .long 1b
++      .section .entry.text, "ax"
++vector=vector+1
++    .endif
++  .endr
++2:    jmp common_interrupt
++.endr
++END(irq_entries_start)
++
++.previous
++END(interrupt)
++.previous
++
++/*
++ * the CPU automatically disables interrupts when executing an IRQ vector,
++ * so IRQ-flags tracing has to follow that:
++ */
++      .p2align CONFIG_X86_L1_CACHE_SHIFT
++common_interrupt:
++      addl $-0x80,(%esp)      /* stubs pushed ~vector+0x80: adjust vector into the [-256,-1] range */
++      SAVE_ALL
++      TRACE_IRQS_OFF
++      movl %esp,%eax                  # argument: pointer to the saved registers
++      call do_IRQ
++      jmp ret_from_intr
++ENDPROC(common_interrupt)
++      CFI_ENDPROC
++
++/*
++ *  Irq entries should be protected against kprobes
++ */
++      .pushsection .kprobes.text, "ax"
++#define BUILD_INTERRUPT3(name, nr, fn)        \
++ENTRY(name)                           \
++      RING0_INT_FRAME;                \
++      pushl_cfi $~(nr);               \
++      SAVE_ALL;                       \
++      TRACE_IRQS_OFF                  \
++      movl %esp,%eax;                 \
++      call fn;                        \
++      jmp ret_from_intr;              \
++      CFI_ENDPROC;                    \
++ENDPROC(name)
++
++#define BUILD_INTERRUPT(name, nr)     BUILD_INTERRUPT3(name, nr, smp_##name)
++
++/* The include is where all of the SMP etc. interrupts come from */
++#include <asm/entry_arch.h>
++
++#else
++#define UNWIND_ESPFIX_STACK
++
++      .pushsection .kprobes.text, "ax"
++
++# A note on the "critical region" in our callback handler.
++# We want to avoid stacking callback handlers due to events occurring
++# during handling of the last event. To do this, we keep events disabled
++# until we've done all processing. HOWEVER, we must enable events before
++# popping the stack frame (can't be done atomically) and so it would still
++# be possible to get enough handler activations to overflow the stack.
++# Although unlikely, bugs of that kind are hard to track down, so we'd
++# like to avoid the possibility.
++# So, on entry to the handler we detect whether we interrupted an
++# existing activation in its critical region -- if so, we pop the current
++# activation and restart the handler using the previous one.
++#
++# The sysexit critical region is slightly different. sysexit
++# atomically removes the entire stack frame. If we interrupt in the
++# critical region we know that the entire frame is present and correct
++# so we can simply throw away the new one.
++ENTRY(hypervisor_callback)
++      RING0_INT_FRAME
++      pushl_cfi %eax
++      SAVE_ALL
++      movl PT_CS(%esp),%ecx
++      movl PT_EIP(%esp),%eax
++      andl $SEGMENT_RPL_MASK,%ecx
++      cmpl $USER_RPL,%ecx
++      jae  .Ldo_upcall
++      cmpl $scrit,%eax
++      jb   0f
++      cmpl $ecrit,%eax
++      jb   critical_region_fixup
++0:
++#ifdef CONFIG_XEN_SUPERVISOR_MODE_KERNEL
++      cmpl $sysexit_scrit,%eax
++      jb   .Ldo_upcall
++      cmpl $sysexit_ecrit,%eax
++      ja   .Ldo_upcall
++      addl $PT_OLDESP,%esp            # Remove eflags...ebx from stack frame.
++#endif
++.Ldo_upcall:
++      pushl_cfi %esp
++      call evtchn_do_upcall
++      add  $4,%esp
++      CFI_ADJUST_CFA_OFFSET -4
++      jmp  ret_from_intr
++      CFI_ENDPROC
++
++# [How we do the fixup]. We want to merge the current stack frame with the
++# just-interrupted frame. How we do this depends on where in the critical
++# region the interrupted handler was executing, and so how many saved
++# registers are in each frame. We do this quickly using the lookup table
++# 'critical_fixup_table'. For each byte offset in the critical region, it
++# provides the number of 4-byte slots which have already been popped from
++# the interrupted stack frame (a negative entry means nothing to copy).
++critical_region_fixup:
++      movsbl critical_fixup_table-scrit(%eax),%ecx # %ecx contains num slots popped
++      testl %ecx,%ecx                 # flags are consumed by the jle below (leal leaves them alone)
++      leal (%esp,%ecx,4),%esi         # %esi points at end of src region
++      leal PT_OLDESP(%esp),%edi       # %edi points at end of dst region
++      jle   17f                       # skip loop if nothing to copy
++16:   subl $4,%esi                    # pre-decrementing copy loop
++      subl $4,%edi
++      movl (%esi),%eax
++      movl %eax,(%edi)
++      loop 16b                        # one iteration per slot counted in %ecx
++17:   movl %edi,%esp                  # final %edi is top of merged stack
++      jmp  .Ldo_upcall
++
++.section .rodata,"a"
++critical_fixup_table:
++      .rept __SIZEOF_TEST_PENDING
++      .byte -1
++      .endr
++      .byte -1,-1                     # jnz  14f
++      .byte 0                         # pop  %ebx
++      .byte 1                         # pop  %ecx
++      .byte 2                         # pop  %edx
++      .byte 3                         # pop  %esi
++      .byte 4                         # pop  %edi
++      .byte 5                         # pop  %ebp
++      .byte 6                         # pop  %eax
++      .byte 7                         # pop  %ds
++      .byte 8                         # pop  %es
++      .byte 9,9                       # pop  %fs
++#ifndef CONFIG_X86_32_LAZY_GS
++      .byte 10,10                     # pop  %gs
++      .byte 11,11,11                  # add  $4,%esp
++#else
++      .byte 10,10,10                  # add  $8,%esp
++#endif
++      .byte 12                        # iret
++      .rept __SIZEOF_DISABLE_INTERRUPTS
++      .byte -1
++      .endr
++.previous
++
++# Hypervisor uses this for application faults while it executes.
++# We get here for two reasons:
++#  1. Fault while reloading DS, ES, FS or GS
++#  2. Fault while executing IRET
++# Category 1 we fix up by reattempting the load, and zeroing the segment
++# register if the load fails.
++# Category 2 we fix up by jumping to do_iret_error. We cannot use the
++# normal Linux return path in this case because if we use the IRET hypercall
++# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
++# We distinguish between categories by maintaining a status value in EAX.
++ENTRY(failsafe_callback)
++      pushl %eax
++      movl $1,%eax
++1:    mov 4(%esp),%ds
++2:    mov 8(%esp),%es
++3:    mov 12(%esp),%fs
++4:    mov 16(%esp),%gs
++      testl %eax,%eax
++      popl %eax
++      jz 5f
++      addl $16,%esp           # EAX != 0 => Category 2 (Bad IRET)
++      jmp iret_exc
++5:    addl $16,%esp           # EAX == 0 => Category 1 (Bad segment)
++      RING0_INT_FRAME
++      pushl $0
++      SAVE_ALL
++      jmp ret_from_exception
++.section .fixup,"ax";         \
++6:    xorl %eax,%eax;         \
++      movl %eax,4(%esp);      \
++      jmp 1b;                 \
++7:    xorl %eax,%eax;         \
++      movl %eax,8(%esp);      \
++      jmp 2b;                 \
++8:    xorl %eax,%eax;         \
++      movl %eax,12(%esp);     \
++      jmp 3b;                 \
++9:    xorl %eax,%eax;         \
++      movl %eax,16(%esp);     \
++      jmp 4b;                 \
++.previous;                    \
++.section __ex_table,"a";      \
++      .align 4;               \
++      .long 1b,6b;            \
++      .long 2b,7b;            \
++      .long 3b,8b;            \
++      .long 4b,9b;            \
++.previous
++#endif
++      CFI_ENDPROC
++
++ENTRY(coprocessor_error)
++      RING0_INT_FRAME
++      pushl_cfi $0
++      pushl_cfi $do_coprocessor_error
++      jmp error_code
++      CFI_ENDPROC
++END(coprocessor_error)
++
++ENTRY(simd_coprocessor_error)
++      RING0_INT_FRAME
++      pushl_cfi $0
++#ifdef CONFIG_X86_INVD_BUG
++      /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
++661:  pushl_cfi $do_general_protection
++662:
++.section .altinstructions,"a"
++      .balign 4
++      .long 661b
++      .long 663f
++      .word X86_FEATURE_XMM
++      .byte 662b-661b
++      .byte 664f-663f
++.previous
++.section .altinstr_replacement,"ax"
++663:  pushl $do_simd_coprocessor_error
++664:
++.previous
++#else
++      pushl_cfi $do_simd_coprocessor_error
++#endif
++      jmp error_code
++      CFI_ENDPROC
++END(simd_coprocessor_error)
++
++ENTRY(device_not_available)
++      RING0_INT_FRAME
++      pushl_cfi $-1                   # mark this as an int
++      pushl_cfi $do_device_not_available
++      jmp error_code
++      CFI_ENDPROC
++END(device_not_available)
++
++#ifdef CONFIG_PARAVIRT
++ENTRY(native_iret)
++      iret
++.section __ex_table,"a"
++      .align 4
++      .long native_iret, iret_exc
++.previous
++END(native_iret)
++
++ENTRY(native_irq_enable_sysexit)
++      sti
++      sysexit
++END(native_irq_enable_sysexit)
++#endif
++
++ENTRY(overflow)
++      RING0_INT_FRAME
++      pushl_cfi $0
++      pushl_cfi $do_overflow
++      jmp error_code
++      CFI_ENDPROC
++END(overflow)
++
++ENTRY(bounds)
++      RING0_INT_FRAME
++      pushl_cfi $0
++      pushl_cfi $do_bounds
++      jmp error_code
++      CFI_ENDPROC
++END(bounds)
++
++ENTRY(invalid_op)
++      RING0_INT_FRAME
++      pushl_cfi $0
++      pushl_cfi $do_invalid_op
++      jmp error_code
++      CFI_ENDPROC
++END(invalid_op)
++
++ENTRY(coprocessor_segment_overrun)
++      RING0_INT_FRAME
++      pushl_cfi $0
++      pushl_cfi $do_coprocessor_segment_overrun
++      jmp error_code
++      CFI_ENDPROC
++END(coprocessor_segment_overrun)
++
++ENTRY(invalid_TSS)
++      RING0_EC_FRAME
++      pushl_cfi $do_invalid_TSS
++      jmp error_code
++      CFI_ENDPROC
++END(invalid_TSS)
++
++ENTRY(segment_not_present)
++      RING0_EC_FRAME
++      pushl_cfi $do_segment_not_present
++      jmp error_code
++      CFI_ENDPROC
++END(segment_not_present)
++
++ENTRY(stack_segment)
++      RING0_EC_FRAME
++      pushl_cfi $do_stack_segment
++      jmp error_code
++      CFI_ENDPROC
++END(stack_segment)
++
++ENTRY(alignment_check)
++      RING0_EC_FRAME
++      pushl_cfi $do_alignment_check
++      jmp error_code
++      CFI_ENDPROC
++END(alignment_check)
++
++ENTRY(divide_error)
++      RING0_INT_FRAME
++      pushl_cfi $0                    # no error code
++      pushl_cfi $do_divide_error
++      jmp error_code
++      CFI_ENDPROC
++END(divide_error)
++
++#ifdef CONFIG_X86_MCE
++ENTRY(machine_check)
++      RING0_INT_FRAME
++      pushl_cfi $0
++      pushl_cfi machine_check_vector
++      jmp error_code
++      CFI_ENDPROC
++END(machine_check)
++#endif
++
++#ifndef CONFIG_XEN
++ENTRY(spurious_interrupt_bug)
++      RING0_INT_FRAME
++      pushl_cfi $0
++      pushl_cfi $do_spurious_interrupt_bug
++      jmp error_code
++      CFI_ENDPROC
++END(spurious_interrupt_bug)
++#endif /* !CONFIG_XEN */
++
++ENTRY(fixup_4gb_segment)
++      RING0_EC_FRAME
++      pushl_cfi $do_fixup_4gb_segment
++      jmp error_code
++      CFI_ENDPROC
++END(fixup_4gb_segment)
++/*
++ * End of kprobes section
++ */
++      .popsection
++
++#ifdef CONFIG_STACK_UNWIND
++ENTRY(arch_unwind_init_running)
++      CFI_STARTPROC
++      movl    4(%esp), %edx
++      movl    (%esp), %ecx
++      leal    4(%esp), %eax
++      movl    %ebx, PT_EBX(%edx)
++      xorl    %ebx, %ebx
++      movl    %ebx, PT_ECX(%edx)
++      movl    %ebx, PT_EDX(%edx)
++      movl    %esi, PT_ESI(%edx)
++      movl    %edi, PT_EDI(%edx)
++      movl    %ebp, PT_EBP(%edx)
++      movl    %ebx, PT_EAX(%edx)
++      movl    $__USER_DS, PT_DS(%edx)
++      movl    $__USER_DS, PT_ES(%edx)
++      movl    $__KERNEL_PERCPU, PT_FS(%edx)
++      movl    $__KERNEL_STACK_CANARY, PT_GS(%edx)
++      movl    %eax, PT_OLDESP(%edx)
++      movl    16(%esp), %eax
++      movl    %ebx, PT_ORIG_EAX(%edx)
++      movl    %ecx, PT_EIP(%edx)
++      movl    12(%esp), %ecx
++      movl    $__KERNEL_CS, PT_CS(%edx)
++      movl    %eax, 12(%esp)
++      movl    8(%esp), %eax
++      movl    %ecx, 8(%esp)
++      movl    %ebx, PT_EFLAGS(%edx)
++      movl    PT_EBX(%edx), %ebx
++      movl    $__KERNEL_DS, PT_OLDSS(%edx)
++      jmpl    *%eax
++      CFI_ENDPROC
++ENDPROC(arch_unwind_init_running)
++#endif
++
++ENTRY(kernel_thread_helper)           /* calls the function in %esi with %edi as argument, then do_exit */
++      pushl $0                # fake return address for unwinder
++      CFI_STARTPROC
++      movl %edi,%eax          # value in %edi is handed over in %eax
++      call *%esi              # call the function whose address is in %esi
++      call do_exit            # called with whatever the function left in %eax
++      ud2                     # padding for call trace
++      CFI_ENDPROC
++ENDPROC(kernel_thread_helper)
++
++#ifdef CONFIG_FUNCTION_TRACER
++#ifdef CONFIG_DYNAMIC_FTRACE
++
++ENTRY(mcount)
++      ret
++END(mcount)
++
++ENTRY(ftrace_caller)
++      cmpl $0, function_trace_stop
++      jne  ftrace_stub
++
++      pushl %eax
++      pushl %ecx
++      pushl %edx
++      movl 0xc(%esp), %eax
++      movl 0x4(%ebp), %edx
++      subl $MCOUNT_INSN_SIZE, %eax
++
++.globl ftrace_call
++ftrace_call:
++      call ftrace_stub
++
++      popl %edx
++      popl %ecx
++      popl %eax
++#ifdef CONFIG_FUNCTION_GRAPH_TRACER
++.globl ftrace_graph_call
++ftrace_graph_call:
++      jmp ftrace_stub
++#endif
++
++.globl ftrace_stub
++ftrace_stub:
++      ret
++END(ftrace_caller)
++
++#else /* ! CONFIG_DYNAMIC_FTRACE */
++
++ENTRY(mcount)
++      cmpl $0, function_trace_stop
++      jne  ftrace_stub
++
++      cmpl $ftrace_stub, ftrace_trace_function
++      jnz trace
++#ifdef CONFIG_FUNCTION_GRAPH_TRACER
++      cmpl $ftrace_stub, ftrace_graph_return
++      jnz ftrace_graph_caller
++
++      cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
++      jnz ftrace_graph_caller
++#endif
++.globl ftrace_stub
++ftrace_stub:
++      ret
++
++      /* taken from glibc */
++trace:
++      pushl %eax
++      pushl %ecx
++      pushl %edx
++      movl 0xc(%esp), %eax
++      movl 0x4(%ebp), %edx
++      subl $MCOUNT_INSN_SIZE, %eax
++
++      call *ftrace_trace_function
++
++      popl %edx
++      popl %ecx
++      popl %eax
++      jmp ftrace_stub
++END(mcount)
++#endif /* CONFIG_DYNAMIC_FTRACE */
++#endif /* CONFIG_FUNCTION_TRACER */
++
++#ifdef CONFIG_FUNCTION_GRAPH_TRACER
++ENTRY(ftrace_graph_caller)            /* sets up three register arguments and calls prepare_ftrace_return */
++      cmpl $0, function_trace_stop
++      jne ftrace_stub                 # function_trace_stop is non-zero: just return
++
++      pushl %eax                      # save %eax, %ecx and %edx around the call
++      pushl %ecx
++      pushl %edx
++      movl 0xc(%esp), %edx            # word just above the three saved registers
++      lea 0x4(%ebp), %eax             # address of the slot at 4(%ebp)
++      movl (%ebp), %ecx               # word stored at (%ebp)
++      subl $MCOUNT_INSN_SIZE, %edx    # back up by MCOUNT_INSN_SIZE
++      call prepare_ftrace_return
++      popl %edx
++      popl %ecx
++      popl %eax
++      ret
++END(ftrace_graph_caller)
++
++.globl return_to_handler
++return_to_handler:                    # asks ftrace_return_to_handler where to go, then jumps there
++      pushl %eax                      # preserve %eax and %edx across the call
++      pushl %edx
++      movl %ebp, %eax                 # pass %ebp in %eax
++      call ftrace_return_to_handler
++      movl %eax, %ecx                 # returned address -> %ecx, freeing %eax for the restore
++      popl %edx
++      popl %eax
++      jmp *%ecx                       # continue at the address returned above
++#endif
++
++#include <asm/alternative-asm.h>
++
++      # pv syscall call handler stub
++ENTRY(ia32pv_cstar_target)  /* syscall entry: sixth argument is fetched from the user stack, EBP is parked in ECX */
++      RING0_INT_FRAME
++      movl $__USER_DS,16(%esp)  /* overwrite the frame slot at 16(%esp) (presumably SS - confirm) */
++      movl %ebp,%ecx  /* park the incoming EBP in ECX; exit paths copy PT_ECX back into PT_EBP */
++      movl $__USER_CS,4(%esp)  /* overwrite the frame slot at 4(%esp) (presumably CS - confirm) */
++      movl 12(%esp),%ebp  /* frame slot at 12(%esp), used below as the user stack pointer */
++      pushl_cfi %eax                  # save orig_eax
++/*
++ * Load the potential sixth argument from user stack.
++ * Careful about security.
++ */
++      cmpl $__PAGE_OFFSET-4,%ebp  /* the 4-byte read must lie entirely below __PAGE_OFFSET */
++      CFI_REMEMBER_STATE
++      ja cstar_fault
++1:    movl (%ebp),%ebp  /* may fault; covered by the exception table entry below */
++.section __ex_table,"a"
++      .align 4
++      .long 1b,cstar_fault  /* a fault at label 1 resumes at cstar_fault */
++.previous
++      SAVE_ALL
++      GET_THREAD_INFO(%ebp)
++      testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
++      jnz cstar_trace_entry
++      cmpl $nr_syscalls,%eax  /* unsigned range check on the syscall number */
++      jae cstar_badsys
++.Lcstar_call:
++      btl %eax,cstar_special  /* is this syscall number flagged in the cstar_special bitmap? */
++      jc .Lcstar_special
++      call *cstar_call_table(,%eax,4)
++      movl %eax,PT_EAX(%esp)          # store the return value
++.Lcstar_exit:
++      movl PT_ECX(%esp),%ecx
++      movl %ecx,PT_EBP(%esp)          # put user EBP back in place
++      jmp syscall_exit
++.Lcstar_special:  /* flagged syscalls: restore EBP first, then use the regular syscall_call path */
++      movl PT_ECX(%esp),%ecx
++      movl %ecx,PT_EBP(%esp)          # put user EBP back in place
++      jmp syscall_call
++cstar_set_tif:  /* reached through cstar_call_table for the handlers redirected to this label */
++      movl $cstar_clear_tif,(%esp)    # replace return address
++      LOCK_PREFIX
++      orl $_TIF_CSTAR,TI_flags(%ebp)
++      jmp *sys_call_table(,%eax,4)  /* tail-jump to the real handler; it returns to cstar_clear_tif */
++cstar_clear_tif:
++      movl %eax,PT_EAX(%esp)          # store the return value
++      LOCK_PREFIX
++      andl $~_TIF_CSTAR,TI_flags(%ebp)
++      jmp .Lcstar_exit
++cstar_trace_entry:
++      movl $-ENOSYS,PT_EAX(%esp)  /* default result, kept if no syscall ends up being dispatched */
++      cmpl $nr_syscalls,%eax
++      jae 1f  /* out-of-range number: skip the bitmap lookup */
++      btl %eax,cstar_special
++      jc .Lcstar_trace_special
++1:    movl %esp,%eax  /* pt_regs pointer for syscall_trace_enter */
++      LOCK_PREFIX
++      orl $_TIF_CSTAR,TI_flags(%ebp)  /* _TIF_CSTAR is set only for the duration of the trace call */
++      call syscall_trace_enter
++      LOCK_PREFIX
++      andl $~_TIF_CSTAR,TI_flags(%ebp)
++      /* What it returned is what we'll actually use.  */
++      cmpl $nr_syscalls,%eax
++      jb .Lcstar_call
++      jmp .Lcstar_exit
++.Lcstar_trace_special:  /* flagged syscall under tracing: EBP is restored before the trace call */
++      movl PT_ECX(%esp),%ecx
++      movl %esp,%eax
++      movl %ecx,PT_EBP(%esp)          # put user EBP back in place
++      call syscall_trace_enter
++      /* What it returned is what we'll actually use.  */
++      cmpl $nr_syscalls,%eax
++      jb syscall_call
++      jmp syscall_exit
++cstar_badsys:
++      movl $-ENOSYS,PT_EAX(%esp)
++.Lcstar_resume:
++      movl PT_ECX(%esp),%ecx
++      movl %ecx,PT_EBP(%esp)          # put user EBP back in place
++      jmp resume_userspace
++      CFI_RESTORE_STATE
++cstar_fault:  /* bad user stack pointer, or the load at label 1 faulted */
++      movl $-EFAULT,%eax  /* set before SAVE_ALL so that -EFAULT lands in the saved EAX slot */
++      SAVE_ALL
++      GET_THREAD_INFO(%ebp)
++      jmp .Lcstar_resume
++      CFI_ENDPROC
++ENDPROC(ia32pv_cstar_target)
++
++ENTRY(cstar_ret_from_fork)  /* undo the cstar entry state (EBP parked in ECX, _TIF_CSTAR set), then run ret_from_fork */
++      CFI_STARTPROC
++      movl PT_ECX(%esp),%ecx  /* saved ECX slot holds the value to put back into PT_EBP */
++      GET_THREAD_INFO(%ebp)
++      movl %ecx,PT_EBP(%esp)          # put user EBP back in place
++      LOCK_PREFIX
++      andl $~_TIF_CSTAR,TI_flags(%ebp)  /* clear _TIF_CSTAR in this thread's flags */
++      jmp ret_from_fork  /* tail-jump: the generic fork return path finishes the job */
++      CFI_ENDPROC
++END(cstar_ret_from_fork)
++
++.section .rodata,"a"
++#include "syscall_table_32.S"
++
++syscall_table_size=(.-sys_call_table)
++
++#include <asm/unistd.h>
++cstar_special:  /* bitmap, one bit per syscall number, tested with btl in ia32pv_cstar_target */
++nr=0  /* syscall number currently being examined */
++mask=0  /* bits accumulated for the current 32-bit word */
++.rept nr_syscalls+31  /* +31 so that a final, partially used word is still emitted */
++ .irp n, __NR_sigreturn, __NR_rt_sigreturn  /* the syscall numbers whose bit gets set */
++  .if nr == \n
++   mask = mask | (1 << (\n & 31))  /* bit position within the current word */
++  .endif
++ .endr
++ nr = nr + 1
++ .if (nr & 31) == 0  /* 32 numbers processed: flush the word and start a new one */
++  .long mask
++  mask = 0
++ .endif
++.endr
++#define       sys_call_table cstar_call_table
++#define       sys_fork cstar_set_tif
++#define       sys_clone cstar_set_tif
++#define       sys_vfork cstar_set_tif
++#include "syscall_table_32.S"
++#undef        sys_call_table
++#undef        sys_fork
++#undef        sys_clone
++#undef        sys_vfork
++
++/*
++ * Some functions should be protected against kprobes
++ */
++      .pushsection .kprobes.text, "ax"
++
++ENTRY(page_fault)  /* pushes its C handler and falls into the shared error_code path */
++      RING0_EC_FRAME
++      pushl_cfi $do_page_fault  /* handler address; read back below through PT_GS */
++      ALIGN
++error_code:  /* common tail: other stubs push their handler address and jump here */
++      /* the function address is in %gs's slot on the stack */
++      pushl_cfi %fs
++      /*CFI_REL_OFFSET fs, 0*/
++      pushl_cfi %es
++      /*CFI_REL_OFFSET es, 0*/
++      pushl_cfi %ds
++      /*CFI_REL_OFFSET ds, 0*/
++      pushl_cfi %eax
++      CFI_REL_OFFSET eax, 0
++      pushl_cfi %ebp
++      CFI_REL_OFFSET ebp, 0
++      pushl_cfi %edi
++      CFI_REL_OFFSET edi, 0
++      pushl_cfi %esi
++      CFI_REL_OFFSET esi, 0
++      pushl_cfi %edx
++      CFI_REL_OFFSET edx, 0
++      pushl_cfi %ecx
++      CFI_REL_OFFSET ecx, 0
++      pushl_cfi %ebx
++      CFI_REL_OFFSET ebx, 0
++      cld
++      movl $(__KERNEL_PERCPU), %ecx
++      movl %ecx, %fs  /* load %fs with the __KERNEL_PERCPU selector */
++      UNWIND_ESPFIX_STACK
++      GS_TO_REG %ecx
++      movl PT_GS(%esp), %edi          # get the function address
++      movl PT_ORIG_EAX(%esp), %edx    # get the error code
++      movl $-1, PT_ORIG_EAX(%esp)     # no syscall to restart
++      REG_TO_PTGS %ecx  /* NOTE(review): presumably stores the %gs value taken by GS_TO_REG into the PT_GS slot - confirm */
++      SET_KERNEL_GS %ecx
++      movl $(__USER_DS), %ecx
++      movl %ecx, %ds  /* %ds and %es are both loaded with __USER_DS */
++      movl %ecx, %es
++      TRACE_IRQS_OFF
++      movl %esp,%eax                  # pt_regs pointer
++      call *%edi  /* handler(eax = pt_regs pointer, edx = error code) */
++      jmp ret_from_exception
++      CFI_ENDPROC
++END(page_fault)
++
++#ifndef CONFIG_XEN
++/*
++ * Debug traps and NMI can happen at the one SYSENTER instruction
++ * that sets up the real kernel stack. Check here, since we can't
++ * allow the wrong stack to be used.
++ *
++ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
++ * already pushed 3 words if it hits on the sysenter instruction:
++ * eflags, cs and eip.
++ *
++ * We just load the right stack, and push the three (known) values
++ * by hand onto the new stack - while updating the return eip past
++ * the instruction that would have done it for sysenter.
++ */
++.macro FIX_STACK offset ok label  /* offset: bytes already pushed; ok: branch target if no fix needed; label: name given to the fix-up insn */
++      cmpw $__KERNEL_CS, 4(%esp)  /* saved CS is not the kernel CS: nothing to fix */
++      jne \ok
++\label:
++      movl TSS_sysenter_sp0 + \offset(%esp), %esp  /* switch to the real kernel stack */
++      CFI_DEF_CFA esp, 0
++      CFI_UNDEFINED eip
++      pushfl_cfi  /* rebuild the three-word frame by hand: eflags ... */
++      pushl_cfi $__KERNEL_CS  /* ... cs ... */
++      pushl_cfi $sysenter_past_esp  /* ... and eip, pointing past the stack-loading instruction */
++      CFI_REL_OFFSET eip, 0
++.endm
++#endif /* CONFIG_XEN */
++
++ENTRY(debug)  /* debug exception entry: builds pt_regs and calls do_debug */
++      RING0_INT_FRAME
++#ifndef CONFIG_XEN
++      cmpl $ia32_sysenter_target,(%esp)  /* saved eip == ia32_sysenter_target: the stack may still be the wrong one */
++      jne debug_stack_correct
++      FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn  /* 12 = the three words already pushed */
++debug_stack_correct:
++#endif /* !CONFIG_XEN */
++      pushl_cfi $-1                   # mark this as an int
++      SAVE_ALL
++      TRACE_IRQS_OFF
++      xorl %edx,%edx                  # error code 0
++      movl %esp,%eax                  # pt_regs pointer
++      call do_debug
++      jmp ret_from_exception
++      CFI_ENDPROC
++END(debug)
++
++/*
++ * NMI is doubly nasty. It can happen _while_ we're handling
++ * a debug fault, and the debug fault hasn't yet been able to
++ * clear up the stack. So we first check whether we got an
++ * NMI on the sysenter entry path, but after that we need to
++ * check whether we got an NMI on the debug path where the debug
++ * fault happened on the sysenter path.
++ */
++ENTRY(nmi)  /* NMI entry; the native build first checks for the espfix and sysenter stack corner cases */
++      RING0_INT_FRAME
++      pushl_cfi %eax  /* native: scratch save (popped again below); Xen: left in place under SAVE_ALL */
++#ifndef CONFIG_XEN
++      movl %ss, %eax
++      cmpw $__ESPFIX_SS, %ax  /* running on the espfix stack segment? */
++      popl_cfi %eax
++      je nmi_espfix_stack
++      cmpl $ia32_sysenter_target,(%esp)  /* NMI hit exactly at ia32_sysenter_target? */
++      je nmi_stack_fixup
++      pushl_cfi %eax
++      movl %esp,%eax
++      /* Do not access memory above the end of our stack page,
++       * it might not exist.
++       */
++      andl $(THREAD_SIZE-1),%eax  /* offset of %esp within the thread stack area */
++      cmpl $(THREAD_SIZE-20),%eax  /* too close to the top: 12(%esp) below must not be read */
++      popl_cfi %eax
++      jae nmi_stack_correct
++      cmpl $ia32_sysenter_target,12(%esp)  /* is there a second frame, from a debug trap taken at sysenter? */
++      je nmi_debug_stack_check
++nmi_stack_correct:
++      /* We have a RING0_INT_FRAME here */
++      pushl_cfi %eax
++      SAVE_ALL
++      xorl %edx,%edx          # zero error code
++      movl %esp,%eax          # pt_regs pointer
++      call do_nmi
++      jmp restore_all_notrace
++      CFI_ENDPROC
++
++nmi_stack_fixup:
++      RING0_INT_FRAME
++      FIX_STACK 12, nmi_stack_correct, 1
++      jmp nmi_stack_correct
++
++nmi_debug_stack_check:
++      /* We have a RING0_INT_FRAME here */
++      cmpw $__KERNEL_CS,16(%esp)  /* CS slot of the frame 12 bytes further up */
++      jne nmi_stack_correct
++      cmpl $debug,(%esp)  /* interrupted eip must lie within [debug, debug_esp_fix_insn] */
++      jb nmi_stack_correct
++      cmpl $debug_esp_fix_insn,(%esp)
++      ja nmi_stack_correct
++      FIX_STACK 24, nmi_stack_correct, 1  /* 24 = two three-word frames already pushed */
++      jmp nmi_stack_correct
++
++nmi_espfix_stack:
++      /* We have a RING0_INT_FRAME here.
++       *
++       * create the pointer to lss back
++       */
++      pushl_cfi %ss
++      pushl_cfi %esp
++      addl $4, (%esp)  /* adjust the just-pushed esp value by 4 */
++      /* copy the iret frame of 12 bytes */
++      .rept 3
++      pushl_cfi 16(%esp)
++      .endr
++      pushl_cfi %eax
++      SAVE_ALL
++      FIXUP_ESPFIX_STACK              # %eax == %esp
++      xorl %edx,%edx                  # zero error code
++      call do_nmi
++      RESTORE_REGS
++      lss 12+4(%esp), %esp            # back to espfix stack
++      CFI_ADJUST_CFA_OFFSET -24
++      jmp irq_return
++#else
++      SAVE_ALL
++      xorl %edx,%edx          # zero error code
++      movl %esp,%eax          # pt_regs pointer
++      call do_nmi
++      orl  $NMI_MASK, PT_EFLAGS(%esp)  /* set NMI_MASK in the saved eflags before returning */
++      jmp restore_all
++#endif
++      CFI_ENDPROC
++END(nmi)
++
++ENTRY(int3)  /* breakpoint entry: builds pt_regs and calls do_int3 with error code 0 */
++      RING0_INT_FRAME
++      pushl_cfi $-1                   # mark this as an int
++      SAVE_ALL
++      TRACE_IRQS_OFF
++      xorl %edx,%edx          # zero error code
++      movl %esp,%eax          # pt_regs pointer
++      call do_int3
++      jmp ret_from_exception
++      CFI_ENDPROC
++END(int3)
++
++ENTRY(general_protection)  /* stub: select do_general_protection and reuse the common error_code path */
++      RING0_EC_FRAME
++      pushl_cfi $do_general_protection  /* handler address, fetched by error_code via PT_GS */
++      jmp error_code
++      CFI_ENDPROC
++END(general_protection)
++
++#ifdef CONFIG_KVM_GUEST
++ENTRY(async_page_fault)  /* stub: select do_async_page_fault and reuse the common error_code path */
++      RING0_EC_FRAME
++      pushl_cfi $do_async_page_fault  /* handler address, fetched by error_code via PT_GS */
++      jmp error_code
++      CFI_ENDPROC
++END(async_page_fault)
++#endif
++
++/*
++ * End of kprobes section
++ */
++      .popsection
diff --cc arch/x86/kernel/entry_32.S

index 7a494dd,5c1a919..dccfd9c
--- 1/arch/x86/kernel/entry_32.S
--- 2/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@@ -377,7 -377,7 +377,7 @@@ ENTRY(ia32_sysenter_target
         CFI_SIGNAL_FRAME
         CFI_DEF_CFA esp, 0
         CFI_REGISTER esp, ebp
--      movl TSS_sysenter_sp0(%esp),%esp
++      movl SYSENTER_stack_sp0(%esp),%esp
   sysenter_past_esp:
         /*
          * Interrupts are disabled here, but we can't trace it until
@@@ -1049,7 -1014,7 +1049,7 @@@ ENTRY(kernel_thread_helper
         CFI_ENDPROC
   ENDPROC(kernel_thread_helper)
   
--#ifdef CONFIG_XEN
++#ifdef CONFIG_PARAVIRT_XEN
   /* Xen doesn't set %esp to be precisely what the normal sysenter
      entrypoint expects, so fix it up before using the normal path. */
   ENTRY(xen_sysenter_target)
@@@ -1141,7 -1106,7 +1141,7 @@@ ENDPROC(xen_failsafe_callback
   BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
                 xen_evtchn_do_upcall)
   
--#endif        /* CONFIG_XEN */
++#endif        /* CONFIG_PARAVIRT_XEN */
   
   #ifdef CONFIG_FUNCTION_TRACER
   #ifdef CONFIG_DYNAMIC_FTRACE
@@@ -1309,7 -1274,7 +1309,7 @@@ END(page_fault
    * that sets up the real kernel stack. Check here, since we can't
    * allow the wrong stack to be used.
    *
-- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
++ * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
    * already pushed 3 words if it hits on the sysenter instruction:
    * eflags, cs and eip.
    *
@@@ -1321,7 -1286,7 +1321,7 @@@
         cmpw $__KERNEL_CS, 4(%esp)
         jne \ok
   \label:
--      movl TSS_sysenter_sp0 + \offset(%esp), %esp
++      movl SYSENTER_stack_sp0 + \offset(%esp), %esp
         CFI_DEF_CFA esp, 0
         CFI_UNDEFINED eip
         pushfl_cfi
diff --cc arch/x86/kernel/entry_64-xen.S

index 0000000,0000000..0b4c1c4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/entry_64-xen.S
@@@ -1,0 -1,0 +1,1368 @@@
++/*
++ *  linux/arch/x86_64/entry.S
++ *
++ *  Copyright (C) 1991, 1992  Linus Torvalds
++ *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
++ *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
++ *  Jun Nakajima <jun.nakajima@intel.com>
++ *  Asit Mallick <asit.k.mallick@intel.com>
++ *      Modified for Xen
++ */
++
++/*
++ * entry.S contains the system-call and fault low-level handling routines.
++ *
++ * NOTE: This code handles signal-recognition, which happens every time
++ * after an interrupt and after each system call.
++ *
++ * Normal syscalls and interrupts don't save a full stack frame, this is
++ * only done for syscall tracing, signals or fork/exec et al.
++ *
++ * A note on terminology:
++ * - top of stack: Architecture defined interrupt frame from SS to RIP
++ * at the top of the kernel process stack.
++ * - partial stack frame: partially saved registers up to R11.
++ * - full stack frame: Like partial stack frame, but all register saved.
++ *
++ * Some macro usage:
++ * - CFI macros are used to generate dwarf2 unwind information for better
++ * backtraces. They don't change any code.
++ * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
++ * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
++ * There are unfortunately lots of special cases where some registers
++ * are not touched. The macro is a big mess that should be cleaned up.
++ * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
++ * Gives a full stack frame.
++ * - ENTRY/END Define functions in the symbol table.
++ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
++ * frame that is otherwise undefined after a SYSCALL
++ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
++ * - errorentry/paranoidentry/zeroentry - Define exception entry points.
++ */
++
++#include <linux/linkage.h>
++#include <asm/segment.h>
++#include <asm/cache.h>
++#include <asm/errno.h>
++#include <asm/dwarf2.h>
++#include <asm/calling.h>
++#include <asm/asm-offsets.h>
++#include <asm/msr.h>
++#include <asm/unistd.h>
++#include <asm/thread_info.h>
++#include <asm/hw_irq.h>
++#include <asm/page_types.h>
++#include <asm/irqflags.h>
++#include <asm/processor-flags.h>
++#include <asm/ftrace.h>
++#include <asm/percpu.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/features.h>
++
++/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
++#include <linux/elf-em.h>
++#define AUDIT_ARCH_X86_64     (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
++#define __AUDIT_ARCH_64BIT 0x80000000
++#define __AUDIT_ARCH_LE          0x40000000
++
++      .code64
++      .section .entry.text, "ax"
++
++#ifdef CONFIG_FUNCTION_TRACER
++#ifdef CONFIG_DYNAMIC_FTRACE
++ENTRY(mcount)
++      retq
++END(mcount)
++
++ENTRY(ftrace_caller)  /* dynamic-ftrace trampoline: sets up two arguments and calls the ftrace_call site */
++      cmpl $0, function_trace_stop  /* non-zero function_trace_stop: return immediately */
++      jne  ftrace_stub
++
++      MCOUNT_SAVE_FRAME
++
++      movq 0x38(%rsp), %rdi  /* NOTE(review): presumably the return address into the traced function, 0x38 past the saved frame - confirm against MCOUNT_SAVE_FRAME */
++      movq 8(%rbp), %rsi  /* quadword above the saved frame pointer; presumably the parent return address - confirm */
++      subq $MCOUNT_INSN_SIZE, %rdi  /* step back over the call instruction */
++
++GLOBAL(ftrace_call)  /* as assembled this calls ftrace_stub; NOTE(review): presumably patched at run time - confirm */
++      call ftrace_stub
++
++      MCOUNT_RESTORE_FRAME
++
++#ifdef CONFIG_FUNCTION_GRAPH_TRACER
++GLOBAL(ftrace_graph_call)  /* as assembled this jumps to ftrace_stub; NOTE(review): presumably patched at run time - confirm */
++      jmp ftrace_stub
++#endif
++
++GLOBAL(ftrace_stub)  /* shared "do nothing" return */
++      retq
++END(ftrace_caller)
++
++#else /* ! CONFIG_DYNAMIC_FTRACE */
++ENTRY(mcount)  /* non-dynamic mcount: dispatch to whichever tracer is currently registered */
++      cmpl $0, function_trace_stop  /* non-zero function_trace_stop: return immediately */
++      jne  ftrace_stub
++
++      cmpq $ftrace_stub, ftrace_trace_function  /* a function tracer other than the stub installed? */
++      jnz trace
++
++#ifdef CONFIG_FUNCTION_GRAPH_TRACER
++      cmpq $ftrace_stub, ftrace_graph_return  /* graph return hook installed? */
++      jnz ftrace_graph_caller
++
++      cmpq $ftrace_graph_entry_stub, ftrace_graph_entry  /* graph entry hook installed? */
++      jnz ftrace_graph_caller
++#endif
++
++GLOBAL(ftrace_stub)  /* shared "do nothing" return, also the default hook value tested above */
++      retq
++
++trace:
++      MCOUNT_SAVE_FRAME
++
++      movq 0x38(%rsp), %rdi  /* NOTE(review): presumably the return address into the traced function - confirm against MCOUNT_SAVE_FRAME */
++      movq 8(%rbp), %rsi  /* quadword above the saved frame pointer; presumably the parent return address - confirm */
++      subq $MCOUNT_INSN_SIZE, %rdi  /* step back over the call instruction */
++
++      call   *ftrace_trace_function  /* arguments are in rdi and rsi */
++
++      MCOUNT_RESTORE_FRAME
++
++      jmp ftrace_stub
++END(mcount)
++#endif /* CONFIG_DYNAMIC_FTRACE */
++#endif /* CONFIG_FUNCTION_TRACER */
++
++#ifdef CONFIG_FUNCTION_GRAPH_TRACER
++ENTRY(ftrace_graph_caller)  /* set up and call prepare_ftrace_return for the function-graph tracer */
++      cmpl $0, function_trace_stop  /* non-zero function_trace_stop: do nothing */
++      jne ftrace_stub
++
++      MCOUNT_SAVE_FRAME
++
++      leaq 8(%rbp), %rdi  /* arg1: ADDRESS of the slot at 8(%rbp), not its value */
++      movq 0x38(%rsp), %rsi  /* arg2: NOTE(review): presumably the return address into the traced function - confirm against MCOUNT_SAVE_FRAME */
++      movq (%rbp), %rdx  /* arg3: quadword stored at (%rbp) */
++      subq $MCOUNT_INSN_SIZE, %rsi  /* step back over the call instruction */
++
++      call    prepare_ftrace_return
++
++      MCOUNT_RESTORE_FRAME
++
++      retq
++END(ftrace_graph_caller)
++
++GLOBAL(return_to_handler)  /* calls ftrace_return_to_handler and jumps to the address it returns */
++      subq  $24, %rsp  /* scratch area; only the first 16 bytes are used below */
++
++      /* Save the return values */
++      movq %rax, (%rsp)
++      movq %rdx, 8(%rsp)
++      movq %rbp, %rdi  /* pass the current frame pointer as arg1 */
++
++      call ftrace_return_to_handler
++
++      movq %rax, %rdi  /* value returned by the C helper is the jump target */
++      movq 8(%rsp), %rdx  /* restore the saved rdx/rax pair */
++      movq (%rsp), %rax
++      addq $24, %rsp
++      jmp *%rdi  /* continue at the address obtained above */
++#endif
++
++
++#ifndef CONFIG_PREEMPT
++#define retint_kernel retint_restore_args
++#endif
++
++
++.macro TRACE_IRQS_IRETQ offset=ARGOFFSET  /* offset: displacement subtracted from EFLAGS to locate the saved flags relative to %rsp */
++#ifdef CONFIG_TRACE_IRQFLAGS
++      bt   $9,EFLAGS-\offset(%rsp)    /* interrupts off? */
++      jnc  1f  /* bit 9 clear in the saved flags: skip TRACE_IRQS_ON */
++      TRACE_IRQS_ON
++1:
++#endif
++.endm
++
++NMI_MASK = 0x80000000
++      
++/*
++ * C code is not supposed to know about undefined top of stack. Every time
++ * a C function with a pt_regs argument is called from the SYSCALL based
++ * fast path FIXUP_TOP_OF_STACK is needed.
++ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
++ * manipulation.
++ */
++
++      /* %rsp:at FRAMEEND */
++      .macro FIXUP_TOP_OF_STACK tmp offset=0  /* tmp is accepted but not referenced in this body */
++      movq $__USER_CS,CS+\offset(%rsp)  /* store __USER_CS in the saved CS slot */
++      movq $-1,RCX+\offset(%rsp)  /* store -1 in the saved RCX slot */
++      .endm
++
++      .macro RESTORE_TOP_OF_STACK tmp offset=0  /* expands to nothing here; invocations emit no code */
++      .endm
++
++      .macro FAKE_STACK_FRAME child_rip  /* pushes six quadwords: ss, rsp, eflags, cs, rip, orig rax; clobbers rax */
++      /* push in order ss, rsp, eflags, cs, rip */
++      xorl %eax, %eax  /* rax = 0, pushed below for both the rsp and orig rax slots */
++      pushq_cfi $__KERNEL_DS /* ss */
++      /*CFI_REL_OFFSET        ss,0*/
++      pushq_cfi %rax /* rsp */
++      CFI_REL_OFFSET  rsp,0
++      pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
++      /*CFI_REL_OFFSET        rflags,0*/
++      pushq_cfi $__KERNEL_CS /* cs */
++      /*CFI_REL_OFFSET        cs,0*/
++      pushq_cfi \child_rip /* rip */
++      CFI_REL_OFFSET  rip,0
++      pushq_cfi %rax /* orig rax */
++      .endm
++
++      .macro UNFAKE_STACK_FRAME  /* drops the six quadwords pushed by FAKE_STACK_FRAME */
++      addq $8*6, %rsp  /* 6 slots of 8 bytes each */
++      CFI_ADJUST_CFA_OFFSET   -(6*8)
++      .endm
++
++/*
++ * initial frame state for syscall
++ */
++      .macro BASIC_FRAME start=1 offset=0
++      .if \start
++      CFI_STARTPROC simple
++      CFI_SIGNAL_FRAME
++      CFI_DEF_CFA rsp, SS+8+\offset-RIP
++      .else
++      CFI_DEF_CFA_OFFSET SS+8+\offset-RIP
++      .endif
++      /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
++      CFI_REL_OFFSET rsp, RSP+\offset-RIP
++      /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
++      /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
++      CFI_REL_OFFSET rip, RIP+\offset-RIP
++      .endm
++
++/*
++ * initial frame state for interrupts (and exceptions without error code)
++ */
++      .macro INTR_FRAME start=1 offset=0
++      .if \start == 1
++      BASIC_FRAME 1, \offset+2*8
++      CFI_REL_OFFSET rcx, 0+\offset
++      CFI_REL_OFFSET r11, 8+\offset
++      .else
++      BASIC_FRAME \start, \offset
++      .endif
++      .endm
++
++/*
++ * initial frame state for exceptions with error code (and interrupts
++ * with vector already pushed)
++ */
++      .macro XCPT_FRAME start=1 offset=0
++      INTR_FRAME \start, RIP+\offset-ORIG_RAX
++      .endm
++
++/*
++ * frame that enables calling into C.
++ */
++      .macro PARTIAL_FRAME start=1 offset=0
++      .if \start >= 0
++      XCPT_FRAME 2*\start, ORIG_RAX+\offset-ARGOFFSET
++      .endif
++      CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
++      CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
++      CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
++      CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
++      CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
++      CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
++      CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
++      CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
++      CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
++      .endm
++
++/*
++ * frame that enables passing a complete pt_regs to a C function.
++ */
++      .macro DEFAULT_FRAME start=1 offset=0
++      .if \start >= -1
++      PARTIAL_FRAME \start, R11+\offset-R15
++      .endif
++      CFI_REL_OFFSET rbx, RBX+\offset
++      CFI_REL_OFFSET rbp, RBP+\offset
++      CFI_REL_OFFSET r12, R12+\offset
++      CFI_REL_OFFSET r13, R13+\offset
++      CFI_REL_OFFSET r14, R14+\offset
++      CFI_REL_OFFSET r15, R15+\offset
++      .endm
++
++        /*
++         * Must be consistent with the definition in arch-x86/xen-x86_64.h:
++         *     struct iret_context {
++         *        u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
++         *     };
++         * with rax, r11, and rcx being taken care of in the hypercall stub.
++         */
++      .macro HYPERVISOR_IRET flag  /* return through iretq when possible, otherwise through the iret hypercall; flag is pushed for the hypercall */
++      testb $3,1*8(%rsp)  /* CS slot: either of the low two bits set -> slow path */
++      jnz   2f
++      testl $NMI_MASK,2*8(%rsp)  /* NMI_MASK set in the saved flags -> slow path */
++      jnz   2f
++
++      cmpb  $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip)  /* feature byte non-zero: iretq without touching CS/SS */
++      jne   1f
++
++      /* Direct iret to kernel space. Correct CS and SS. */
++      orl   $3,1*8(%rsp)  /* set the low two bits of the CS slot */
++      orl   $3,4*8(%rsp)  /* set the low two bits of the SS slot */
++1:    iretq
++
++2:    /* Slow iret via hypervisor. */
++      andl  $~NMI_MASK, 2*8(%rsp)  /* clear NMI_MASK in the saved flags */
++      pushq $\flag  /* pushed directly below rip, i.e. the "flags" member of struct iret_context */
++      jmp  hypercall_page + (__HYPERVISOR_iret * 32)  /* hypercall stubs are indexed in 32-byte steps */
++      .endm
++
++#ifndef CONFIG_XEN
++/* save partial stack frame */
++      .pushsection .kprobes.text, "ax"
++ENTRY(save_args)
++      XCPT_FRAME offset=ORIG_RAX-RBP+8
++      cld
++      /*
++       * start from rbp in pt_regs and jump over
++       * return address.
++       */
++      movq_cfi rdi, RDI+8-RBP
++      movq %rsi, RSI+8-RBP(%rsp)
++      movq %rdx, RDX+8-RBP(%rsp)
++      movq %rcx, RCX+8-RBP(%rsp)
++      movq_cfi rax, RAX+8-RBP
++      movq  %r8,  R8+8-RBP(%rsp)
++      movq  %r9,  R9+8-RBP(%rsp)
++      movq %r10, R10+8-RBP(%rsp)
++      movq %r11, R11+8-RBP(%rsp)
++
++      leaq -RBP+8(%rsp),%rdi  /* arg1 for handler */
++      movq_cfi rbp, 8         /* push %rbp */
++      leaq 8(%rsp), %rbp              /* mov %rsp, %ebp */
++      CFI_DEF_CFA_REGISTER rbp
++      CFI_ADJUST_CFA_OFFSET -8
++      testl $3, CS(%rdi)
++      je 1f
++      SWAPGS
++      /*
++       * irq_count is used to check if a CPU is already on an interrupt stack
++       * or not. While this is essentially redundant with preempt_count it is
++       * a little cheaper to use a separate counter in the PDA (short of
++       * moving irq_enter into assembly, which would be too much work)
++       */
++1:    incl PER_CPU_VAR(irq_count)
++      jne 2f
++      popq %rax                       /* move return address... */
++      mov PER_CPU_VAR(irq_stack_ptr),%rsp
++      pushq %rbp                      /* backlink for unwinder */
++      pushq %rax                      /* ... to the new stack */
++      /*
++       * We entered an interrupt context - irqs are off:
++       */
++2:    TRACE_IRQS_OFF
++      ret
++      CFI_ENDPROC
++END(save_args)
++      .popsection
++#endif
++
++ENTRY(save_rest)  /* stores rbx, rbp, r12-r15 into their frame slots; the frame base is at 16(%rsp) on entry */
++      CFI_STARTPROC
++      movq 5*8+16(%rsp), %r11 /* save return address */
++      movq %rbx, RBX+16(%rsp)  /* NOTE(review): presumably overwrites the slot just read into r11 (if RBX == 5*8) - confirm in asm/calling.h */
++      movq %rbp, RBP+16(%rsp)
++      movq %r12, R12+16(%rsp)
++      movq %r13, R13+16(%rsp)
++      movq %r14, R14+16(%rsp)
++      movq %r15, R15+16(%rsp)
++      movq %r11, 8(%rsp)      /* return address */
++      FIXUP_TOP_OF_STACK %r11, 16  /* same +16 displacement as the stores above */
++      ret
++      CFI_ENDPROC
++END(save_rest)
++
++#ifndef CONFIG_XEN
++/* save complete stack frame */
++      .pushsection .kprobes.text, "ax"
++ENTRY(save_paranoid)  /* saves all registers; returns ebx = 1 if no SWAPGS was done, ebx = 0 if it was */
++      XCPT_FRAME offset=ORIG_RAX-R15+8
++      cld
++      movq %rdi, RDI+8(%rsp)  /* all slots are +8, presumably to step over this routine's return address at (%rsp) */
++      movq %rsi, RSI+8(%rsp)
++      movq_cfi rdx, RDX+8
++      movq_cfi rcx, RCX+8
++      movq_cfi rax, RAX+8
++      movq %r8, R8+8(%rsp)
++      movq %r9, R9+8(%rsp)
++      movq %r10, R10+8(%rsp)
++      movq %r11, R11+8(%rsp)
++      movq_cfi rbx, RBX+8
++      movq %rbp, RBP+8(%rsp)
++      movq %r12, R12+8(%rsp)
++      movq %r13, R13+8(%rsp)
++      movq %r14, R14+8(%rsp)
++      movq %r15, R15+8(%rsp)
++      movl $1,%ebx  /* default result: no SWAPGS performed */
++      movl $MSR_GS_BASE,%ecx  /* MSR index for rdmsr */
++      rdmsr  /* result in edx:eax; only the sign of edx is examined */
++      testl %edx,%edx
++      js 1f   /* negative -> in kernel */
++      SWAPGS
++      xorl %ebx,%ebx  /* result 0: SWAPGS was executed */
++1:    ret
++      CFI_ENDPROC
++END(save_paranoid)
++      .popsection
++#endif
++
++/*
++ * A newly forked process directly context switches into this address.
++ *
++ * rdi: prev task we switched from
++ */
++ENTRY(ret_from_fork)  /* first code run by a new task: schedule_tail, then leave via the IRET or fast syscall return path */
++      DEFAULT_FRAME
++
++      LOCK ; btr $TIF_FORK,TI_flags(%r8)  /* NOTE(review): %r8 is not set in this block; presumably the thread_info pointer left by the switch code - confirm */
++
++      pushq_cfi kernel_eflags(%rip)
++      popfq_cfi                               # reset kernel eflags
++
++      call schedule_tail                      # rdi: 'prev' task parameter
++
++      GET_THREAD_INFO(%rcx)
++
++      RESTORE_REST
++
++      testl $3, CS-ARGOFFSET(%rsp)            # from kernel_thread?
++      je   int_ret_from_sys_call  /* low two CS bits clear: take the IRET path */
++
++      testl $_TIF_IA32, TI_flags(%rcx)        # 32-bit compat task needs IRET
++      jnz  int_ret_from_sys_call
++
++      RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
++      jmp ret_from_sys_call                   # go to the SYSRET fastpath
++
++      CFI_ENDPROC
++END(ret_from_fork)
++
++/*
++ * System call entry. Up to 6 arguments in registers are supported.
++ *
++ * SYSCALL does not save anything on the stack and does not change the
++ * stack pointer.
++ */
++
++/*
++ * Register setup:
++ * rax  system call number
++ * rdi  arg0
++ * rcx  return address for syscall/sysret, C arg3
++ * rsi  arg1
++ * rdx  arg2
++ * r10  arg3  (--> moved to rcx for C)
++ * r8   arg4
++ * r9   arg5
++ * r11  eflags for syscall/sysret, temporary for C
++ * r12-r15,rbp,rbx saved by C code, not touched.
++ *
++ * Interrupts are enabled on entry.
++ * Only called from user space.
++ *
++ * XXX        if we had a free scratch register we could save the RSP into the stack frame
++ *      and report it properly in ps. Unfortunately we haven't.
++ *
++ * When user can change the frames always force IRET. That is because
++ * it deals with uncanonical addresses better. SYSRET has trouble
++ * with them due to bugs in both AMD and Intel CPUs.
++ */
++
++ENTRY(system_call)  /* 64-bit syscall entry: save argument registers, then dispatch through sys_call_table */
++      INTR_FRAME start=2 offset=2*8
++      SAVE_ARGS -8,0
++      movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)  /* record the syscall number in the ORIG_RAX slot */
++      GET_THREAD_INFO(%rcx)
++      testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)  /* any entry work flagged? then take the tracesys path */
++      jnz tracesys
++system_call_fastpath:
++      cmpq $__NR_syscall_max,%rax  /* unsigned range check on the syscall number */
++      ja badsys
++      movq %r10,%rcx  /* arg3 arrives in r10 and is moved to rcx for the C handler */
++      call *sys_call_table(,%rax,8)  # XXX:    rip relative
++      movq %rax,RAX-ARGOFFSET(%rsp)  /* store the handler's return value in the saved RAX slot */
++/*
++ * Syscall return path ending with SYSRET (fast path)
++ * Has incomplete stack frame and undefined top of stack.
++ */
++ret_from_sys_call:
++      movl $_TIF_ALLWORK_MASK,%edi
++      /* edi: flagmask */
++sysret_check:
++      LOCKDEP_SYS_EXIT
++      GET_THREAD_INFO(%rcx)
++      DISABLE_INTERRUPTS(CLBR_NONE)
++      TRACE_IRQS_OFF
++      movl TI_flags(%rcx),%edx
++      andl %edi,%edx
++      jnz  sysret_careful
++      CFI_REMEMBER_STATE
++      /*
++       * sysretq will re-enable interrupts:
++       */
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_NONE)
++      RESTORE_ARGS 0,8,0
++        HYPERVISOR_IRET VGCF_IN_SYSCALL
++
++      CFI_RESTORE_STATE
++      /* Handle reschedules */
++      /* edx: work, edi: workmask */
++sysret_careful:
++      bt $TIF_NEED_RESCHED,%edx
++      jnc sysret_signal
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_NONE)
++      pushq_cfi %rdi
++      call schedule
++      popq_cfi %rdi
++      jmp sysret_check
++
++      /* Handle a signal */
++sysret_signal:
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_NONE)
++#ifdef CONFIG_AUDITSYSCALL
++      bt $TIF_SYSCALL_AUDIT,%edx
++      jc sysret_audit
++#endif
++      /*
++       * We have a signal, or exit tracing or single-step.
++       * These all wind up with the iret return path anyway,
++       * so just join that path right now.
++       */
++      FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
++      jmp int_check_syscall_exit_work
++
++badsys:
++      movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
++      jmp ret_from_sys_call
++
++#ifdef CONFIG_AUDITSYSCALL
++      /*
++       * Fast path for syscall audit without full syscall trace.
++       * We just call audit_syscall_entry() directly, and then
++       * jump back to the normal fast path.
++       */
++auditsys:
++      movq %r10,%r9                   /* 6th arg: 4th syscall arg */
++      movq %rdx,%r8                   /* 5th arg: 3rd syscall arg */
++      movq %rsi,%rcx                  /* 4th arg: 2nd syscall arg */
++      movq %rdi,%rdx                  /* 3rd arg: 1st syscall arg */
++      movq %rax,%rsi                  /* 2nd arg: syscall number */
++      movl $AUDIT_ARCH_X86_64,%edi    /* 1st arg: audit arch */
++      call audit_syscall_entry
++      LOAD_ARGS 0             /* reload call-clobbered registers */
++      jmp system_call_fastpath
++
++      /*
++       * Return fast path for syscall audit.  Call audit_syscall_exit()
++       * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
++       * masked off.
++       */
++sysret_audit:
++      movq RAX-ARGOFFSET(%rsp),%rsi   /* second arg, syscall return value */
++      cmpq $0,%rsi            /* is it < 0? */
++      setl %al                /* 1 if so, 0 if not */
++      movzbl %al,%edi         /* zero-extend that into %edi */
++      inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
++      call audit_syscall_exit
++      movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
++      jmp sysret_check
++#endif        /* CONFIG_AUDITSYSCALL */
++
++      /* Do syscall tracing */
++tracesys:
++#ifdef CONFIG_AUDITSYSCALL
++      testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
++      jz auditsys
++#endif
++      SAVE_REST
++      movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
++      FIXUP_TOP_OF_STACK %rdi
++      movq %rsp,%rdi
++      call syscall_trace_enter
++      /*
++       * Reload arg registers from stack in case ptrace changed them.
++       * We don't reload %rax because syscall_trace_enter() returned
++       * the value it wants us to use in the table lookup.
++       */
++      LOAD_ARGS ARGOFFSET, 1
++      RESTORE_REST
++      cmpq $__NR_syscall_max,%rax
++      ja   int_ret_from_sys_call      /* RAX(%rsp) set to -ENOSYS above */
++      movq %r10,%rcx  /* fixup for C */
++      call *sys_call_table(,%rax,8)
++      movq %rax,RAX-ARGOFFSET(%rsp)
++      /* Use IRET because user could have changed frame */
++
++/*
++ * Syscall return path ending with IRET.
++ * Has correct top of stack, but partial stack frame.
++ */
++GLOBAL(int_ret_from_sys_call)  /* IRET-based syscall return; falls through to int_with_check when the CS test below is non-zero */
++      DISABLE_INTERRUPTS(CLBR_NONE)
++      TRACE_IRQS_OFF
++      testb $3,CS-ARGOFFSET(%rsp)  /* low two bits of the saved CS */
++        jnz 1f
++        /* Need to set the proper %ss (not NULL) for ring 3 iretq */
++        movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
++        jmp retint_restore_args   # return from ring3 kernel
++1:              /* saved CS has low bits set: go on to check the thread work flags */
++      movl $_TIF_ALLWORK_MASK,%edi
++      /* edi: mask to check */
++GLOBAL(int_with_check)
++      LOCKDEP_SYS_EXIT_IRQ
++      GET_THREAD_INFO(%rcx)
++      movl TI_flags(%rcx),%edx
++      andl %edi,%edx
++      jnz   int_careful
++      andl    $~TS_COMPAT,TI_status(%rcx)
++      jmp   retint_restore_args
++
++      /* Either reschedule or signal or syscall exit tracking needed. */
++      /* First do a reschedule test. */
++      /* edx: work, edi: workmask */
++int_careful:
++      bt $TIF_NEED_RESCHED,%edx
++      jnc  int_very_careful
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_NONE)
++      pushq_cfi %rdi
++      call schedule
++      popq_cfi %rdi
++      DISABLE_INTERRUPTS(CLBR_NONE)
++      TRACE_IRQS_OFF
++      jmp int_with_check
++
++      /* handle signals and tracing -- both require a full stack frame */
++int_very_careful:
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_NONE)
++int_check_syscall_exit_work:
++      SAVE_REST
++      /* Check for syscall exit trace */
++      testl $_TIF_WORK_SYSCALL_EXIT,%edx
++      jz int_signal
++      pushq_cfi %rdi
++      leaq 8(%rsp),%rdi       # &ptregs -> arg1
++      call syscall_trace_leave
++      popq_cfi %rdi
++      andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
++      jmp int_restore_rest
++
++int_signal:
++      testl $_TIF_DO_NOTIFY_MASK,%edx
++      jz 1f
++      movq %rsp,%rdi          # &ptregs -> arg1
++      xorl %esi,%esi          # oldset -> arg2
++      call do_notify_resume
++1:    movl $_TIF_WORK_MASK,%edi
++int_restore_rest:
++      RESTORE_REST
++      DISABLE_INTERRUPTS(CLBR_NONE)
++      TRACE_IRQS_OFF
++      jmp int_with_check
++      CFI_ENDPROC
++END(system_call)
++
++/*
++ * Certain special system calls that need to save a complete full stack frame.
++ */
++      .macro PTREGSCALL label,func,arg
++ENTRY(\label)
++      PARTIAL_FRAME 1 8               /* offset 8: return address */
++      subq $REST_SKIP, %rsp
++      CFI_ADJUST_CFA_OFFSET REST_SKIP
++      call save_rest
++      DEFAULT_FRAME -2 8              /* offset 8: return address */
++      leaq 8(%rsp), \arg      /* pt_regs pointer */
++      call \func
++      jmp ptregscall_common
++      CFI_ENDPROC
++END(\label)
++      .endm
++
++      PTREGSCALL stub_clone, sys_clone, %r8
++      PTREGSCALL stub_fork, sys_fork, %rdi
++      PTREGSCALL stub_vfork, sys_vfork, %rdi
++      PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
++      PTREGSCALL stub_iopl, sys_iopl, %rsi
++
++ENTRY(ptregscall_common)
++      DEFAULT_FRAME 1 8       /* offset 8: return address */
++      RESTORE_TOP_OF_STACK %r11, 8
++      movq_cfi_restore R15+8, r15
++      movq_cfi_restore R14+8, r14
++      movq_cfi_restore R13+8, r13
++      movq_cfi_restore R12+8, r12
++      movq_cfi_restore RBP+8, rbp
++      movq_cfi_restore RBX+8, rbx
++      ret $REST_SKIP          /* pop extended registers */
++      CFI_ENDPROC
++END(ptregscall_common)
++
++ENTRY(stub_execve)
++      CFI_STARTPROC
++      addq $8, %rsp
++      PARTIAL_FRAME 0
++      SAVE_REST
++      FIXUP_TOP_OF_STACK %r11
++      movq %rsp, %rcx
++      call sys_execve
++      RESTORE_TOP_OF_STACK %r11
++      movq %rax,RAX(%rsp)
++      RESTORE_REST
++      jmp int_ret_from_sys_call
++      CFI_ENDPROC
++END(stub_execve)
++
++/*
++ * sigreturn is special because it needs to restore all registers on return.
++ * This cannot be done with SYSRET, so use the IRET return path instead.
++ */
++ENTRY(stub_rt_sigreturn)
++      CFI_STARTPROC
++      addq $8, %rsp
++      PARTIAL_FRAME 0
++      SAVE_REST
++      movq %rsp,%rdi
++      FIXUP_TOP_OF_STACK %r11
++      call sys_rt_sigreturn
++      movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
++      RESTORE_REST
++      jmp int_ret_from_sys_call
++      CFI_ENDPROC
++END(stub_rt_sigreturn)
++
++/*
++ * Interrupt exit.
++ */ 
++
++retint_with_reschedule:
++      PARTIAL_FRAME
++      movl $_TIF_WORK_MASK,%edi
++retint_check:
++      LOCKDEP_SYS_EXIT_IRQ
++      movl TI_flags(%rcx),%edx
++      andl %edi,%edx
++      CFI_REMEMBER_STATE
++      jnz  retint_careful
++retint_restore_args:  /* return to kernel space */
++      movl EFLAGS-REST_SKIP(%rsp), %eax
++      shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
++      GET_VCPU_INFO
++      andb evtchn_upcall_mask(%rsi),%al
++      andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
++      jnz restore_all_enable_events   #        != 0 => enable event delivery
++              
++      RESTORE_ARGS 0,8,0
++      HYPERVISOR_IRET 0
++      
++      /* edi: workmask, edx: work */
++retint_careful:
++      CFI_RESTORE_STATE
++      bt    $TIF_NEED_RESCHED,%edx
++      jnc   retint_signal
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_NONE)
++      pushq_cfi %rdi
++      call  schedule
++      popq_cfi %rdi
++      GET_THREAD_INFO(%rcx)
++      DISABLE_INTERRUPTS(CLBR_NONE)
++      TRACE_IRQS_OFF
++      jmp retint_check
++
++retint_signal:
++      testl $_TIF_DO_NOTIFY_MASK,%edx
++      jz    retint_restore_args
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_NONE)
++      SAVE_REST
++      movq $-1,ORIG_RAX(%rsp)
++      xorl %esi,%esi          # oldset
++      movq %rsp,%rdi          # &pt_regs
++      call do_notify_resume
++      RESTORE_REST
++      DISABLE_INTERRUPTS(CLBR_NONE)
++      TRACE_IRQS_OFF
++      GET_THREAD_INFO(%rcx)
++      jmp retint_with_reschedule
++
++#ifdef CONFIG_PREEMPT
++      /* Returning to kernel space. Check if we need preemption */
++      /* rcx:  threadinfo. interrupts off. */
++ENTRY(retint_kernel)
++      cmpl $0,TI_preempt_count(%rcx)
++      jnz  retint_restore_args
++      bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
++      jnc  retint_restore_args
++      bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
++      jnc  retint_restore_args
++      call preempt_schedule_irq
++      jmp retint_kernel       /* check again */
++#endif
++
++      CFI_ENDPROC
++END(retint_check)
++
++#ifndef CONFIG_XEN
++/*
++ * APIC interrupts.
++ */
++.macro apicinterrupt num sym do_sym
++ENTRY(\sym)
++      INTR_FRAME
++      pushq_cfi $~(\num)
++      interrupt \do_sym
++      jmp error_entry
++      CFI_ENDPROC
++END(\sym)
++.endm
++
++#ifdef CONFIG_SMP
++apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
++      irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
++apicinterrupt REBOOT_VECTOR \
++      reboot_interrupt smp_reboot_interrupt
++#endif
++
++#ifdef CONFIG_X86_UV
++apicinterrupt UV_BAU_MESSAGE \
++      uv_bau_message_intr1 uv_bau_message_interrupt
++#endif
++apicinterrupt LOCAL_TIMER_VECTOR \
++      apic_timer_interrupt smp_apic_timer_interrupt
++apicinterrupt X86_PLATFORM_IPI_VECTOR \
++      x86_platform_ipi smp_x86_platform_ipi
++
++#ifdef CONFIG_SMP
++.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
++      16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
++.if NUM_INVALIDATE_TLB_VECTORS > \idx
++apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
++      invalidate_interrupt\idx smp_invalidate_interrupt
++.endif
++.endr
++#endif
++
++apicinterrupt THRESHOLD_APIC_VECTOR \
++      threshold_interrupt smp_threshold_interrupt
++apicinterrupt THERMAL_APIC_VECTOR \
++      thermal_interrupt smp_thermal_interrupt
++
++#ifdef CONFIG_X86_MCE
++apicinterrupt MCE_SELF_VECTOR \
++      mce_self_interrupt smp_mce_self_interrupt
++#endif
++
++#ifdef CONFIG_SMP
++apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
++      call_function_single_interrupt smp_call_function_single_interrupt
++apicinterrupt CALL_FUNCTION_VECTOR \
++      call_function_interrupt smp_call_function_interrupt
++apicinterrupt RESCHEDULE_VECTOR \
++      reschedule_interrupt smp_reschedule_interrupt
++#endif
++
++apicinterrupt ERROR_APIC_VECTOR \
++      error_interrupt smp_error_interrupt
++apicinterrupt SPURIOUS_APIC_VECTOR \
++      spurious_interrupt smp_spurious_interrupt
++
++#ifdef CONFIG_IRQ_WORK
++apicinterrupt IRQ_WORK_VECTOR \
++      irq_work_interrupt smp_irq_work_interrupt
++#endif
++#endif /* !CONFIG_XEN */
++
++/*
++ * Exception entry points.
++ */
++.macro zeroentry sym do_sym
++ENTRY(\sym)
++      INTR_FRAME
++        movq (%rsp),%rcx
++      CFI_RESTORE rcx
++        movq 8(%rsp),%r11
++      CFI_RESTORE r11
++      movq $-1,8(%rsp)        /* ORIG_RAX: no syscall to restart */
++      subq $ORIG_RAX-R15-1*8,%rsp
++      CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-1*8
++      call error_entry
++      DEFAULT_FRAME -1
++      movq %rsp,%rdi          /* pt_regs pointer */
++      xorl %esi,%esi          /* no error code */
++      call \do_sym
++      jmp error_exit          /* %ebx: no swapgs flag */
++      CFI_ENDPROC
++END(\sym)
++.endm
++
++.macro paranoidzeroentry sym do_sym
++      zeroentry \sym \do_sym
++.endm
++
++.macro paranoidzeroentry_ist sym do_sym ist
++      zeroentry \sym \do_sym
++.endm
++
++.macro errorentry sym do_sym
++ENTRY(\sym)
++      XCPT_FRAME
++        movq (%rsp),%rcx
++      CFI_RESTORE rcx
++        movq 8(%rsp),%r11
++      CFI_RESTORE r11
++      subq $ORIG_RAX-R15-2*8,%rsp
++      CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-2*8
++      call error_entry
++      DEFAULT_FRAME -1
++      movq %rsp,%rdi                  /* pt_regs pointer */
++      movq ORIG_RAX(%rsp),%rsi        /* get error code */
++      movq $-1,ORIG_RAX(%rsp)         /* no syscall to restart */
++      call \do_sym
++      jmp error_exit                  /* %ebx: no swapgs flag */
++      CFI_ENDPROC
++END(\sym)
++.endm
++
++      /* error code is on the stack already */
++.macro paranoiderrorentry sym do_sym
++      errorentry \sym \do_sym
++.endm
++
++/*
++ * Copied from arch/xen/i386/kernel/entry.S
++ */               
++# A note on the "critical region" in our callback handler.
++# We want to avoid stacking callback handlers due to events occurring
++# during handling of the last event. To do this, we keep events disabled
++# until we've done all processing. HOWEVER, we must enable events before
++# popping the stack frame (can't be done atomically) and so it would still
++# be possible to get enough handler activations to overflow the stack.
++# Although unlikely, bugs of that kind are hard to track down, so we'd
++# like to avoid the possibility.
++# So, on entry to the handler we detect whether we interrupted an
++# existing activation in its critical region -- if so, we pop the current
++# activation and restart the handler using the previous one.
++ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
++      CFI_STARTPROC
++# Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
++# see the correct pointer to the pt_regs
++      movq %rdi, %rsp            # we don't return, adjust the stack frame
++      CFI_ENDPROC
++      DEFAULT_FRAME
++11:   incl PER_CPU_VAR(irq_count)
++      movq %rsp,%rbp
++      CFI_DEF_CFA_REGISTER rbp
++      cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
++      pushq %rbp                      # backlink for old unwinder
++      call evtchn_do_upcall
++      popq %rsp
++      CFI_DEF_CFA_REGISTER rsp
++      decl PER_CPU_VAR(irq_count)
++      jmp  error_exit
++      CFI_ENDPROC
++END(do_hypervisor_callback)
++
++        ALIGN
++restore_all_enable_events:  
++      PARTIAL_FRAME
++      TRACE_IRQS_ON
++      __ENABLE_INTERRUPTS
++
++scrit:        /**** START OF CRITICAL REGION ****/
++      __TEST_PENDING
++      CFI_REMEMBER_STATE
++      jnz  14f                        # process more events if necessary...
++        RESTORE_ARGS 0,8,0
++        HYPERVISOR_IRET 0
++        
++      CFI_RESTORE_STATE
++14:   __DISABLE_INTERRUPTS
++      SAVE_REST
++        movq %rsp,%rdi                  # set the argument again
++      jmp  11b
++      CFI_ENDPROC
++ecrit:  /**** END OF CRITICAL REGION ****/
++# Unlike x86-32, no fixup is done at this point: that keeps the code simple,
++# and the stack frame is more complex on x86-64.
++# If the kernel is interrupted inside the critical region, the kernel just
++# does an IRET in that case, which restores everything, i.e. execution
++# resumes at the interrupted instruction with the same context.
++
++# Hypervisor uses this for application faults while it executes.
++# We get here for two reasons:
++#  1. Fault while reloading DS, ES, FS or GS
++#  2. Fault while executing IRET
++# Category 1 we do not need to fix up as Xen has already reloaded all segment
++# registers that could be reloaded and zeroed the others.
++# Category 2 we fix up by killing the current process. We cannot use the
++# normal Linux return path in this case because if we use the IRET hypercall
++# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
++# We distinguish between categories by comparing each saved segment register
++# with its current contents: any discrepancy means we are in category 1.
++ENTRY(failsafe_callback)
++      INTR_FRAME offset=4*8
++      movw %ds,%cx
++      cmpw %cx,0x10(%rsp)
++      CFI_REMEMBER_STATE
++      jne 1f
++      movw %es,%cx
++      cmpw %cx,0x18(%rsp)
++      jne 1f
++      movw %fs,%cx
++      cmpw %cx,0x20(%rsp)
++      jne 1f
++      movw %gs,%cx
++      cmpw %cx,0x28(%rsp)
++      jne 1f
++      /* All segments match their saved values => Category 2 (Bad IRET). */
++      movq (%rsp),%rcx
++      CFI_RESTORE rcx
++      movq 8(%rsp),%r11
++      CFI_RESTORE r11
++      addq $0x30,%rsp
++      CFI_ADJUST_CFA_OFFSET -0x30
++      movq $11,%rdi   /* SIGSEGV */
++      jmp do_exit                     
++      CFI_RESTORE_STATE
++1:    /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
++      movq (%rsp),%rcx
++      CFI_RESTORE rcx
++      movq 8(%rsp),%r11
++      CFI_RESTORE r11
++      addq $0x30,%rsp
++      CFI_ADJUST_CFA_OFFSET -0x30
++      pushq_cfi $0
++      SAVE_ALL
++      jmp error_exit
++      CFI_ENDPROC
++
++zeroentry divide_error do_divide_error
++zeroentry overflow do_overflow
++zeroentry bounds do_bounds
++zeroentry invalid_op do_invalid_op
++zeroentry device_not_available do_device_not_available
++zeroentry hypervisor_callback do_hypervisor_callback
++zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
++errorentry invalid_TSS do_invalid_TSS
++errorentry segment_not_present do_segment_not_present
++zeroentry coprocessor_error do_coprocessor_error
++errorentry alignment_check do_alignment_check
++zeroentry simd_coprocessor_error do_simd_coprocessor_error
++      
++ENTRY(kernel_thread_helper)
++      pushq $0                # fake return address
++      CFI_STARTPROC
++      /*
++       * Here we are in the child and the registers are set as they were
++       * at kernel_thread() invocation in the parent.
++       */
++      call *%rsi              /* invoke the function whose address is in %rsi */
++      # exit
++      mov %eax, %edi          /* its return value is the argument to do_exit */
++      call do_exit
++      ud2                     # padding for call trace
++      CFI_ENDPROC
++END(kernel_thread_helper)
++
++/*
++ * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
++ *
++ * C extern interface:
++ *     extern long execve(const char *name, char **argv, char **envp)
++ *
++ * asm input arguments:
++ *    rdi: name, rsi: argv, rdx: envp
++ *
++ * We want to fall back to:
++ *    extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
++ *
++ * do_sys_execve asm fallback arguments:
++ *    rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
++ */
++ENTRY(kernel_execve)
++      CFI_STARTPROC
++      FAKE_STACK_FRAME $0
++      SAVE_ALL
++      movq %rsp,%rcx
++      call sys_execve
++      movq %rax, RAX(%rsp)
++      RESTORE_REST
++      testq %rax,%rax
++      jne 1f
++        jmp int_ret_from_sys_call
++1:      RESTORE_ARGS
++      UNFAKE_STACK_FRAME
++      ret
++      CFI_ENDPROC
++END(kernel_execve)
++
++/* Call softirq on interrupt stack. Interrupts are off. */
++ENTRY(call_softirq)
++      CFI_STARTPROC
++      pushq_cfi %rbp
++      CFI_REL_OFFSET rbp,0
++      mov  %rsp,%rbp                  /* remember the caller stack for leaveq */
++      CFI_DEF_CFA_REGISTER rbp
++      incl PER_CPU_VAR(irq_count)     /* sets ZF iff the count became zero */
++      cmove PER_CPU_VAR(irq_stack_ptr),%rsp   /* ZF set: switch to irq stack */
++      push  %rbp                      # backlink for old unwinder
++      call __do_softirq
++      leaveq                          /* %rsp = %rbp, then pop %rbp */
++      CFI_RESTORE             rbp
++      CFI_DEF_CFA_REGISTER    rsp
++      CFI_ADJUST_CFA_OFFSET   -8
++      decl PER_CPU_VAR(irq_count)     /* undo the incl above */
++      ret
++      CFI_ENDPROC
++END(call_softirq)
++
++#ifdef CONFIG_STACK_UNWIND
++ENTRY(arch_unwind_init_running)
++      CFI_STARTPROC
++      movq    %r15, R15(%rdi)
++      movq    %r14, R14(%rdi)
++      xchgq   %rsi, %rdx
++      movq    %r13, R13(%rdi)
++      movq    %r12, R12(%rdi)
++      xorl    %eax, %eax
++      movq    %rbp, RBP(%rdi)
++      movq    %rbx, RBX(%rdi)
++      movq    (%rsp), %r9
++      xchgq   %rdx, %rcx
++      movq    %rax, R11(%rdi)
++      movq    %rax, R10(%rdi)
++      movq    %rax, R9(%rdi)
++      movq    %rax, R8(%rdi)
++      movq    %rax, RAX(%rdi)
++      movq    %rax, RCX(%rdi)
++      movq    %rax, RDX(%rdi)
++      movq    %rax, RSI(%rdi)
++      movq    %rax, RDI(%rdi)
++      movq    %rax, ORIG_RAX(%rdi)
++      movq    %r9, RIP(%rdi)
++      leaq    8(%rsp), %r9
++      movq    $__KERNEL_CS, CS(%rdi)
++      movq    %rax, EFLAGS(%rdi)
++      movq    %r9, RSP(%rdi)
++      movq    $__KERNEL_DS, SS(%rdi)
++      jmpq    *%rcx
++      CFI_ENDPROC
++END(arch_unwind_init_running)
++#endif
++
++/*
++ * Some functions should be protected against kprobes
++ */
++      .pushsection .kprobes.text, "ax"
++
++paranoidzeroentry_ist debug do_debug DEBUG_STACK
++zeroentry nmi do_nmi_callback
++paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
++paranoiderrorentry stack_segment do_stack_segment
++errorentry general_protection do_general_protection
++errorentry page_fault do_page_fault
++#ifdef CONFIG_KVM_GUEST
++errorentry async_page_fault do_async_page_fault
++#endif
++#ifdef CONFIG_X86_MCE
++paranoidzeroentry machine_check *machine_check_vector(%rip)
++#endif
++
++#ifndef CONFIG_XEN
++      /*
++       * "Paranoid" exit path from exception stack.
++       * Paranoid because this is used by NMIs and cannot take
++       * any kernel state for granted.
++       * We don't do kernel preemption checks here, because only
++       * NMI should be common and it does not enable IRQs and
++       * cannot get reschedule ticks.
++       *
++       * "trace" is 0 for the NMI handler only, because irq-tracing
++       * is fundamentally NMI-unsafe. (we cannot change the soft and
++       * hard flags at once, atomically)
++       */
++
++      /* ebx: no swapgs flag */
++ENTRY(paranoid_exit)
++      DEFAULT_FRAME
++      DISABLE_INTERRUPTS(CLBR_NONE)
++      TRACE_IRQS_OFF
++      testl %ebx,%ebx                         /* swapgs needed? */
++      jnz paranoid_restore
++      testl $3,CS(%rsp)
++      jnz   paranoid_userspace
++paranoid_swapgs:
++      TRACE_IRQS_IRETQ 0
++      SWAPGS_UNSAFE_STACK
++      RESTORE_ALL 8
++      jmp irq_return
++paranoid_restore:
++      TRACE_IRQS_IRETQ 0
++      RESTORE_ALL 8
++      jmp irq_return
++paranoid_userspace:
++      GET_THREAD_INFO(%rcx)
++      movl TI_flags(%rcx),%ebx
++      andl $_TIF_WORK_MASK,%ebx
++      jz paranoid_swapgs
++      movq %rsp,%rdi                  /* &pt_regs */
++      call sync_regs
++      movq %rax,%rsp                  /* switch stack for scheduling */
++      testl $_TIF_NEED_RESCHED,%ebx
++      jnz paranoid_schedule
++      movl %ebx,%edx                  /* arg3: thread flags */
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_NONE)
++      xorl %esi,%esi                  /* arg2: oldset */
++      movq %rsp,%rdi                  /* arg1: &pt_regs */
++      call do_notify_resume
++      DISABLE_INTERRUPTS(CLBR_NONE)
++      TRACE_IRQS_OFF
++      jmp paranoid_userspace
++paranoid_schedule:
++      TRACE_IRQS_ON
++      ENABLE_INTERRUPTS(CLBR_ANY)
++      call schedule
++      DISABLE_INTERRUPTS(CLBR_ANY)
++      TRACE_IRQS_OFF
++      jmp paranoid_userspace
++      CFI_ENDPROC
++END(paranoid_exit)
++#endif
++
++/*
++ * Exception entry point. This expects an error code/orig_rax on the stack.
++ * returns in "no swapgs flag" in %ebx.
++ */
++ENTRY(error_entry)
++      XCPT_FRAME start=2 offset=ORIG_RAX-R15+8
++      /* oldrax contains error code */
++      cld
++      movq %rdi, RDI+8(%rsp)
++      movq %rsi, RSI+8(%rsp)
++      movq %rdx, RDX+8(%rsp)
++      movq %rcx, RCX+8(%rsp)
++      movq %rax, RAX+8(%rsp)
++      movq  %r8,  R8+8(%rsp)
++      movq  %r9,  R9+8(%rsp)
++      movq %r10, R10+8(%rsp)
++      movq %r11, R11+8(%rsp)
++      movq_cfi rbx, RBX+8
++      movq %rbp, RBP+8(%rsp)
++      movq %r12, R12+8(%rsp)
++      movq %r13, R13+8(%rsp)
++      movq %r14, R14+8(%rsp)
++      movq %r15, R15+8(%rsp)
++#ifndef CONFIG_XEN
++      xorl %ebx,%ebx
++      testl $3,CS+8(%rsp)
++      je error_kernelspace
++error_swapgs:
++      SWAPGS
++error_sti:
++#endif
++      TRACE_IRQS_OFF
++      ret
++
++#ifndef CONFIG_XEN
++/*
++ * There are two places in the kernel that can potentially fault with
++ * usergs. Handle them here. The exception handlers after iret run with
++ * kernel gs again, so don't set the user space flag. B stepping K8s
++ * sometimes report a truncated RIP for IRET exceptions returning to
++ * compat mode. Check for these here too.
++ */
++error_kernelspace:
++      CFI_REL_OFFSET rcx, RCX+8
++      incl %ebx
++      leaq irq_return(%rip),%rcx
++      cmpq %rcx,RIP+8(%rsp)
++      je error_swapgs
++      movl %ecx,%eax  /* zero extend */
++      cmpq %rax,RIP+8(%rsp)
++      je bstep_iret
++      cmpq $gs_change,RIP+8(%rsp)
++      je error_swapgs
++      jmp error_sti
++
++bstep_iret:
++      /* Fix truncated RIP */
++      movq %rcx,RIP+8(%rsp)
++      jmp error_swapgs
++#endif
++      CFI_ENDPROC
++END(error_entry)
++
++
++ENTRY(error_exit)
++      DEFAULT_FRAME
++      RESTORE_REST
++      DISABLE_INTERRUPTS(CLBR_NONE)
++      TRACE_IRQS_OFF
++      GET_THREAD_INFO(%rcx)
++      testb $3,CS-ARGOFFSET(%rsp)
++      jz retint_kernel
++      LOCKDEP_SYS_EXIT_IRQ
++      movl TI_flags(%rcx),%edx
++      movl $_TIF_WORK_MASK,%edi
++      andl %edi,%edx
++      jnz retint_careful
++      jmp retint_restore_args
++      CFI_ENDPROC
++END(error_exit)
++
++
++do_nmi_callback:
++      CFI_STARTPROC
++      addq $8, %rsp
++      CFI_ENDPROC
++      DEFAULT_FRAME
++      call do_nmi
++      orl  $NMI_MASK,EFLAGS(%rsp)
++      RESTORE_REST
++      DISABLE_INTERRUPTS(CLBR_NONE)
++      TRACE_IRQS_OFF
++      GET_THREAD_INFO(%rcx)
++      jmp  retint_restore_args
++      CFI_ENDPROC
++END(do_nmi_callback)
++
++
++#ifndef CONFIG_IA32_EMULATION
++ENTRY(ignore_sysret)
++      INTR_FRAME
++      popq_cfi %rcx
++      CFI_RESTORE rcx
++      popq_cfi %r11
++      CFI_RESTORE r11
++      mov $-ENOSYS,%eax
++      HYPERVISOR_IRET 0
++      CFI_ENDPROC
++END(ignore_sysret)
++#endif
++
++/*
++ * End of kprobes section
++ */
++      .popsection
diff --cc arch/x86/kernel/entry_64.S

index d4b9066,8a445a0..935aa1a
--- 1/arch/x86/kernel/entry_64.S
--- 2/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@@ -1224,41 -1217,7 +1224,41 @@@ ENTRY(call_softirq
         CFI_ENDPROC
   END(call_softirq)
   
- -#ifdef CONFIG_XEN
+ +#ifdef CONFIG_STACK_UNWIND
+ +ENTRY(arch_unwind_init_running)
+ +      CFI_STARTPROC
+ +      movq    %r15, R15(%rdi)
+ +      movq    %r14, R14(%rdi)
+ +      xchgq   %rsi, %rdx
+ +      movq    %r13, R13(%rdi)
+ +      movq    %r12, R12(%rdi)
+ +      xorl    %eax, %eax
+ +      movq    %rbp, RBP(%rdi)
+ +      movq    %rbx, RBX(%rdi)
+ +      movq    (%rsp), %r9
+ +      xchgq   %rdx, %rcx
+ +      movq    %rax, R11(%rdi)
+ +      movq    %rax, R10(%rdi)
+ +      movq    %rax, R9(%rdi)
+ +      movq    %rax, R8(%rdi)
+ +      movq    %rax, RAX(%rdi)
+ +      movq    %rax, RCX(%rdi)
+ +      movq    %rax, RDX(%rdi)
+ +      movq    %rax, RSI(%rdi)
+ +      movq    %rax, RDI(%rdi)
+ +      movq    %rax, ORIG_RAX(%rdi)
+ +      movq    %r9, RIP(%rdi)
+ +      leaq    8(%rsp), %r9
+ +      movq    $__KERNEL_CS, CS(%rdi)
+ +      movq    %rax, EFLAGS(%rdi)
+ +      movq    %r9, RSP(%rdi)
+ +      movq    $__KERNEL_DS, SS(%rdi)
+ +      jmpq    *%rcx
+ +      CFI_ENDPROC
+ +END(arch_unwind_init_running)
+ +#endif
+ +
- #ifdef CONFIG_XEN
++#ifdef CONFIG_PARAVIRT_XEN
   zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
   
   /*
@@@ -1358,7 -1317,7 +1358,7 @@@ END(xen_failsafe_callback
   apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
         xen_hvm_callback_vector xen_evtchn_do_upcall
   
--#endif /* CONFIG_XEN */
++#endif /* CONFIG_PARAVIRT_XEN */
   
   /*
    * Some functions should be protected against kprobes
@@@ -1368,7 -1327,7 +1368,7 @@@
   paranoidzeroentry_ist debug do_debug DEBUG_STACK
   paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
   paranoiderrorentry stack_segment do_stack_segment
--#ifdef CONFIG_XEN
++#ifdef CONFIG_PARAVIRT_XEN
   zeroentry xen_debug do_debug
   zeroentry xen_int3 do_int3
   errorentry xen_stack_segment do_stack_segment
diff --cc arch/x86/kernel/fixup.c

index 0000000,0000000..64cd323

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/fixup.c
@@@ -1,0 -1,0 +1,89 @@@
++/******************************************************************************
++ * fixup.c
++ * 
++ * Binary-rewriting of certain IA32 instructions, on notification by Xen.
++ * Used to avoid repeated slow emulation of common instructions used by the
++ * user-space TLS (Thread-Local Storage) libraries.
++ * 
++ * **** NOTE ****
++ *  Issues with the binary rewriting have caused it to be removed. Instead
++ *  we rely on Xen's emulator to boot the kernel, and then print a banner
++ *  message recommending that the user disables /lib/tls.
++ * 
++ * Copyright (c) 2004, K A Fraser
++ * 
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ * 
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ * 
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++ */
++
++#include <linux/init.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/kernel.h>
++#include <linux/delay.h>
++#include <linux/version.h>
++#include <asm/traps.h>
++
++#define DP(_f, _args...) pr_alert("  " _f "\n" , ## _args )
++
++dotraplinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
++{
++      static unsigned long printed;   /* bit 0 is set once the banner was shown */
++      char info[100];                 /* "<comm> (pid=<tgid>)" for the banner */
++      int i;
++
++      /* Ignore statically-linked init. */
++      if (current->tgid == 1)
++              return;
++      /* Switch the 4gb_segments_notify assist back off (fixup_init enabled it). */
++      VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable,
++                                VMASST_TYPE_4gb_segments_notify));
++
++      if (test_and_set_bit(0, &printed))      /* banner is printed only once */
++              return;
++
++      snprintf(info, sizeof(info), "%s (pid=%d)", current->comm, current->tgid);
++
++      DP("");
++      DP("***************************************************************");
++      DP("***************************************************************");
++      DP("** WARNING: Currently emulating unsupported memory accesses  **");
++      DP("**          in /lib/tls glibc libraries. The emulation is    **");
++      DP("**          slow. To ensure full performance you should      **");
++      DP("**          install a 'xen-friendly' (nosegneg) version of   **");
++      DP("**          the library, or disable tls support by executing **");
++      DP("**          the following as root:                           **");
++      DP("**          mv /lib/tls /lib/tls.disabled                    **");
++      DP("** Offending process: %-38.38s **", info);
++      DP("***************************************************************");
++      DP("***************************************************************");
++      DP("");
++
++      for (i = 5; i > 0; i--) {       /* five-step countdown, 1000 ms each */
++              touch_softlockup_watchdog();
++              printk("Pausing... %d", i);
++              mdelay(1000);
++              printk("\b\b\b\b\b\b\b\b\b\b\b\b");     /* rewind over the message */
++      }
++
++      printk("Continuing...\n\n");
++}
++
++static int __init fixup_init(void)
++{
++      WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
++                                   VMASST_TYPE_4gb_segments_notify));
++      return 0;       /* always succeeds; a non-zero hypercall result only warns */
++}
++__initcall(fixup_init);
diff --cc arch/x86/kernel/head-xen.c

index 0000000,0000000..c20a352

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/head-xen.c
@@@ -1,0 -1,0 +1,219 @@@
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/memblock.h>
++#include <linux/pci.h>
++
++#include <asm/setup.h>
++#ifndef CONFIG_XEN
++#include <asm/bios_ebda.h>
++
++#define BIOS_LOWMEM_KILOBYTES 0x413
++
++/*
++ * The BIOS places the EBDA/XBDA at the top of conventional
++ * memory, and usually decreases the reported amount of
++ * conventional memory (int 0x12) too. This also contains a
++ * workaround for Dell systems that neglect to reserve EBDA.
++ * The same workaround also avoids a problem with the AMD768MPX
++ * chipset: reserve a page before VGA to prevent PCI prefetch
++ * into it (errata #56). Usually the page is reserved anyways,
++ * unless you have no PS/2 mouse plugged in.
++ */
++void __init reserve_ebda_region(void)
++{
++      unsigned int lowmem, ebda_addr;
++
++      /* To determine the position of the EBDA and the */
++      /* end of conventional memory, we need to look at */
++      /* the BIOS data area. In a paravirtual environment */
++      /* that area is absent. We'll just have to assume */
++      /* that the paravirt case can handle memory setup */
++      /* correctly, without our help. */
++      if (paravirt_enabled())
++              return;
++
++      /* end of low (conventional) memory */
++      lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
++      lowmem <<= 10;
++
++      /* start of EBDA area */
++      ebda_addr = get_bios_ebda();
++
++      /* Fixup: bios puts an EBDA in the top 64K segment */
++      /* of conventional memory, but does not adjust lowmem. */
++      if ((lowmem - ebda_addr) <= 0x10000)
++              lowmem = ebda_addr;
++
++      /* Fixup: bios does not report an EBDA at all. */
++      /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
++      if ((ebda_addr == 0) && (lowmem >= 0x9f000))
++              lowmem = 0x9f000;
++
++      /* Paranoia: should never happen, but... */
++      if ((lowmem == 0) || (lowmem >= 0x100000))
++              lowmem = 0x9f000;
++
++      /* reserve all memory between lowmem and the 1MB mark */
++      memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
++}
++#else /* CONFIG_XEN */
++#include <linux/module.h>
++#include <asm/fixmap.h>
++#include <asm/pgtable.h>
++#include <asm/sections.h>
++#include <xen/interface/callback.h>
++#include <xen/interface/memory.h>
++
++extern void hypervisor_callback(void);
++extern void failsafe_callback(void);
++extern void nmi(void);
++
++#ifdef CONFIG_X86_64
++#include <asm/proto.h>
++#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
++#else
++#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
++#endif
++
++unsigned long __initdata xen_initrd_start;
++
++unsigned long *__read_mostly machine_to_phys_mapping =
++      (void *)MACH2PHYS_VIRT_START;
++EXPORT_SYMBOL(machine_to_phys_mapping);
++unsigned int __read_mostly machine_to_phys_order;
++EXPORT_SYMBOL(machine_to_phys_order);
++
++void __init xen_start_kernel(void)
++{
++      unsigned int i;
++      struct xen_machphys_mapping mapping;
++      unsigned long machine_to_phys_nr_ents;
++
++      xen_setup_features();
++
++      if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
++              machine_to_phys_mapping = (unsigned long *)mapping.v_start;
++              machine_to_phys_nr_ents = mapping.max_mfn + 1;
++      } else
++              machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
++      while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
++              machine_to_phys_order++;
++
++      if (!xen_feature(XENFEAT_auto_translated_physmap))
++              phys_to_machine_mapping =
++                      (unsigned long *)xen_start_info->mfn_list;
++
++      WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
++                                   VMASST_TYPE_writable_pagetables));
++
++      memblock_init();
++      memblock_x86_reserve_range(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
++                                 __pa(xen_start_info->pt_base)
++                                 + (xen_start_info->nr_pt_frames
++                                    << PAGE_SHIFT),
++                                 "Xen provided");
++
++#ifdef CONFIG_X86_32
++{
++      extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
++      unsigned long addr;
++
++      /* Do an early initialization of the fixmap area */
++      make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
++      addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
++      set_pmd(pmd_offset(pud_offset(swapper_pg_dir + pgd_index(addr),
++                                    addr),
++                         addr),
++              __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
++}
++#else
++      x86_configure_nx();
++      xen_init_pt();
++#endif
++
++#define __FIXADDR_TOP (-PAGE_SIZE)
++#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
++#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
++                      != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
++      FIX_BUG_ON(SHARED_INFO);
++      FIX_BUG_ON(ISAMAP_BEGIN);
++      FIX_BUG_ON(ISAMAP_END);
++#undef pmd_index
++#undef __FIXADDR_TOP
++
++      /* Switch to the real shared_info page, and clear the dummy page. */
++      set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
++      HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
++      clear_page(empty_zero_page);
++
++      setup_vcpu_info(0);
++
++      /* Set up mapping of lowest 1MB of physical memory. */
++      for (i = 0; i < NR_FIX_ISAMAPS; i++)
++              if (is_initial_xendomain())
++                      set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
++              else
++                      __set_fixmap(FIX_ISAMAP_BEGIN - i,
++                                   virt_to_machine(empty_zero_page),
++                                   PAGE_KERNEL_RO);
++
++      if (is_initial_xendomain())
++              pci_request_acs();
++}
++
++void __init xen_arch_setup(void)
++{
++      int ret;
++      static const struct callback_register __initconst event = {
++              .type = CALLBACKTYPE_event,
++              .address = CALLBACK_ADDR(hypervisor_callback)
++      };
++      static const struct callback_register __initconst failsafe = {
++              .type = CALLBACKTYPE_failsafe,
++              .address = CALLBACK_ADDR(failsafe_callback)
++      };
++#ifdef CONFIG_X86_64
++      static const struct callback_register __initconst syscall = {
++              .type = CALLBACKTYPE_syscall,
++              .address = CALLBACK_ADDR(system_call)
++      };
++#endif
++      static const struct callback_register __initconst nmi_cb = {
++              .type = CALLBACKTYPE_nmi,
++              .address = CALLBACK_ADDR(nmi)
++      };
++
++      ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
++      if (ret == 0)
++              ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
++#ifdef CONFIG_X86_64
++      if (ret == 0)
++              ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
++#endif
++#if CONFIG_XEN_COMPAT <= 0x030002
++#ifdef CONFIG_X86_32
++      if (ret == -ENOSYS)
++              ret = HYPERVISOR_set_callbacks(
++                      event.address.cs, event.address.eip,
++                      failsafe.address.cs, failsafe.address.eip);
++#else
++              ret = HYPERVISOR_set_callbacks(
++                      event.address,
++                      failsafe.address,
++                      syscall.address);
++#endif
++#endif
++      BUG_ON(ret);
++
++      ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (ret == -ENOSYS) {
++              static struct xennmi_callback __initdata cb = {
++                      .handler_address = (unsigned long)nmi
++              };
++
++              HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
++      }
++#endif
++}
++#endif /* CONFIG_XEN */
diff --cc arch/x86/kernel/head32-xen.c

index 0000000,0000000..4e8f127

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/head32-xen.c
@@@ -1,0 -1,0 +1,106 @@@
++/*
++ *  linux/arch/i386/kernel/head32.c -- prepare to run common code
++ *
++ *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
++ *  Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
++ */
++
++#include <linux/init.h>
++#include <linux/start_kernel.h>
++#include <linux/mm.h>
++#include <linux/memblock.h>
++
++#include <asm/setup.h>
++#include <asm/sections.h>
++#include <asm/e820.h>
++#include <asm/trampoline.h>
++#include <asm/apic.h>
++#include <asm/io_apic.h>
++#include <asm/tlbflush.h>
++
++static void __init i386_default_early_setup(void)
++{
++      /* Initialize 32bit specific setup functions */
++      if (is_initial_xendomain())     /* the ROM probing hook is installed only for the initial Xen domain */
++              x86_init.resources.probe_roms = probe_roms;
++      x86_init.resources.reserve_resources = i386_reserve_resources;
++#ifndef CONFIG_XEN /* the IO-APIC id hook and the EBDA reservation are left out of CONFIG_XEN builds */
++      x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
++
++      reserve_ebda_region();
++#endif
++}
++
++void __init i386_start_kernel(void)
++{
++#ifdef CONFIG_XEN
++      struct xen_platform_parameters pp;
++
++      WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
++                                   VMASST_TYPE_4gb_segments)); /* a failure to enable the assist is only warned about */
++
++      init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base; /* adopt the page tables named in start_info */
++
++      if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
++              hypervisor_virt_start = pp.virt_start;
++              reserve_top_address(0UL - pp.virt_start); /* distance from virt_start to the top of the address space */
++      }
++
++      BUG_ON(pte_index(hypervisor_virt_start)); /* hypervisor_virt_start must have PTE index 0 */
++#endif
++
++      memblock_init();
++
++      memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
++
++#ifndef CONFIG_XEN
++#ifdef CONFIG_BLK_DEV_INITRD
++      /* Reserve INITRD */
++      if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
++              /* Assume only end is not page aligned */
++              u64 ramdisk_image = boot_params.hdr.ramdisk_image;
++              u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
++              u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
++              memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
++      }
++#endif
++
++      /* Call the subarch specific early setup function */
++      switch (boot_params.hdr.hardware_subarch) {
++      case X86_SUBARCH_MRST:
++              x86_mrst_early_setup();
++              break;
++      case X86_SUBARCH_CE4100:
++              x86_ce4100_early_setup();
++              break;
++      default:
++              i386_default_early_setup();
++              break;
++      }
++#else
++#ifdef CONFIG_BLK_DEV_INITRD
++      BUG_ON(xen_start_info->flags & SIF_MOD_START_PFN); /* a module described by SIF_MOD_START_PFN is not handled on this path */
++      if (xen_start_info->mod_start)
++              xen_initrd_start = __pa(xen_start_info->mod_start);
++#endif
++      {
++              int max_cmdline;
++
++              if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) /* copy at most the smaller of the two sizes */
++                      max_cmdline = COMMAND_LINE_SIZE;
++              memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
++              boot_command_line[max_cmdline-1] = '\0'; /* force NUL termination of the copied command line */
++      }
++
++      i386_default_early_setup();
++      xen_start_kernel();
++#endif
++
++      /*
++       * At this point everything still needed from the boot loader
++       * or BIOS or kernel text should be early reserved or marked not
++       * RAM in e820. All other memory is free game.
++       */
++
++      start_kernel();
++}
diff --cc arch/x86/kernel/head64-xen.c

index 0000000,0000000..04451f6

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/head64-xen.c
@@@ -1,0 -1,0 +1,149 @@@
++/*
++ *  prepare to run common code
++ *
++ *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
++ *
++ *  Jun Nakajima <jun.nakajima@intel.com>
++ *    Modified for Xen.
++ */
++
++#include <linux/init.h>
++#include <linux/linkage.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/string.h>
++#include <linux/percpu.h>
++#include <linux/start_kernel.h>
++#include <linux/io.h>
++#include <linux/memblock.h>
++
++#include <asm/processor.h>
++#include <asm/proto.h>
++#include <asm/smp.h>
++#include <asm/setup.h>
++#include <asm/desc.h>
++#include <asm/pgtable.h>
++#include <asm/tlbflush.h>
++#include <asm/sections.h>
++#include <asm/kdebug.h>
++#include <asm/e820.h>
++#include <asm/trampoline.h>
++#include <asm/bios_ebda.h>
++
++#ifndef CONFIG_XEN
++static void __init zap_identity_mappings(void)
++{
++      /* Clear the kernel top-level page-table entry that covers address 0 ... */
++      pgd_clear(pgd_offset_k(0UL));
++      __flush_tlb_all();      /* ... and flush the TLB afterwards. */
++}
++
++/* Zero the BSS.  No printk in here: it relies on the PDA, which is not initialized yet. */
++static void __init clear_bss(void)
++{
++      unsigned long bss_len = (unsigned long) __bss_stop - (unsigned long) __bss_start;
++
++      memset(__bss_start, 0, bss_len);
++}
++#endif
++
++static void __init copy_bootdata(char *real_mode_data)
++{
++#ifndef CONFIG_XEN
++      /* Take boot_params from the real-mode data, then the command line it points at. */
++      memcpy(&boot_params, real_mode_data, sizeof(boot_params));
++      if (boot_params.hdr.cmd_line_ptr)
++              memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
++                     COMMAND_LINE_SIZE);
++#else
++      /* Copy the start_info command line, bounded by the smaller buffer size ... */
++      int limit = MAX_GUEST_CMDLINE;
++
++      if (limit > COMMAND_LINE_SIZE)
++              limit = COMMAND_LINE_SIZE;
++      memcpy(boot_command_line, xen_start_info->cmd_line, limit);
++      /* ... and always leave the result NUL-terminated. */
++      boot_command_line[limit - 1] = '\0';
++#endif
++}
++
++#include <xen/interface/memory.h>
++
++void __init x86_64_start_kernel(char * real_mode_data)
++{
++      /*
++       * Build-time sanity checks on the kernel image and module
++       * area mappings. (these are purely build-time and produce no code)
++       */
++      BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
++      BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
++      BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
++      BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
++      BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
++      BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
++      BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
++                              (__START_KERNEL & PGDIR_MASK)));
++      BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
++
++      xen_start_info = (struct start_info *)real_mode_data; /* the argument is taken as the start_info pointer */
++      xen_start_kernel();
++
++#ifndef CONFIG_XEN /* NOTE(review): the loop below uses i, which is not declared in this function; this branch is compiled out when CONFIG_XEN is set */
++      /* clear bss before set_intr_gate with early_idt_handler */
++      clear_bss();
++
++      /* Make NULL pointers segfault */
++      zap_identity_mappings();
++
++      for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
++#ifdef CONFIG_EARLY_PRINTK
++              set_intr_gate(i, &early_idt_handlers[i]);
++#else
++              set_intr_gate(i, early_idt_handler);
++#endif
++      }
++      load_idt((const struct desc_ptr *)&idt_descr);
++#endif
++
++      if (console_loglevel == 10) /* printed only at exactly this log level */
++              early_printk("Kernel alive\n");
++
++      xen_switch_pt();
++
++      x86_64_start_reservations(real_mode_data);
++}
++
++void __init x86_64_start_reservations(char *real_mode_data)
++{
++      copy_bootdata(__va(real_mode_data)); /* copy_bootdata() does not read its argument when CONFIG_XEN is set */
++
++      memblock_init();
++
++      memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
++
++#ifdef CONFIG_BLK_DEV_INITRD
++      /* Reserve INITRD if needed. */
++      if (xen_start_info->flags & SIF_MOD_START_PFN) { /* flag set: mod_start is used as a page frame number */
++              reserve_pfn_range(xen_start_info->mod_start,
++                                PFN_UP(xen_start_info->mod_len),
++                                "RAMDISK");
++              xen_initrd_start = xen_start_info->mod_start << PAGE_SHIFT;
++      } else if (xen_start_info->mod_start) /* flag clear: mod_start is converted with __pa() */
++              xen_initrd_start = __pa(xen_start_info->mod_start);
++#endif
++
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              xen_start_info->mfn_list = ~0UL; /* overwrite mfn_list with all ones in auto-translated mode */
++      else if (xen_start_info->mfn_list < __START_KERNEL_map) /* list located below the kernel mapping: reserve its frames */
++              reserve_pfn_range(xen_start_info->first_p2m_pfn,
++                                xen_start_info->nr_p2m_frames,
++                                "INITP2M");
++
++      /*
++       * At this point everything still needed from the boot loader
++       * or BIOS or kernel text should be early reserved or marked not
++       * RAM in e820. All other memory is free game.
++       */
++
++      start_kernel();
++}
diff --cc arch/x86/kernel/head_32-xen.S

index 0000000,0000000..d2f49a8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/head_32-xen.S
@@@ -1,0 -1,0 +1,196 @@@
++
++
++.text
++#include <linux/elfnote.h>
++#include <linux/threads.h>
++#include <linux/init.h>
++#include <linux/linkage.h>
++#include <asm/segment.h>
++#include <asm/page_types.h>
++#include <asm/pgtable_types.h>
++#include <asm/cache.h>
++#include <asm/thread_info.h>
++#include <asm/asm-offsets.h>
++#include <asm/boot.h>
++#include <asm/dwarf2.h>
++#include <asm/percpu.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/elfnote.h>
++
++/*
++ * References to members of the new_cpu_data structure.
++ */
++
++#define X86           new_cpu_data+CPUINFO_x86
++#define X86_VENDOR    new_cpu_data+CPUINFO_x86_vendor
++#define X86_MODEL     new_cpu_data+CPUINFO_x86_model
++#define X86_MASK      new_cpu_data+CPUINFO_x86_mask
++#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
++#define X86_CPUID     new_cpu_data+CPUINFO_cpuid_level
++#define X86_CAPABILITY        new_cpu_data+CPUINFO_x86_capability
++#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
++
++__HEAD
++#define VIRT_ENTRY_OFFSET 0x0
++.org VIRT_ENTRY_OFFSET
++ENTRY(startup_32)
++      movl %esi,xen_start_info        # save the start_info pointer held in %esi
++      cld
++
++      /* Set up the stack pointer */
++      movl $(init_thread_union+THREAD_SIZE),%esp
++
++      /* get vendor info */
++      xorl %eax,%eax                  # call CPUID with 0 -> return vendor ID
++      XEN_CPUID
++      movl %eax,X86_CPUID             # save CPUID level
++      movl %ebx,X86_VENDOR_ID         # lo 4 chars
++      movl %edx,X86_VENDOR_ID+4       # next 4 chars
++      movl %ecx,X86_VENDOR_ID+8       # last 4 chars
++
++      movl $1,%eax            # Use the CPUID instruction to get CPU type
++      XEN_CPUID
++      movb %al,%cl            # save reg for future use
++      andb $0x0f,%ah          # mask processor family
++      movb %ah,X86
++      andb $0xf0,%al          # mask model
++      shrb $4,%al
++      movb %al,X86_MODEL
++      andb $0x0f,%cl          # mask revision (low nibble of the %al value saved above)
++      movb %cl,X86_MASK
++      movl %edx,X86_CAPABILITY
++
++#ifdef CONFIG_CC_STACKPROTECTOR
++      /*
++       * The linker can't handle this by relocation.  Manually set
++       * base address in stack canary segment descriptor.
++       */
++      movl $gdt_page,%eax
++      movl $stack_canary,%ecx
++      movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
++      shrl $16, %ecx
++      movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
++      movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
++#endif
++
++      # %esi still points to start_info, and no registers
++      # need to be preserved.
++
++      movl XEN_START_mfn_list(%esi), %ebx     # %ebx = value at offset XEN_START_mfn_list of start_info
++      movl $(gdt_page - __PAGE_OFFSET), %eax
++      shrl $PAGE_SHIFT, %eax                  # %eax = page frame index of gdt_page
++      movl (%ebx,%eax,4), %ecx                # %ecx = 4-byte table entry at that index
++      pushl %ecx                      # frame number for set_gdt below
++
++      xorl %esi, %esi                 # %esi no longer points to start_info from here on
++      xorl %edx, %edx
++      shldl $PAGE_SHIFT, %ecx, %edx   # %edx:%ecx = frame number << PAGE_SHIFT (64-bit result)
++      shll $PAGE_SHIFT, %ecx
++      orl $_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY, %ecx
++      movl $gdt_page, %ebx
++      movl $__HYPERVISOR_update_va_mapping, %eax
++      int $0x82                       # NOTE(review): presumably the hypercall trap, call number in %eax -- confirm
++
++      movl $(PAGE_SIZE / 8), %ecx     # count of 8-byte entries in one page
++      movl %esp, %ebx                 # %ebx -> the frame number pushed above
++      movl $__HYPERVISOR_set_gdt, %eax
++      int $0x82
++
++      popl %ecx                       # drop the pushed frame number
++
++      movl $(__KERNEL_PERCPU), %eax
++      movl %eax,%fs                   # set this cpu's percpu
++
++      movl $(__KERNEL_STACK_CANARY),%eax
++      movl %eax,%gs
++
++      cld                     # gcc2 wants the direction flag cleared at all times
++
++      pushl $0                # fake return address for unwinder
++      jmp i386_start_kernel
++
++#define HYPERCALL_PAGE_OFFSET 0x1000 /* also used for the HYPERCALL_PAGE value in the __xen_guest string */
++.org HYPERCALL_PAGE_OFFSET
++ENTRY(hypercall_page)
++      CFI_STARTPROC
++.skip 0x1000 /* reserve 0x1000 bytes; no contents are assembled here */
++      CFI_ENDPROC
++
++/*
++ * BSS section
++ */
++__PAGE_ALIGNED_BSS
++      .align PAGE_SIZE
++ENTRY(swapper_pg_fixmap)
++      .fill 1024,4,0
++ENTRY(empty_zero_page)
++      .fill 4096,1,0
++
++/*
++ * This starts the data section.
++ */
++.data
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++/*
++ * __xen_guest information; utoa emits \value as upper-case hex ASCII digits, highest digit first
++ */
++.macro utoa value
++ .if (\value) < 0 || (\value) >= 0x10 /* more than one digit: recurse on the value shifted right by 4, masked to 28 bits */
++      utoa (((\value)>>4)&0x0fffffff)
++ .endif
++ .if ((\value) & 0xf) < 10 /* then emit the lowest nibble as 0-9 or A-F */
++  .byte '0' + ((\value) & 0xf)
++ .else
++  .byte 'A' + ((\value) & 0xf) - 10
++ .endif
++.endm
++
++.section __xen_guest
++      .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
++      .ascii  ",XEN_VER=xen-3.0"
++      .ascii  ",VIRT_BASE=0x"
++              utoa __PAGE_OFFSET
++      .ascii  ",ELF_PADDR_OFFSET=0x"
++              utoa __PAGE_OFFSET
++      .ascii  ",VIRT_ENTRY=0x"
++              utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET)
++      .ascii  ",HYPERCALL_PAGE=0x"
++              utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
++      .ascii  ",FEATURES=writable_page_tables"
++      .ascii           "|writable_descriptor_tables"
++      .ascii           "|auto_translated_physmap"
++      .ascii           "|pae_pgdir_above_4gb"
++      .ascii           "|supervisor_mode_kernel"
++#ifdef CONFIG_X86_PAE
++      .ascii  ",PAE=yes[extended-cr3]"
++#else
++      .ascii  ",PAE=no"
++#endif
++      .ascii  ",LOADER=generic"
++      .byte   0
++#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
++
++
++      ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
++      ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
++      ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
++      ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long __PAGE_OFFSET)
++#if CONFIG_XEN_COMPAT <= 0x030002
++      ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .long __PAGE_OFFSET)
++#else
++      ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .long 0)
++#endif
++      ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long startup_32)
++      ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
++      ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   .long HYPERVISOR_VIRT_START)
++      ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel")
++#ifdef CONFIG_X86_PAE
++      ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
++      ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .quad _PAGE_PRESENT, _PAGE_PRESENT)
++#else
++      ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "no")
++      ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .long _PAGE_PRESENT, _PAGE_PRESENT)
++#endif
++      ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
++      ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
diff --cc arch/x86/kernel/head_64-xen.S

index 0000000,0000000..daf9671

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/head_64-xen.S
@@@ -1,0 -1,0 +1,154 @@@
++/*
++ *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
++ *
++ *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
++ *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
++ *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
++ *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
++ *  Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
++ *  Jun Nakajima <jun.nakajima@intel.com>
++ *    Modified for Xen                                
++ */
++
++
++#include <linux/linkage.h>
++#include <linux/threads.h>
++#include <linux/init.h>
++#include <linux/elfnote.h>
++#include <asm/segment.h>
++#include <asm/page.h>
++#include <asm/pgtable.h>
++#include <asm/msr.h>
++#include <asm/cache.h>
++#include <asm/dwarf2.h>
++#include <asm/percpu.h>
++#include <xen/interface/elfnote.h>
++
++      __HEAD
++      .code64
++      .globl startup_64
++startup_64:
++      movq $(init_thread_union+THREAD_SIZE-8),%rsp    # initial stack pointer
++
++      /* rsi is pointer to startup info structure.
++         pass it to C */
++      movq %rsi,%rdi
++
++      /* Set up %gs.
++       *
++       * The base of %gs always points to the bottom of the irqstack
++       * union.  If the stack protector canary is enabled, it is
++       * located at %gs:40.  Note that, on SMP, the boot cpu uses
++       * init data section till per cpu areas are set up.
++       */
++      movl    $MSR_GS_BASE,%ecx
++      movq    $INIT_PER_CPU_VAR(irq_stack_union),%rax
++      movq    %rax,%rdx
++      shrq    $32,%rdx        # %edx = upper 32 bits of the address, %eax = lower 32 bits
++      wrmsr                   # write %edx:%eax to the MSR selected by %ecx
++
++      pushq $0                # fake return address
++      jmp x86_64_start_kernel
++
++#define NEXT_PAGE(name) \
++      .balign PAGE_SIZE; \
++ENTRY(name)
++
++      __PAGE_ALIGNED_BSS
++NEXT_PAGE(init_level4_pgt)
++      .fill   512,8,0
++
++NEXT_PAGE(level3_kernel_pgt)
++      .fill   512,8,0
++
++        /*
++         * This is used for vsyscall area mapping as we have a different
++         * level4 page table for user.
++         */
++NEXT_PAGE(level3_user_pgt)
++        .fill 512,8,0
++
++NEXT_PAGE(level2_fixmap_pgt)
++      .fill   512,8,0
++
++NEXT_PAGE(level1_fixmap_pgt)
++      .fill   512,8,0
++
++      .previous
++NEXT_PAGE(hypercall_page)
++      phys_hypercall_page = . - .head.text /* offset of this page from the start of .head.text */
++      CFI_STARTPROC
++      .rept 0x1000 / 0x20 /* one 0x20-byte slot per stub; only space and unwind info are emitted */
++      .skip 1 /* push %rcx */
++      CFI_ADJUST_CFA_OFFSET   8
++      CFI_REL_OFFSET  rcx,0
++      .skip 2 /* push %r11 */
++      CFI_ADJUST_CFA_OFFSET   8
++      CFI_REL_OFFSET  r11,0 /* the register pushed here is %r11; pairs with CFI_RESTORE r11 below */
++      .skip 5 /* mov $#,%eax */
++      .skip 2 /* syscall */
++      .skip 2 /* pop %r11 */
++      CFI_ADJUST_CFA_OFFSET -8
++      CFI_RESTORE r11
++      .skip 1 /* pop %rcx */
++      CFI_ADJUST_CFA_OFFSET -8
++      CFI_RESTORE rcx
++      .align 0x20,0 /* ret */
++      .endr
++      CFI_ENDPROC
++
++#undef NEXT_PAGE
++
++      __PAGE_ALIGNED_BSS
++      .align PAGE_SIZE
++ENTRY(empty_zero_page)
++      .skip PAGE_SIZE
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++/*
++ * __xen_guest information; utoh emits \value as exactly 16 upper-case hex ASCII digits, highest nibble first
++ */
++.macro utoh value
++ i = 64
++ .rept 16
++  i = i - 4 /* bit position of the nibble emitted in this round: 60, 56, ..., 0 */
++  .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf) /* relies on the assembler evaluating a true comparison to -1 */
++ .endr
++.endm
++
++.section __xen_guest
++      .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
++      .ascii  ",XEN_VER=xen-3.0"
++      .ascii  ",VIRT_BASE=0x"
++              utoh __START_KERNEL_map
++      .ascii  ",ELF_PADDR_OFFSET=0x"
++              utoh __START_KERNEL_map
++      .ascii  ",VIRT_ENTRY=0x"
++              utoh (__START_KERNEL_map + __PHYSICAL_START)
++      .ascii  ",HYPERCALL_PAGE=0x"
++              utoh (phys_hypercall_page >> PAGE_SHIFT)
++      .ascii  ",FEATURES=writable_page_tables"
++      .ascii           "|writable_descriptor_tables"
++      .ascii           "|auto_translated_physmap"
++      .ascii           "|supervisor_mode_kernel"
++      .ascii  ",LOADER=generic"
++      .byte   0
++#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
++      
++      ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
++      ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
++      ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
++      ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .quad __START_KERNEL_map)
++#if CONFIG_XEN_COMPAT <= 0x030002
++      ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .quad __START_KERNEL_map)
++#else
++      ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .quad 0)
++#endif
++      ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad startup_64)
++      ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
++      ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .quad _PAGE_PRESENT, _PAGE_PRESENT)
++      ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN,  .long 1)
++      ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad VMEMMAP_START)
++      ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel")
++      ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
++      ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
diff --cc arch/x86/kernel/init_task.c

index 43e9ccf,43e9ccf..c50f863
--- 1/arch/x86/kernel/init_task.c
--- 2/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@@ -31,6 -31,6 +31,7 @@@ union thread_union init_thread_union __
   struct task_struct init_task = INIT_TASK(init_task);
   EXPORT_SYMBOL(init_task);
   
++#ifndef CONFIG_X86_NO_TSS
   /*
    * per-CPU TSS segments. Threads are completely 'soft' on Linux,
    * no more per-task TSS's. The TSS size is kept cacheline-aligned
@@@ -39,4 -39,4 +40,4 @@@
    * on exact cacheline boundaries, to eliminate cacheline ping-pong.
    */
   DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
--
++#endif
diff --cc arch/x86/kernel/ioport-xen.c

index 0000000,0000000..3cb400e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/ioport-xen.c
@@@ -1,0 -1,0 +1,84 @@@
++/*
++ * This contains the io-permission bitmap code - written by obz, with changes
++ * by Linus. 32/64 bits code unification by Miguel Botón.
++ */
++
++#include <linux/sched.h>
++#include <linux/kernel.h>
++#include <linux/capability.h>
++#include <linux/errno.h>
++#include <linux/types.h>
++#include <linux/ioport.h>
++#include <linux/smp.h>
++#include <linux/stddef.h>
++#include <linux/slab.h>
++#include <linux/thread_info.h>
++#include <linux/syscalls.h>
++#include <linux/bitmap.h>
++#include <asm/syscalls.h>
++#include <xen/interface/physdev.h>
++
++/*
++ * Change ports [from, from + num) in the current task's io permissions bitmap.
++ */
++asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
++{
++      struct thread_struct *t = &current->thread;
++      struct physdev_set_iobitmap set_iobitmap;
++
++      if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) /* empty or wrapping range, or one reaching past the bitmap */
++              return -EINVAL;
++      if (turn_on && !capable(CAP_SYS_RAWIO)) /* only granting access needs the capability */
++              return -EPERM;
++
++      /*
++       * If it's the first ioperm() call in this thread's lifetime, set the
++       * IO bitmap up. ioperm() is much less timing critical than clone(),
++       * this is why we delay this operation until now:
++       */
++      if (!t->io_bitmap_ptr) {
++              unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
++
++              if (!bitmap)
++                      return -ENOMEM;
++
++              memset(bitmap, 0xff, IO_BITMAP_BYTES); /* start with every bit set; granting a port clears its bit below */
++              t->io_bitmap_ptr = bitmap;
++              set_thread_flag(TIF_IO_BITMAP);
++
++              set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
++              set_iobitmap.nr_ports = IO_BITMAP_BITS;
++              WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
++                                            &set_iobitmap)); /* a failing hypercall is only warned about */
++      }
++
++      if (turn_on)
++              bitmap_clear(t->io_bitmap_ptr, from, num);
++      else
++              bitmap_set(t->io_bitmap_ptr, from, num);
++
++      return 0;
++}
++
++/*
++ * Set the calling thread's I/O privilege level.  Bitmapping all 65536
++ * ports would need 8kB of bitmaps per process, so sys_iopl is what has
++ * to be used to reach the IO ports beyond the 0x3ff range.
++ */
++long sys_iopl(unsigned int level, struct pt_regs *regs)
++{
++      struct thread_struct *thread = &current->thread;
++      unsigned int cur_level = thread->iopl >> 12;
++
++      if (level > 3)
++              return -EINVAL;
++
++      /* Raising the level above the current one needs CAP_SYS_RAWIO. */
++      if (level > cur_level && !capable(CAP_SYS_RAWIO))
++              return -EPERM;
++
++      thread->iopl = level << 12;
++      set_iopl_mask(thread->iopl);
++
++      return 0;
++}
diff --cc arch/x86/kernel/irq-xen.c

index 0000000,0000000..22636cf

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/irq-xen.c
@@@ -1,0 -1,0 +1,340 @@@
++/*
++ * Common interrupt code for 32 and 64 bit
++ */
++#include <linux/cpu.h>
++#include <linux/interrupt.h>
++#include <linux/kernel_stat.h>
++#include <linux/of.h>
++#include <linux/seq_file.h>
++#include <linux/smp.h>
++#include <linux/ftrace.h>
++#include <linux/delay.h>
++
++#include <asm/apic.h>
++#include <asm/io_apic.h>
++#include <asm/irq.h>
++#include <asm/idle.h>
++#include <asm/mce.h>
++#include <asm/hw_irq.h>
++
++#ifndef CONFIG_XEN
++atomic_t irq_err_count;
++
++/* Function pointer for generic interrupt vector handling */
++void (*x86_platform_ipi_callback)(void) = NULL;
++#endif
++
++/*
++ * 'what should we do if we get a hw irq event on an illegal vector'.
++ * each architecture has to answer this themselves.
++ */
++void ack_bad_irq(unsigned int irq)
++{
++      if (printk_ratelimit()) /* the report is rate-limited */
++              pr_err("unexpected IRQ trap at vector %02x\n", irq);
++
++#ifndef CONFIG_XEN /* the APIC acknowledge is left out of CONFIG_XEN builds */
++      /*
++       * Currently unexpected vectors happen only on SMP and APIC.
++       * We _must_ ack these because every local APIC has only N
++       * irq slots per priority level, and a 'hanging, unacked' IRQ
++       * holds up an irq slot - in excessive cases (when multiple
++       * unexpected vectors occur) that might lock up the APIC
++       * completely.
++       * But only ack when the APIC is enabled -AK
++       */
++      ack_APIC_irq();
++#endif
++}
++
++#define irq_stats(x)          (&per_cpu(irq_stat, x))
++/*
++ * /proc/interrupts printing for arch specific interrupts: one row per counter, one column per online CPU; returns 0
++ */
++int arch_show_interrupts(struct seq_file *p, int prec)
++{
++      int j;
++
++      seq_printf(p, "%*s: ", prec, "NMI"); /* prec is the field width of the row label */
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
++      seq_printf(p, "  Non-maskable interrupts\n");
++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) /* local APIC rows are not printed under CONFIG_XEN */
++      seq_printf(p, "%*s: ", prec, "LOC");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
++      seq_printf(p, "  Local timer interrupts\n");
++
++      seq_printf(p, "%*s: ", prec, "SPU");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
++      seq_printf(p, "  Spurious interrupts\n");
++      seq_printf(p, "%*s: ", prec, "PMI");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
++      seq_printf(p, "  Performance monitoring interrupts\n");
++      seq_printf(p, "%*s: ", prec, "IWI");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
++      seq_printf(p, "  IRQ work interrupts\n");
++#endif
++#ifndef CONFIG_XEN
++      if (x86_platform_ipi_callback) { /* row shown only while a callback is registered */
++              seq_printf(p, "%*s: ", prec, "PLT");
++              for_each_online_cpu(j)
++                      seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
++              seq_printf(p, "  Platform interrupts\n");
++      }
++#endif
++#ifdef CONFIG_SMP
++      seq_printf(p, "%*s: ", prec, "RES");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
++      seq_printf(p, "  Rescheduling interrupts\n");
++      seq_printf(p, "%*s: ", prec, "CAL");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
++      seq_printf(p, "  Function call interrupts\n");
++#ifndef CONFIG_XEN
++      seq_printf(p, "%*s: ", prec, "TLB");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
++      seq_printf(p, "  TLB shootdowns\n");
++#else /* CONFIG_XEN: a spinlock wakeup row is printed in place of the TLB shootdown row */
++      seq_printf(p, "%*s: ", prec, "LCK");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->irq_lock_count);
++      seq_printf(p, "  Spinlock wakeups\n");
++#endif
++#endif
++#ifdef CONFIG_X86_THERMAL_VECTOR
++      seq_printf(p, "%*s: ", prec, "TRM");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
++      seq_printf(p, "  Thermal event interrupts\n");
++#endif
++#ifdef CONFIG_X86_MCE_THRESHOLD
++      seq_printf(p, "%*s: ", prec, "THR");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
++      seq_printf(p, "  Threshold APIC interrupts\n");
++#endif
++#ifdef CONFIG_X86_MCE
++      seq_printf(p, "%*s: ", prec, "MCE");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
++      seq_printf(p, "  Machine check exceptions\n");
++      seq_printf(p, "%*s: ", prec, "MCP");
++      for_each_online_cpu(j)
++              seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
++      seq_printf(p, "  Machine check polls\n");
++#endif
++#ifndef CONFIG_XEN /* the global ERR and MIS totals are printed only without CONFIG_XEN */
++      seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
++#if defined(CONFIG_X86_IO_APIC)
++      seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
++#endif
++#endif
++      return 0;
++}
++
++/*
++ * /proc/stat helpers: arch_irq_stat_cpu() returns the sum of the per-CPU counters of one CPU
++ */
++u64 arch_irq_stat_cpu(unsigned int cpu)
++{
++      u64 sum = irq_stats(cpu)->__nmi_count;
++
++#ifdef CONFIG_X86_LOCAL_APIC /* NOTE(review): arch_show_interrupts() additionally requires !CONFIG_XEN for these counters -- confirm the difference is intended */
++      sum += irq_stats(cpu)->apic_timer_irqs;
++      sum += irq_stats(cpu)->irq_spurious_count;
++      sum += irq_stats(cpu)->apic_perf_irqs;
++      sum += irq_stats(cpu)->apic_irq_work_irqs;
++#endif
++#ifndef CONFIG_XEN
++      if (x86_platform_ipi_callback) /* counted only while a callback is registered */
++              sum += irq_stats(cpu)->x86_platform_ipis;
++#endif
++#ifdef CONFIG_SMP
++      sum += irq_stats(cpu)->irq_resched_count;
++      sum += irq_stats(cpu)->irq_call_count;
++#ifndef CONFIG_XEN
++      sum += irq_stats(cpu)->irq_tlb_count;
++#else /* CONFIG_XEN: the spinlock wakeup count is added in place of the TLB shootdown count */
++      sum += irq_stats(cpu)->irq_lock_count;
++#endif
++#endif
++#ifdef CONFIG_X86_THERMAL_VECTOR
++      sum += irq_stats(cpu)->irq_thermal_count;
++#endif
++#ifdef CONFIG_X86_MCE_THRESHOLD
++      sum += irq_stats(cpu)->irq_threshold_count;
++#endif
++#ifdef CONFIG_X86_MCE
++      sum += per_cpu(mce_exception_count, cpu);
++      sum += per_cpu(mce_poll_count, cpu);
++#endif
++      return sum;
++}
++
++u64 arch_irq_stat(void)
++{
++      /* Total of the global error counters; a CONFIG_XEN build has none to add. */
++      u64 total = 0;
++
++#ifndef CONFIG_XEN
++      total += atomic_read(&irq_err_count);
++#ifdef CONFIG_X86_IO_APIC
++      total += atomic_read(&irq_mis_count);
++#endif
++#endif
++      return total;
++}
++
++
++#ifndef CONFIG_XEN
++/*
++ * do_IRQ handles all normal device IRQ's (the special
++ * SMP cross-CPU interrupts have their own specific
++ * handlers).
++ */
++unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
++{
++      struct pt_regs *old_regs = set_irq_regs(regs); /* remembered so it can be restored on exit */
++
++      /* high bit used in ret_from_ code  */
++      unsigned vector = ~regs->orig_ax; /* the vector is the bitwise complement of orig_ax */
++      unsigned irq;
++
++      exit_idle();
++      irq_enter();
++
++      irq = __this_cpu_read(vector_irq[vector]); /* this CPU's vector -> irq table */
++
++      if (!handle_irq(irq, regs)) { /* not handled: acknowledge and report, rate-limited */
++              ack_APIC_irq();
++
++              if (printk_ratelimit())
++                      pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
++                              __func__, smp_processor_id(), vector, irq);
++      }
++
++      irq_exit();
++
++      set_irq_regs(old_regs);
++      return 1; /* the return value is 1 on every path */
++}
++
++/*
++ * Handler for X86_PLATFORM_IPI_VECTOR.
++ */
++void smp_x86_platform_ipi(struct pt_regs *regs)
++{
++      struct pt_regs *old_regs = set_irq_regs(regs); /* remembered so it can be restored on exit */
++
++      ack_APIC_irq();
++
++      exit_idle();
++
++      irq_enter();
++
++      inc_irq_stat(x86_platform_ipis);
++
++      if (x86_platform_ipi_callback) /* the callback pointer may be NULL; the event is counted either way */
++              x86_platform_ipi_callback();
++
++      irq_exit();
++
++      set_irq_regs(old_regs);
++}
++#endif
++
++#ifdef CONFIG_HOTPLUG_CPU
++#include <xen/evtchn.h>
++/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
++void fixup_irqs(void)
++{
++      unsigned int irq;
++      static int warned; /* static: keeps its value across calls */
++      struct irq_desc *desc;
++      struct irq_data *data;
++      struct irq_chip *chip;
++      static DECLARE_BITMAP(irqs_used, NR_IRQS); /* irqs whose affinity included this cpu; consumed by the second loop */
++
++      for_each_irq_desc(irq, desc) {
++              int break_affinity = 0;
++              int set_affinity = 1;
++              const struct cpumask *affinity;
++
++              if (!desc)
++                      continue;
++              if (irq == 2) /* NOTE(review): presumably the legacy cascade irq -- confirm */
++                      continue;
++
++              /* interrupts are disabled at this point */
++              raw_spin_lock(&desc->lock);
++
++              data = irq_desc_get_irq_data(desc);
++              affinity = data->affinity;
++              if (!irq_has_action(irq) ||
++                  irqd_is_per_cpu(data) ||
++                  cpumask_subset(affinity, cpu_online_mask)) { /* nothing to fix up for this irq */
++                      raw_spin_unlock(&desc->lock);
++                      continue;
++              }
++
++              if (cpumask_test_cpu(smp_processor_id(), affinity))
++                      __set_bit(irq, irqs_used);
++
++              if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { /* no online cpu left in the mask */
++                      break_affinity = 1;
++                      affinity = cpu_all_mask;
++              }
++
++              chip = irq_data_get_irq_chip(data);
++              if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
++                      chip->irq_mask(data);
++
++              if (chip->irq_set_affinity)
++                      chip->irq_set_affinity(data, affinity, true);
++              else if (data->chip != &no_irq_chip && !(warned++)) /* true only while warned is still 0 */
++                      set_affinity = 0;
++
++              if (!irqd_can_move_in_process_context(data) && chip->irq_unmask)
++                      chip->irq_unmask(data);
++
++              raw_spin_unlock(&desc->lock);
++
++              if (break_affinity && set_affinity)
++                      /*printk("Broke affinity for irq %i\n", irq)*/; /* message disabled; only the empty statement remains */
++              else if (!set_affinity)
++                      printk("Cannot set affinity for irq %i\n", irq);
++      }
++
++      /*
++       * We can remove mdelay() and then send spurious interrupts to
++       * new cpu targets for all the irqs that were handled previously by
++       * this cpu. While it works, I have seen spurious interrupt messages
++       * (nothing wrong but still...).
++       *
++       * So for now, retain mdelay(1) and check the IRR and then send those
++       * interrupts to new targets as this cpu is already offlined...
++       */
++      mdelay(1);
++
++      for_each_irq_desc(irq, desc) {
++              if (!__test_and_clear_bit(irq, irqs_used)) /* only irqs recorded in the first loop; the bit is cleared again here */
++                      continue;
++
++              if (xen_test_irq_pending(irq)) {
++                      desc = irq_to_desc(irq);
++                      data = irq_desc_get_irq_data(desc);
++                      chip = irq_data_get_irq_chip(data);
++                      raw_spin_lock(&desc->lock);
++                      if (chip->irq_retrigger)
++                              chip->irq_retrigger(data);
++                      raw_spin_unlock(&desc->lock);
++              }
++      }
++}
++#endif
diff --cc arch/x86/kernel/irq_work-xen.c

index 0000000,0000000..851414e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/irq_work-xen.c
@@@ -1,0 -1,0 +1,21 @@@
++/*
++ * x86/Xen specific code for irq_work
++ */
++
++#include <linux/kernel.h>
++#include <linux/irq_work.h>
++#include <linux/hardirq.h>
++#include <asm/ipi.h>
++
++#ifdef CONFIG_SMP
++/*
++ * Handler for the irq_work interrupt: accounts the event in the
++ * apic_irq_work_irqs statistic and then runs irq_work_run().
++ * The regs argument is not used here.
++ */
++void smp_irq_work_interrupt(struct pt_regs *regs)
++{
++      inc_irq_stat(apic_irq_work_irqs);
++      irq_work_run();
++}
++
++/*
++ * Raise irq_work on the current CPU by sending IRQ_WORK_VECTOR to
++ * ourselves through xen_send_IPI_self().
++ */
++void arch_irq_work_raise(void)
++{
++      xen_send_IPI_self(IRQ_WORK_VECTOR);
++}
++#endif
diff --cc arch/x86/kernel/ldt-xen.c

index 0000000,0000000..5d56e69

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/ldt-xen.c
@@@ -1,0 -1,0 +1,272 @@@
++/*
++ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
++ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
++ * Copyright (C) 2002 Andi Kleen
++ *
++ * This handles calls from both 32bit and 64bit mode.
++ */
++
++#include <linux/errno.h>
++#include <linux/gfp.h>
++#include <linux/sched.h>
++#include <linux/string.h>
++#include <linux/mm.h>
++#include <linux/smp.h>
++#include <linux/vmalloc.h>
++#include <linux/uaccess.h>
++
++#include <asm/system.h>
++#include <asm/ldt.h>
++#include <asm/desc.h>
++#include <asm/mmu_context.h>
++#include <asm/syscalls.h>
++
++#ifdef CONFIG_SMP
++/*
++ * Callback passed to smp_call_function() by alloc_ldt(): reload the LDT
++ * on the executing CPU, but only if that CPU's active mm is the mm that
++ * was handed in as @current_mm.
++ */
++static void flush_ldt(void *current_mm)
++{
++      if (current->active_mm == current_mm)
++              load_LDT(&current->active_mm->context);
++}
++#endif
++
++/*
++ * Grow the LDT described by @pc so that it holds at least @mincount
++ * entries.
++ *
++ * Returns 0 if the table is already large enough or was replaced, and
++ * -ENOMEM if the new table could not be allocated.
++ *
++ * When @reload is set the new table is made read-only and loaded on this
++ * CPU; on SMP, other CPUs are asked to reload it through flush_ldt() if
++ * the mm's cpumask is not just the current CPU.  A previous table, if
++ * there was one, is made writable again and released.
++ */
++static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
++{
++      void *oldldt, *newldt;
++      int oldsize;
++
++      if (mincount <= pc->size)
++              return 0;
++      oldsize = pc->size;
++      /* Round the entry count up to a whole number of pages. */
++      mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
++                      (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
++      /* More than one page comes from vmalloc(), otherwise a single page. */
++      if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
++              newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
++      else
++              newldt = (void *)__get_free_page(GFP_KERNEL);
++
++      if (!newldt)
++              return -ENOMEM;
++
++      /* Carry over the existing entries and zero the newly added tail. */
++      if (oldsize)
++              memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
++      oldldt = pc->ldt;
++      memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
++             (mincount - oldsize) * LDT_ENTRY_SIZE);
++
++#ifdef CONFIG_X86_64
++      /* CHECKME: Do we really need this ? */
++      wmb();
++#endif
++      /* Publish the new pointer before the new size, with barriers between. */
++      pc->ldt = newldt;
++      wmb();
++      pc->size = mincount;
++      wmb();
++
++      if (reload) {
++#ifdef CONFIG_SMP
++              preempt_disable();
++#endif
++              make_pages_readonly(newldt,
++                                  (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
++                                  XENFEAT_writable_descriptor_tables);
++              load_LDT(pc);
++#ifdef CONFIG_SMP
++              if (!cpumask_equal(mm_cpumask(current->mm),
++                                 cpumask_of(smp_processor_id())))
++                      smp_call_function(flush_ldt, current->mm, 1);
++              preempt_enable();
++#endif
++      }
++      if (oldsize) {
++              /* The old table is restored to writable before being freed. */
++              make_pages_writable(oldldt,
++                                  (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
++                                  XENFEAT_writable_descriptor_tables);
++              /* Free with the counterpart of whichever allocator was used. */
++              if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
++                      vfree(oldldt);
++              else
++                      put_page(virt_to_page(oldldt));
++      }
++      return 0;
++}
++
++/*
++ * Give @new a private copy of @old's LDT: allocate a table of old->size
++ * entries (without loading it), copy the entries over and mark the new
++ * table's pages read-only.
++ *
++ * Returns 0 on success or the negative error from alloc_ldt().
++ */
++static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
++{
++      int err = alloc_ldt(new, old->size, 0);
++
++      if (err < 0)
++              return err;
++      memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
++      make_pages_readonly(new->ldt,
++                          (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
++                          XENFEAT_writable_descriptor_tables);
++      return 0;
++}
++
++/*
++ * we do not have to muck with descriptors here, that is
++ * done in switch_mm() as needed.
++ */
++int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
++{
++      struct mm_struct *parent;
++      int err;
++
++      /* Start from a zeroed context with a fresh lock. */
++      memset(&mm->context, 0, sizeof(mm->context));
++      mutex_init(&mm->context.lock);
++
++      parent = current->mm;
++      if (!parent)
++              return 0;
++
++      /* Inherit the vdso value from the current mm. */
++      mm->context.vdso = parent->context.vdso;
++
++      /* Nothing more to do when the current mm has no LDT entries. */
++      if (!(parent->context.size > 0))
++              return 0;
++
++      /* Copy the LDT while holding the source context's lock. */
++      mutex_lock(&parent->context.lock);
++      err = copy_ldt(&mm->context, &parent->context);
++      mutex_unlock(&parent->context.lock);
++
++      return err;
++}
++
++/*
++ * No need to lock the MM as we are the last user
++ *
++ * 64bit: Don't touch the LDT register - we're already in the next thread.
++ */
++void destroy_context(struct mm_struct *mm)
++{
++      mm_context_t *ctx = &mm->context;
++
++      /* No LDT was ever set up for this mm. */
++      if (!ctx->size)
++              return;
++
++      /* CHECKME: Can this ever happen ? */
++      if (mm == current->active_mm)
++              clear_LDT();
++
++      /* Restore write access before handing the memory back. */
++      make_pages_writable(ctx->ldt,
++                          (ctx->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
++                          XENFEAT_writable_descriptor_tables);
++
++      /* Tables larger than one page were vmalloc()ed, others are a page. */
++      if (ctx->size * LDT_ENTRY_SIZE > PAGE_SIZE)
++              vfree(ctx->ldt);
++      else
++              put_page(virt_to_page(ctx->ldt));
++
++      ctx->size = 0;
++}
++
++/*
++ * Copy up to @bytecount bytes of the current mm's LDT to user buffer @ptr
++ * and zero-fill whatever part of the (clamped) request the table does not
++ * cover.  Returns the clamped byte count, 0 when the mm has no LDT, or
++ * -EFAULT when the user buffer cannot be written.
++ */
++static int read_ldt(void __user *ptr, unsigned long bytecount)
++{
++      struct mm_struct *mm = current->mm;
++      unsigned long copied;
++      int failed;
++
++      if (!mm->context.size)
++              return 0;
++
++      /* Never report more than a full-sized LDT. */
++      if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
++              bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
++
++      mutex_lock(&mm->context.lock);
++      copied = mm->context.size * LDT_ENTRY_SIZE;
++      if (copied > bytecount)
++              copied = bytecount;
++      failed = copy_to_user(ptr, mm->context.ldt, copied) != 0;
++      mutex_unlock(&mm->context.lock);
++
++      if (failed)
++              return -EFAULT;
++
++      /* zero-fill the rest */
++      if (copied != bytecount &&
++          clear_user(ptr + copied, bytecount - copied) != 0)
++              return -EFAULT;
++
++      return bytecount;
++}
++
++/*
++ * Report the "default" LDT to user space: the buffer is simply cleared.
++ * @bytecount is clamped to a fixed size (five descriptors on 32-bit,
++ * 128 bytes otherwise).  Returns the number of bytes cleared, or -EFAULT
++ * if the user buffer cannot be written.
++ */
++static int read_default_ldt(void __user *ptr, unsigned long bytecount)
++{
++      /* CHECKME: Can we use _one_ random number ? */
++#ifdef CONFIG_X86_32
++      unsigned long size = 5 * sizeof(struct desc_struct);
++#else
++      unsigned long size = 128;
++#endif
++      if (bytecount > size)
++              bytecount = size;
++      if (clear_user(ptr, bytecount))
++              return -EFAULT;
++      return bytecount;
++}
++
++/*
++ * Install one LDT entry for the current mm from the struct user_desc at
++ * user address @ptr.
++ *
++ * @bytecount must equal sizeof(struct user_desc).  @oldmode selects the
++ * legacy behaviour: contents == 3 is rejected, the avl bit is cleared,
++ * and a zero base/limit always clears the entry.
++ *
++ * Returns the result of write_ldt_entry() on the install path, -EINVAL
++ * for a malformed request, -EFAULT if the descriptor cannot be read, or
++ * the error from alloc_ldt() if the table cannot be grown.
++ */
++static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
++{
++      struct mm_struct *mm = current->mm;
++      struct desc_struct ldt;
++      int error;
++      struct user_desc ldt_info;
++
++      /* "error" is staged before each check so a bare goto reports it. */
++      error = -EINVAL;
++      if (bytecount != sizeof(ldt_info))
++              goto out;
++      error = -EFAULT;
++      if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
++              goto out;
++
++      error = -EINVAL;
++      if (ldt_info.entry_number >= LDT_ENTRIES)
++              goto out;
++      /* contents == 3 is only accepted in new mode and marked not-present. */
++      if (ldt_info.contents == 3) {
++              if (oldmode)
++                      goto out;
++              if (ldt_info.seg_not_present == 0)
++                      goto out;
++      }
++
++      mutex_lock(&mm->context.lock);
++      /* Grow (and reload) the table if the entry lies beyond its end. */
++      if (ldt_info.entry_number >= mm->context.size) {
++              error = alloc_ldt(&current->mm->context,
++                                ldt_info.entry_number + 1, 1);
++              if (error < 0)
++                      goto out_unlock;
++      }
++
++      /* Allow LDTs to be cleared by the user. */
++      if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
++              if (oldmode || LDT_empty(&ldt_info)) {
++                      memset(&ldt, 0, sizeof(ldt));
++                      goto install;
++              }
++      }
++
++      fill_ldt(&ldt, &ldt_info);
++      if (oldmode)
++              ldt.avl = 0;
++
++      /* Install the new entry ...  */
++install:
++      error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
++
++out_unlock:
++      mutex_unlock(&mm->context.lock);
++out:
++      return error;
++}
++
++/*
++ * modify_ldt() entry point.  @func selects the operation:
++ *   0    - read the LDT
++ *   1    - write an entry with legacy (oldmode) semantics
++ *   2    - read the default LDT
++ *   0x11 - write an entry with current semantics
++ * Any other value yields -ENOSYS.
++ */
++asmlinkage int sys_modify_ldt(int func, void __user *ptr,
++                            unsigned long bytecount)
++{
++      switch (func) {
++      case 0:
++              return read_ldt(ptr, bytecount);
++      case 1:
++              return write_ldt(ptr, bytecount, 1);
++      case 2:
++              return read_default_ldt(ptr, bytecount);
++      case 0x11:
++              return write_ldt(ptr, bytecount, 0);
++      }
++
++      return -ENOSYS;
++}
diff --cc arch/x86/kernel/machine_kexec_32.c

index a3fa43b,a3fa43b..129f539
--- 1/arch/x86/kernel/machine_kexec_32.c
--- 2/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@@ -27,47 -27,47 +27,9 @@@
   #include <asm/cacheflush.h>
   #include <asm/debugreg.h>
   
--static void set_idt(void *newidt, __u16 limit)
--{
--      struct desc_ptr curidt;
--
--      /* ia32 supports unaliged loads & stores */
--      curidt.size    = limit;
--      curidt.address = (unsigned long)newidt;
--
--      load_idt(&curidt);
--}
--
--
--static void set_gdt(void *newgdt, __u16 limit)
--{
--      struct desc_ptr curgdt;
--
--      /* ia32 supports unaligned loads & stores */
--      curgdt.size    = limit;
--      curgdt.address = (unsigned long)newgdt;
--
--      load_gdt(&curgdt);
--}
--
--static void load_segments(void)
--{
--#define __STR(X) #X
--#define STR(X) __STR(X)
--
--      __asm__ __volatile__ (
--              "\tljmp $"STR(__KERNEL_CS)",$1f\n"
--              "\t1:\n"
--              "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
--              "\tmovl %%eax,%%ds\n"
--              "\tmovl %%eax,%%es\n"
--              "\tmovl %%eax,%%fs\n"
--              "\tmovl %%eax,%%gs\n"
--              "\tmovl %%eax,%%ss\n"
--              : : : "eax", "memory");
--#undef STR
--#undef __STR
--}
++#ifdef CONFIG_XEN
++#include <xen/interface/kexec.h>
++#endif
   
   static void machine_kexec_free_page_tables(struct kimage *image)
   {
@@@ -84,6 -84,6 +46,17 @@@ static int machine_kexec_alloc_page_tab
   {
         image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
   #ifdef CONFIG_X86_PAE
++#ifdef CONFIG_XEN /* machine address must fit into xki->page_list[PA_PGD] */
++      if (image->arch.pgd) {
++              struct page *pg = virt_to_page(image->arch.pgd);
++
++              if (xen_limit_pages_to_max_mfn(pg, 0, BITS_PER_LONG) < 0) {
++                      image->arch.pgd = NULL;
++                      __free_page(pg);
++                      return -ENOMEM;
++              }
++      }
++#endif
         image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
         image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
   #endif
@@@ -139,6 -139,6 +112,51 @@@ static void machine_kexec_prepare_page_
                 __pa(control_page), __pa(control_page));
   }
   
++#ifdef CONFIG_XEN
++
++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
++
++#if PAGES_NR > KEXEC_XEN_NO_PAGES
++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
++#endif
++
++#if PA_CONTROL_PAGE != 0
++#error PA_CONTROL_PAGE is non zero - Xen support will break
++#endif
++
++/*
++ * Fill in xki->page_list for @image.
++ *
++ * The list is zeroed, one page of relocate_kernel is copied into the
++ * image's control code page, and the __ma() values of the control page
++ * and of the image PGD are recorded.  For KEXEC_TYPE_DEFAULT images the
++ * swap page is recorded as well, via page_to_phys().
++ */
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++      void *control_page;
++
++      memset(xki->page_list, 0, sizeof(xki->page_list));
++
++      control_page = page_address(image->control_code_page);
++      memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++      xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
++      xki->page_list[PA_PGD] = __ma(image->arch.pgd);
++
++      if (image->type == KEXEC_TYPE_DEFAULT)
++              xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
++}
++
++/*
++ * Register each of the @nr_phys_cpus entries of @phys_cpus as a child of
++ * the @hypervisor resource.  The result of request_resource() is not
++ * checked; the function always returns 0.
++ */
++int __init machine_kexec_setup_resources(struct resource *hypervisor,
++                                       struct resource *phys_cpus,
++                                       int nr_phys_cpus)
++{
++      int k;
++
++      /* The per-cpu crash note resources belong to the hypervisor resource */
++      for (k = 0; k < nr_phys_cpus; k++)
++              request_resource(hypervisor, phys_cpus + k);
++
++      return 0;
++}
++
++/* Intentionally empty: nothing is registered for @res in this file. */
++void machine_kexec_register_resources(struct resource *res) { ; }
++
++#endif /* CONFIG_XEN */
++
   /*
    * A architecture hook called to validate the
    * proposed image and prepare the control pages
@@@ -176,6 -176,6 +194,7 @@@ void machine_kexec_cleanup(struct kimag
         machine_kexec_free_page_tables(image);
   }
   
++#ifndef CONFIG_XEN
   /*
    * Do not allocate memory (or fail in any way) in machine_kexec().
    * We are past the point of no return, committed to rebooting now.
@@@ -228,24 -228,24 +247,6 @@@ void machine_kexec(struct kimage *image
                 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                 << PAGE_SHIFT);
   
--      /*
--       * The segment registers are funny things, they have both a
--       * visible and an invisible part.  Whenever the visible part is
--       * set to a specific selector, the invisible part is loaded
--       * with from a table in memory.  At no other time is the
--       * descriptor table in memory accessed.
--       *
--       * I take advantage of this here by force loading the
--       * segments, before I zap the gdt with an invalid value.
--       */
--      load_segments();
--      /*
--       * The gdt & idt are now invalid.
--       * If you want to load them you must set up your own idt & gdt.
--       */
--      set_gdt(phys_to_virt(0), 0);
--      set_idt(phys_to_virt(0), 0);
--
         /* now call it */
         image->start = relocate_kernel_ptr((unsigned long)image->head,
                                            (unsigned long)page_list,
@@@ -259,6 -259,6 +260,7 @@@
   
         __ftrace_enabled_restore(save_ftrace_enabled);
   }
++#endif
   
   void arch_crash_save_vmcoreinfo(void)
   {
diff --cc arch/x86/kernel/machine_kexec_64.c

index b3ea9db,b3ea9db..803efa0
--- 1/arch/x86/kernel/machine_kexec_64.c
--- 2/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@@ -21,6 -21,6 +21,112 @@@
   #include <asm/mmu_context.h>
   #include <asm/debugreg.h>
   
++#ifdef CONFIG_XEN
++
++/* In the case of Xen, override hypervisor functions to be able to create
++ * a regular identity mapping page table...
++ */
++
++#include <xen/interface/kexec.h>
++#include <xen/interface/memory.h>
++
++#define x__pmd(x) ((pmd_t) { (x) } )
++#define x__pud(x) ((pud_t) { (x) } )
++#define x__pgd(x) ((pgd_t) { (x) } )
++
++#define x_pmd_val(x)   ((x).pmd)
++#define x_pud_val(x)   ((x).pud)
++#define x_pgd_val(x)   ((x).pgd)
++
++/*
++ * Raw page-table accessors for the Xen build.  They store directly into
++ * the entry through the x_*_val() field macros.
++ */
++
++/* Store @val into *@dst unchanged (no phys_to_machine() translation). */
++static inline void x_set_pmd(pmd_t *dst, pmd_t val)
++{
++      x_pmd_val(*dst) = x_pmd_val(val);
++}
++
++/* Store @val into *@dst after passing it through phys_to_machine(). */
++static inline void x_set_pud(pud_t *dst, pud_t val)
++{
++      x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
++}
++
++/* Clear the entry by writing 0 to it. */
++static inline void x_pud_clear (pud_t *pud)
++{
++      x_pud_val(*pud) = 0;
++}
++
++/* Store @val into *@dst after passing it through phys_to_machine(). */
++static inline void x_set_pgd(pgd_t *dst, pgd_t val)
++{
++      x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
++}
++
++/* Clear the entry by writing 0 to it. */
++static inline void x_pgd_clear (pgd_t * pgd)
++{
++      x_pgd_val(*pgd) = 0;
++}
++
++/*
++ * Page-table flag sets for the Xen build.  Each replacement list is
++ * parenthesized so that the flags always combine into a single value,
++ * whatever operators surround the macro where it is used.
++ */
++#define X__PAGE_KERNEL_LARGE_EXEC \
++         (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE)
++#define X_KERNPG_TABLE \
++         (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
++
++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
++
++#if PAGES_NR > KEXEC_XEN_NO_PAGES
++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
++#endif
++
++#if PA_CONTROL_PAGE != 0
++#error PA_CONTROL_PAGE is non zero - Xen support will break
++#endif
++
++/*
++ * Fill in xki->page_list for @image.
++ *
++ * The list is zeroed first.  The page at the start of the control code
++ * area is recorded as the table page; the page PAGE_SIZE bytes after it
++ * receives one page of relocate_kernel and is recorded as the control
++ * page.  Both are stored as __ma() values.  For KEXEC_TYPE_DEFAULT
++ * images the swap page is recorded as well, via page_to_phys().
++ */
++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++      void *control_page;
++      void *table_page;
++
++      memset(xki->page_list, 0, sizeof(xki->page_list));
++
++      control_page = page_address(image->control_code_page) + PAGE_SIZE;
++      memcpy(control_page, relocate_kernel, PAGE_SIZE);
++
++      table_page = page_address(image->control_code_page);
++
++      xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
++      xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
++
++      if (image->type == KEXEC_TYPE_DEFAULT)
++              xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
++}
++
++/*
++ * Register each of the @nr_phys_cpus entries of @phys_cpus as a child of
++ * the @hypervisor resource.  The result of request_resource() is not
++ * checked; the function always returns 0.
++ */
++int __init machine_kexec_setup_resources(struct resource *hypervisor,
++                                       struct resource *phys_cpus,
++                                       int nr_phys_cpus)
++{
++      int k;
++
++      /* The per-cpu crash note resources belong to the hypervisor resource */
++      for (k = 0; k < nr_phys_cpus; k++)
++              request_resource(hypervisor, phys_cpus + k);
++
++      return 0;
++}
++
++#else /* CONFIG_XEN */
++
++#define x__pmd(x) __pmd(x)
++#define x__pud(x) __pud(x)
++#define x__pgd(x) __pgd(x)
++
++#define x_set_pmd(x, y) set_pmd(x, y)
++#define x_set_pud(x, y) set_pud(x, y)
++#define x_set_pgd(x, y) set_pgd(x, y)
++
++#define x_pud_clear(x) pud_clear(x)
++#define x_pgd_clear(x) pgd_clear(x)
++
++#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
++#define X_KERNPG_TABLE _KERNPG_TABLE
++
++#endif /* CONFIG_XEN */
++
   static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
                                 unsigned long addr)
   {
@@@ -50,7 -50,7 +156,7 @@@
         }
         pmd = pmd_offset(pud, addr);
         if (!pmd_present(*pmd))
--              set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++              x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
         result = 0;
   out:
         return result;
@@@ -63,7 -63,7 +169,7 @@@ static void init_level2_page(pmd_t *lev
         addr &= PAGE_MASK;
         end_addr = addr + PUD_SIZE;
         while (addr < end_addr) {
--              set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
++              x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
                 addr += PMD_SIZE;
         }
   }
@@@ -88,12 -88,12 +194,12 @@@ static int init_level3_page(struct kima
                 }
                 level2p = (pmd_t *)page_address(page);
                 init_level2_page(level2p, addr);
--              set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
++              x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
                 addr += PUD_SIZE;
         }
         /* clear the unused entries */
         while (addr < end_addr) {
--              pud_clear(level3p++);
++              x_pud_clear(level3p++);
                 addr += PUD_SIZE;
         }
   out:
@@@ -123,12 -123,12 +229,12 @@@ static int init_level4_page(struct kima
                 result = init_level3_page(image, level3p, addr, last_addr);
                 if (result)
                         goto out;
--              set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
++              x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
                 addr += PGDIR_SIZE;
         }
         /* clear the unused entries */
         while (addr < end_addr) {
--              pgd_clear(level4p++);
++              x_pgd_clear(level4p++);
                 addr += PGDIR_SIZE;
         }
   out:
@@@ -189,8 -189,8 +295,14 @@@ static int init_pgtable(struct kimage *
   {
         pgd_t *level4p;
         int result;
++      unsigned long x_max_pfn = max_pfn;
++
++#ifdef CONFIG_XEN
++      x_max_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++#endif
++
         level4p = (pgd_t *)__va(start_pgtable);
--      result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
++      result = init_level4_page(image, level4p, 0, x_max_pfn << PAGE_SHIFT);
         if (result)
                 return result;
         /*
@@@ -203,47 -203,47 +315,6 @@@
         return init_transition_pgtable(image, level4p);
   }
   
--static void set_idt(void *newidt, u16 limit)
--{
--      struct desc_ptr curidt;
--
--      /* x86-64 supports unaliged loads & stores */
--      curidt.size    = limit;
--      curidt.address = (unsigned long)newidt;
--
--      __asm__ __volatile__ (
--              "lidtq %0\n"
--              : : "m" (curidt)
--              );
--};
--
--
--static void set_gdt(void *newgdt, u16 limit)
--{
--      struct desc_ptr curgdt;
--
--      /* x86-64 supports unaligned loads & stores */
--      curgdt.size    = limit;
--      curgdt.address = (unsigned long)newgdt;
--
--      __asm__ __volatile__ (
--              "lgdtq %0\n"
--              : : "m" (curgdt)
--              );
--};
--
--static void load_segments(void)
--{
--      __asm__ __volatile__ (
--              "\tmovl %0,%%ds\n"
--              "\tmovl %0,%%es\n"
--              "\tmovl %0,%%ss\n"
--              "\tmovl %0,%%fs\n"
--              "\tmovl %0,%%gs\n"
--              : : "a" (__KERNEL_DS) : "memory"
--              );
--}
--
   int machine_kexec_prepare(struct kimage *image)
   {
         unsigned long start_pgtable;
@@@ -265,6 -265,6 +336,7 @@@ void machine_kexec_cleanup(struct kimag
         free_transition_pgtable(image);
   }
   
++#ifndef CONFIG_XEN
   /*
    * Do not allocate memory (or fail in any way) in machine_kexec().
    * We are past the point of no return, committed to rebooting now.
@@@ -311,24 -311,24 +383,6 @@@ void machine_kexec(struct kimage *image
                 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                 << PAGE_SHIFT);
   
--      /*
--       * The segment registers are funny things, they have both a
--       * visible and an invisible part.  Whenever the visible part is
--       * set to a specific selector, the invisible part is loaded
--       * with from a table in memory.  At no other time is the
--       * descriptor table in memory accessed.
--       *
--       * I take advantage of this here by force loading the
--       * segments, before I zap the gdt with an invalid value.
--       */
--      load_segments();
--      /*
--       * The gdt & idt are now invalid.
--       * If you want to load them you must set up your own idt & gdt.
--       */
--      set_gdt(phys_to_virt(0), 0);
--      set_idt(phys_to_virt(0), 0);
--
         /* now call it */
         image->start = relocate_kernel((unsigned long)image->head,
                                        (unsigned long)page_list,
@@@ -342,10 -342,10 +396,13 @@@
   
         __ftrace_enabled_restore(save_ftrace_enabled);
   }
++#endif
   
   void arch_crash_save_vmcoreinfo(void)
   {
++#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
         VMCOREINFO_SYMBOL(phys_base);
++#endif
         VMCOREINFO_SYMBOL(init_level4_pgt);
   
   #ifdef CONFIG_NUMA
diff --cc arch/x86/kernel/microcode_core-xen.c

index 0000000,0000000..e33ad82

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/microcode_core-xen.c
@@@ -1,0 -1,0 +1,226 @@@
++/*
++ *    Intel CPU Microcode Update Driver for Linux
++ *
++ *    Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
++ *                  2006      Shaohua Li <shaohua.li@intel.com>
++ *
++ *    This driver allows to upgrade microcode on Intel processors
++ *    belonging to IA-32 family - PentiumPro, Pentium II,
++ *    Pentium III, Xeon, Pentium 4, etc.
++ *
++ *    Reference: Section 8.11 of Volume 3a, IA-32 Intel(R) Architecture
++ *    Software Developer's Manual
++ *    Order Number 253668 or free download from:
++ *
++ *    http://developer.intel.com/Assets/PDF/manual/253668.pdf
++ *
++ *    For more information, go to http://www.urbanmyth.org/microcode
++ *
++ *    This program is free software; you can redistribute it and/or
++ *    modify it under the terms of the GNU General Public License
++ *    as published by the Free Software Foundation; either version
++ *    2 of the License, or (at your option) any later version.
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include <linux/platform_device.h>
++#include <linux/miscdevice.h>
++#include <linux/capability.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/mutex.h>
++#include <linux/cpu.h>
++#include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/firmware.h>
++#include <linux/uaccess.h>
++#include <linux/vmalloc.h>
++
++#include <asm/microcode.h>
++#include <asm/processor.h>
++
++MODULE_DESCRIPTION("Microcode Update Driver");
++MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
++MODULE_LICENSE("GPL");
++
++static int verbose;
++module_param(verbose, int, 0644);
++
++#define MICROCODE_VERSION     "2.00-xen"
++
++/*
++ * Synchronization.
++ *
++ * All non cpu-hotplug-callback call sites use:
++ *
++ * - microcode_mutex to synchronize with each other;
++ * - get/put_online_cpus() to synchronize with
++ *   the cpu-hotplug-callback call sites.
++ *
++ * We guarantee that only a single cpu is being
++ * updated at any particular moment of time.
++ */
++static DEFINE_MUTEX(microcode_mutex);
++
++#ifdef CONFIG_MICROCODE_OLD_INTERFACE
++/*
++ * Copy @len bytes of microcode from user buffer @ubuf into a temporary
++ * kernel buffer and pass it to the hypervisor with
++ * XENPF_microcode_update.
++ *
++ * Returns the hypercall result, -ENOMEM if the temporary buffer cannot
++ * be allocated, or -EFAULT if the user buffer cannot be read.
++ */
++static int do_microcode_update(const void __user *ubuf, size_t len)
++{
++      struct xen_platform_op op;
++      void *kbuf;
++      int err = -EFAULT;
++
++      kbuf = vmalloc(len);
++      if (!kbuf)
++              return -ENOMEM;
++
++      if (copy_from_user(kbuf, ubuf, len) == 0) {
++              op.cmd = XENPF_microcode_update;
++              set_xen_guest_handle(op.u.microcode.data, kbuf);
++              op.u.microcode.length = len;
++              err = HYPERVISOR_platform_op(&op);
++      }
++
++      /* The staging buffer is released on every path. */
++      vfree(kbuf);
++
++      return err;
++}
++
++/* Open handler: only CAP_SYS_RAWIO holders may open the device. */
++static int microcode_open(struct inode *inode, struct file *file)
++{
++      if (!capable(CAP_SYS_RAWIO))
++              return -EPERM;
++
++      return nonseekable_open(inode, file);
++}
++
++/*
++ * Write handler: treats the written data as a microcode image and hands
++ * it to do_microcode_update() under microcode_mutex.
++ *
++ * Returns @len on success and -EINVAL otherwise, including when the
++ * request spans more pages than totalram_pages.
++ */
++static ssize_t microcode_write(struct file *file, const char __user *buf,
++                             size_t len, loff_t *ppos)
++{
++      ssize_t ret;
++
++      if ((len >> PAGE_SHIFT) > totalram_pages) {
++              pr_err("too much data (max %ld pages)\n", totalram_pages);
++              return -EINVAL;
++      }
++
++      mutex_lock(&microcode_mutex);
++      if (do_microcode_update(buf, len) == 0)
++              ret = (ssize_t)len;
++      else
++              ret = -EINVAL;
++      mutex_unlock(&microcode_mutex);
++
++      return ret;
++}
++
++/*
++ * File operations for the microcode misc device: open and write only,
++ * with seeking disabled through no_llseek.
++ */
++static const struct file_operations microcode_fops = {
++      .owner                  = THIS_MODULE,
++      .write                  = microcode_write,
++      .open                   = microcode_open,
++      .llseek         = no_llseek,
++};
++
++/*
++ * Misc device registered by microcode_dev_init(): fixed minor
++ * MICROCODE_MINOR, node name "cpu/microcode", served by microcode_fops.
++ */
++static struct miscdevice microcode_dev = {
++      .minor                  = MICROCODE_MINOR,
++      .name                   = "microcode",
++      .nodename               = "cpu/microcode",
++      .fops                   = &microcode_fops,
++};
++
++/*
++ * Register the microcode misc device.  Returns 0 on success or the error
++ * from misc_register(), which is also logged.
++ */
++static int __init microcode_dev_init(void)
++{
++      int rc = misc_register(&microcode_dev);
++
++      if (rc)
++              pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
++
++      return rc;
++}
++
++/* Undo microcode_dev_init(): deregister the microcode misc device. */
++static void microcode_dev_exit(void)
++{
++      misc_deregister(&microcode_dev);
++}
++
++MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
++MODULE_ALIAS("devname:cpu/microcode");
++#else
++#define microcode_dev_init()  0
++#define microcode_dev_exit()  do { } while (0)
++#endif
++
++/* fake device for request_firmware */
++static struct platform_device *microcode_pdev;
++
++/*
++ * Load the firmware file @name through request_firmware(), using the
++ * fake platform device, and pass its contents to the hypervisor with
++ * XENPF_microcode_update.
++ *
++ * Returns 0 on success, otherwise the error from request_firmware() or
++ * from HYPERVISOR_platform_op().
++ */
++static int request_microcode(const char *name)
++{
++      const struct firmware *firmware;
++      int error;
++      struct xen_platform_op op;
++
++      error = request_firmware(&firmware, name, &microcode_pdev->dev);
++      if (error) {
++              pr_debug("microcode: data file %s load failed\n", name);
++              return error;
++      }
++
++      op.cmd = XENPF_microcode_update;
++      set_xen_guest_handle(op.u.microcode.data, firmware->data);
++      op.u.microcode.length = firmware->size;
++      error = HYPERVISOR_platform_op(&op);
++
++      /* The image is released whether or not the hypercall succeeded. */
++      release_firmware(firmware);
++
++      if (error)
++              pr_debug("ucode load failed\n");
++
++      return error;
++}
++
++/*
++ * Module init.
++ *
++ * Picks the firmware file name from the boot CPU's vendor (and, for
++ * Intel, its family/model/stepping), registers the fake platform device
++ * needed by request_firmware(), registers the misc device, and then
++ * attempts one microcode load.
++ *
++ * Returns 0 on success, -ENODEV for an unsupported CPU vendor, or the
++ * error from registering the platform or misc device.  A failed
++ * microcode load does not fail module initialisation.
++ */
++static int __init microcode_init(void)
++{
++      const struct cpuinfo_x86 *c = &boot_cpu_data;
++      char buf[32];
++      const char *fw_name = buf;
++      int error;
++
++      if (c->x86_vendor == X86_VENDOR_INTEL)
++              snprintf(buf, sizeof(buf), "intel-ucode/%02x-%02x-%02x",
++                       c->x86, c->x86_model, c->x86_mask);
++      else if (c->x86_vendor == X86_VENDOR_AMD)
++              fw_name = "amd-ucode/microcode_amd.bin";
++      else {
++              pr_err("no support for this CPU vendor\n");
++              return -ENODEV;
++      }
++
++      microcode_pdev = platform_device_register_simple("microcode", -1,
++                                                       NULL, 0);
++      if (IS_ERR(microcode_pdev))
++              return PTR_ERR(microcode_pdev);
++
++      error = microcode_dev_init();
++      if (error) {
++              /* Do not leave the fake platform device registered. */
++              platform_device_unregister(microcode_pdev);
++              return error;
++      }
++
++      /* Best effort: the driver stays loaded even if this load fails. */
++      request_microcode(fw_name);
++
++      pr_info("Microcode Update Driver: v" MICROCODE_VERSION
++              " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
++
++      return 0;
++}
++module_init(microcode_init);
++
++/*
++ * Module exit: deregister the misc device, unregister the fake platform
++ * device and log the removal.
++ */
++static void __exit microcode_exit(void)
++{
++      microcode_dev_exit();
++      platform_device_unregister(microcode_pdev);
++
++      pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
++}
++module_exit(microcode_exit);
diff --cc arch/x86/kernel/mmconf-fam10h_64.c

index ac861b8,ac861b8..2c88c6a
--- 1/arch/x86/kernel/mmconf-fam10h_64.c
--- 2/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@@ -205,12 -205,12 +205,20 @@@ void __cpuinit fam10h_check_enable_mmcf
                 return;
         }
   
++#ifndef CONFIG_XEN
         printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
         val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
              (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT));
         val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
                FAM10H_MMIO_CONF_ENABLE;
         wrmsrl(address, val);
++#else
++      if ((val & ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
++           (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT)))
++          != (fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
++              FAM10H_MMIO_CONF_ENABLE))
++              pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
++#endif
   }
   
   static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
diff --cc arch/x86/kernel/mpparse-xen.c

index 0000000,0000000..56aa3ff

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/mpparse-xen.c
@@@ -1,0 -1,0 +1,968 @@@
++/*
++ *    Intel Multiprocessor Specification 1.1 and 1.4
++ *    compliant MP-table parsing routines.
++ *
++ *    (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
++ *    (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
++ *      (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
++ */
++
++#include <linux/mm.h>
++#include <linux/init.h>
++#include <linux/delay.h>
++#include <linux/bootmem.h>
++#include <linux/memblock.h>
++#include <linux/kernel_stat.h>
++#include <linux/mc146818rtc.h>
++#include <linux/bitops.h>
++#include <linux/acpi.h>
++#include <linux/module.h>
++#include <linux/smp.h>
++#include <linux/pci.h>
++
++#include <asm/mtrr.h>
++#include <asm/mpspec.h>
++#include <asm/pgalloc.h>
++#include <asm/io_apic.h>
++#include <asm/proto.h>
++#include <asm/bios_ebda.h>
++#include <asm/e820.h>
++#include <asm/trampoline.h>
++#include <asm/setup.h>
++#include <asm/smp.h>
++
++#include <asm/apic.h>
++
++/*
++ * Translate bus address ma into a kernel virtual address: addresses
++ * that is_ISA_range() accepts go through isa_bus_to_virt(), all others
++ * through bus_to_virt().
++ */
++static void *_bus_to_virt(unsigned long ma)
++{
++      if (is_ISA_range(ma, ma))
++              return isa_bus_to_virt(ma);
++
++      return bus_to_virt(ma);
++}
++
++/*
++ * Checksum an MP configuration block: add up the len bytes starting
++ * at mp and return the low eight bits of the total.
++ */
++
++static int __init mpf_checksum(unsigned char *mp, int len)
++{
++      int total = 0;
++
++      for (; len--; mp++)
++              total += *mp;
++
++      return total & 0xFF;
++}
++/*
++ * Return the APIC ID stored in MP processor entry m unchanged.
++ * Only built when CONFIG_XEN is not set.
++ */
++#ifndef CONFIG_XEN
++int __init default_mpc_apic_id(struct mpc_cpu *m)
++{
++      return m->apicid;
++}
++#endif
++/*
++ * Account for one processor entry of the MP table.
++ *
++ * Without CONFIG_XEN: a CPU lacking CPU_ENABLED is only counted in
++ * disabled_cpus.  An enabled one gets its APIC ID from the
++ * x86_init.mpparse.mpc_apic_id hook and is handed to
++ * generic_processor_info(); the entry flagged CPU_BOOTPROCESSOR also
++ * sets boot_cpu_physical_apicid.
++ *
++ * With CONFIG_XEN: the entry contents are ignored and num_processors
++ * is simply incremented.
++ */
++static void __init MP_processor_info(struct mpc_cpu *m)
++{
++#ifndef CONFIG_XEN
++      int apicid;
++      char *bootup_cpu = "";
++
++      if (!(m->cpuflag & CPU_ENABLED)) {
++              disabled_cpus++;
++              return;
++      }
++
++      apicid = x86_init.mpparse.mpc_apic_id(m);
++
++      if (m->cpuflag & CPU_BOOTPROCESSOR) {
++              bootup_cpu = " (Bootup-CPU)";
++              boot_cpu_physical_apicid = m->apicid;
++      }
++
++      printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
++      generic_processor_info(apicid, m->apicver);
++#else /* CONFIG_XEN */
++      num_processors++;
++#endif
++}
++
++#ifdef CONFIG_X86_IO_APIC
++/*
++ * Copy the six-character bus type of MP bus entry m into str,
++ * NUL-terminate it and report it at APIC_VERBOSE level.
++ * str must provide room for at least seven bytes.
++ */
++void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
++{
++      memcpy(str, m->bustype, 6);
++      str[6] = '\0';
++
++      apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
++}
++/*
++ * Record the type of one MP table bus entry.
++ *
++ * The bus type string is fetched through the
++ * x86_init.mpparse.mpc_oem_bus_info hook.  ISA buses are flagged in
++ * mp_bus_not_pci; PCI buses are cleared from it after the optional
++ * mpc_oem_pci_bus hook ran.  With CONFIG_EISA or CONFIG_MCA the type
++ * is additionally stored in mp_bus_id_to_type[] and the EISA and MCA
++ * bus strings are recognised as well.  When MAX_MP_BUSSES is below
++ * 256, bus IDs of MAX_MP_BUSSES or more are rejected with a warning.
++ */
++static void __init MP_bus_info(struct mpc_bus *m)
++{
++      char str[7];
++
++      x86_init.mpparse.mpc_oem_bus_info(m, str);
++
++#if MAX_MP_BUSSES < 256
++      if (m->busid >= MAX_MP_BUSSES) {
++              printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
++                     " is too large, max. supported is %d\n",
++                     m->busid, str, MAX_MP_BUSSES - 1);
++              return;
++      }
++#endif
++
++      if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
++              set_bit(m->busid, mp_bus_not_pci);
++#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
++              mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
++#endif
++      } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
++              if (x86_init.mpparse.mpc_oem_pci_bus)
++                      x86_init.mpparse.mpc_oem_pci_bus(m);
++
++              clear_bit(m->busid, mp_bus_not_pci);
++#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
++              mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
++      } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
++              mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
++      } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
++              mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
++#endif
++      } else
++              printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
++}
++
++/*
++ * Register the IO-APIC described by MP table entry m, using gsi_top
++ * as its GSI base.  Entries without MPC_APIC_USABLE are ignored.
++ */
++static void __init MP_ioapic_info(struct mpc_ioapic *m)
++{
++      if (!(m->flags & MPC_APIC_USABLE))
++              return;
++
++      mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
++}
++
++/*
++ * Print one interrupt source entry at APIC_VERBOSE level.  Polarity
++ * is taken from bits 0-1 and trigger mode from bits 2-3 of irqflag.
++ */
++static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
++{
++      int polarity = mp_irq->irqflag & 3;
++      int trigger = (mp_irq->irqflag >> 2) & 3;
++
++      apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
++              " IRQ %02x, APIC ID %x, APIC INT %02x\n",
++              mp_irq->irqtype, polarity, trigger, mp_irq->srcbus,
++              mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
++}
++
++#else /* CONFIG_X86_IO_APIC */
++static inline void __init MP_bus_info(struct mpc_bus *m) {} /* no-op stub */
++static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} /* no-op stub */
++#endif /* CONFIG_X86_IO_APIC */
++
++/*
++ * Report one local interrupt assignment entry of the MP table at
++ * APIC_VERBOSE level; nothing is stored.
++ */
++static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
++{
++      int polarity = m->irqflag & 3;
++      int trigger = (m->irqflag >> 2) & 3;
++
++      apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
++              " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
++              m->irqtype, polarity, trigger, m->srcbusid,
++              m->srcbusirq, m->destapic, m->destapiclint);
++}
++
++/*
++ * Read/parse the MPC
++ */
++static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
++{
++
++      if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
++              printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
++                     mpc->signature[0], mpc->signature[1],
++                     mpc->signature[2], mpc->signature[3]);
++              return 0;
++      }
++      if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
++              printk(KERN_ERR "MPTABLE: checksum error!\n");
++              return 0;
++      }
++      if (mpc->spec != 0x01 && mpc->spec != 0x04) {
++              printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
++                     mpc->spec);
++              return 0;
++      }
++      if (!mpc->lapic) {
++              printk(KERN_ERR "MPTABLE: null local APIC address!\n");
++              return 0;
++      }
++      memcpy(oem, mpc->oem, 8);
++      oem[8] = 0;
++      printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
++
++      memcpy(str, mpc->productid, 12);
++      str[12] = 0;
++
++      printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
++
++#ifndef CONFIG_XEN
++      printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
++#endif
++
++      return 1;
++}
++
++/* Step past one table entry: advance *ptr and *count by size bytes. */
++static void skip_entry(unsigned char **ptr, int *count, int size)
++{
++      *ptr = *ptr + size;
++      *count = *count + size;
++}
++
++/*
++ * Complain about the table entry at mpt, printing its type byte, and
++ * hex-dump the whole table (mpc->length bytes) at KERN_ERR level.
++ */
++static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
++{
++      unsigned char entry_type = *mpt;
++
++      printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
++              "type %x\n", entry_type);
++      print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
++                      1, mpc, mpc->length, 1);
++}
++
++void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } /* empty: does nothing */
++/*
++ * Validate the MP configuration table and, unless early is set, walk
++ * its entries and dispatch each one to its handler.
++ *
++ * Returns 0 when smp_check_mpc() rejects the table, 1 when early is
++ * non-zero (only validation and, without CONFIG_XEN, the local APIC
++ * address registration are done), and otherwise num_processors after
++ * the walk.  An entry of unknown type dumps the table and ends the
++ * walk.
++ */
++static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
++{
++      char str[16];
++      char oem[10];
++
++      int count = sizeof(*mpc);
++      unsigned char *mpt = ((unsigned char *)mpc) + count;
++
++      if (!smp_check_mpc(mpc, oem, str))
++              return 0;
++
++#ifndef CONFIG_XEN
++#ifdef CONFIG_X86_32
++      generic_mps_oem_check(mpc, oem, str);
++#endif
++      /* Initialize the lapic mapping */
++      if (!acpi_lapic)
++              register_lapic_address(mpc->lapic);
++#endif
++
++      if (early)
++              return 1;
++
++      if (mpc->oemptr)
++              x86_init.mpparse.smp_read_mpc_oem(mpc);
++
++      /*
++       *      Now process the configuration blocks.
++       */
++      x86_init.mpparse.mpc_record(0);
++
++      while (count < mpc->length) {
++              switch (*mpt) {
++              case MP_PROCESSOR:
++                      /* ACPI may have already provided this data */
++                      if (!acpi_lapic)
++                              MP_processor_info((struct mpc_cpu *)mpt);
++                      skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
++                      break;
++              case MP_BUS:
++                      MP_bus_info((struct mpc_bus *)mpt);
++                      skip_entry(&mpt, &count, sizeof(struct mpc_bus));
++                      break;
++              case MP_IOAPIC:
++                      MP_ioapic_info((struct mpc_ioapic *)mpt);
++                      skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
++                      break;
++              case MP_INTSRC:
++                      mp_save_irq((struct mpc_intsrc *)mpt);
++                      skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
++                      break;
++              case MP_LINTSRC:
++                      MP_lintsrc_info((struct mpc_lintsrc *)mpt);
++                      skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
++                      break;
++              default:
++                      /* wrong mptable */
++                      smp_dump_mptable(mpc, mpt);
++                      count = mpc->length;
++                      break;
++              }
++              x86_init.mpparse.mpc_record(1);
++      }
++
++      if (!num_processors)
++              printk(KERN_ERR "MPTABLE: no processors registered!\n");
++      return num_processors;
++}
++
++#ifdef CONFIG_X86_IO_APIC
++
++/*
++ * Read the ELCR bit for irq: the register is selected at I/O port
++ * 0x4d0 + irq / 8 and the bit at position irq % 8.  Returns that bit
++ * (0 or 1).
++ */
++static int __init ELCR_trigger(unsigned int irq)
++{
++      unsigned int elcr_port = 0x4d0 + (irq >> 3);
++      unsigned int bit = irq & 7;
++
++      return (inb(elcr_port) >> bit) & 1;
++}
++/*
++ * Build the default ISA interrupt routing: one mp_INT entry per IRQ
++ * 0-15 (minus the ones skipped in the loop) targeting mp_ioapics[0],
++ * followed by one mp_ExtINT entry on pin 0.  Every entry is stored
++ * with mp_save_irq().  For mpc_default_type 5 the ELCR is consulted to
++ * mark level-triggered IRQs, provided its contents pass the sanity
++ * check described below.
++ */
++static void __init construct_default_ioirq_mptable(int mpc_default_type)
++{
++      struct mpc_intsrc intsrc;
++      int i;
++      int ELCR_fallback = 0;
++
++      intsrc.type = MP_INTSRC;
++      intsrc.irqflag = 0;     /* conforming */
++      intsrc.srcbus = 0;
++      intsrc.dstapic = mp_ioapics[0].apicid;
++
++      intsrc.irqtype = mp_INT;
++
++      /*
++       *  If true, we have an ISA/PCI system with no IRQ entries
++       *  in the MP table. To prevent the PCI interrupts from being set up
++       *  incorrectly, we try to use the ELCR. The sanity check to see if
++       *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
++       *  never be level sensitive, so we simply see if the ELCR agrees.
++       *  If it does, we assume it's valid.
++       */
++      if (mpc_default_type == 5) {
++              printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
++                     "falling back to ELCR\n");
++
++              if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
++                  ELCR_trigger(13))
++                      printk(KERN_ERR "ELCR contains invalid data... "
++                             "not using ELCR\n");
++              else {
++                      printk(KERN_INFO
++                             "Using ELCR to identify PCI interrupts\n");
++                      ELCR_fallback = 1;
++              }
++      }
++
++      for (i = 0; i < 16; i++) {
++              switch (mpc_default_type) {
++              case 2:
++                      if (i == 0 || i == 13)
++                              continue;       /* IRQ0 & IRQ13 not connected */
++                      /* fall through */
++              default:
++                      if (i == 2)
++                              continue;       /* IRQ2 is never connected */
++              }
++
++              if (ELCR_fallback) {
++                      /*
++                       *  If the ELCR indicates a level-sensitive interrupt, we
++                       *  copy that information over to the MP table in the
++                       *  irqflag field (level sensitive, active high polarity).
++                       */
++                      if (ELCR_trigger(i))
++                              intsrc.irqflag = 13;
++                      else
++                              intsrc.irqflag = 0;
++              }
++
++              intsrc.srcbusirq = i;
++              intsrc.dstirq = i ? i : 2;      /* IRQ0 to INTIN2 */
++              mp_save_irq(&intsrc);
++      }
++
++      intsrc.irqtype = mp_ExtINT;
++      intsrc.srcbusirq = 0;
++      intsrc.dstirq = 0;      /* 8259A to INTIN0 */
++      mp_save_irq(&intsrc);
++}
++
++/*
++ * Synthesize the bus, IO-APIC and interrupt entries for default MP
++ * configuration mpc_default_type: bus 0 is ISA, EISA or MCA depending
++ * on the type (unknown types are reported and treated as ISA), types
++ * above 4 add PCI as bus 1, and a single IO-APIC with ID 2 at
++ * IO_APIC_DEFAULT_PHYS_BASE is registered.
++ */
++static void __init construct_ioapic_table(int mpc_default_type)
++{
++      struct mpc_ioapic ioapic;
++      struct mpc_bus bus;
++
++      bus.type = MP_BUS;
++      bus.busid = 0;
++      switch (mpc_default_type) {
++      default:
++              printk(KERN_ERR "???\nUnknown standard configuration %d\n",
++                     mpc_default_type);
++              /* fall through */
++      case 1:
++      case 5:
++              memcpy(bus.bustype, "ISA   ", 6);
++              break;
++      case 2:
++      case 6:
++      case 3:
++              memcpy(bus.bustype, "EISA  ", 6);
++              break;
++      case 4:
++      case 7:
++              memcpy(bus.bustype, "MCA   ", 6);
++      }
++      MP_bus_info(&bus);
++      if (mpc_default_type > 4) {
++              bus.busid = 1;
++              memcpy(bus.bustype, "PCI   ", 6);
++              MP_bus_info(&bus);
++      }
++
++      ioapic.type     = MP_IOAPIC;
++      ioapic.apicid   = 2;
++      ioapic.apicver  = mpc_default_type > 4 ? 0x10 : 0x01;
++      ioapic.flags    = MPC_APIC_USABLE;
++      ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE;
++      MP_ioapic_info(&ioapic);
++
++      /*
++       * We set up most of the low 16 IO-APIC pins according to MPS rules.
++       */
++      construct_default_ioirq_mptable(mpc_default_type);
++}
++#else
++static inline void __init construct_ioapic_table(int mpc_default_type) { } /* no-op stub */
++#endif
++/*
++ * Synthesize a complete default MP configuration: two enabled
++ * processors with APIC IDs 0 and 1, the bus/IO-APIC/IRQ entries from
++ * construct_ioapic_table(), and two local interrupt entries (mp_ExtINT
++ * on destapiclint 0, mp_NMI on destapiclint 1) addressed to
++ * MP_APIC_ALL.
++ */
++static inline void __init construct_default_ISA_mptable(int mpc_default_type)
++{
++      struct mpc_cpu processor;
++      struct mpc_lintsrc lintsrc;
++      int linttypes[2] = { mp_ExtINT, mp_NMI };
++      int i;
++
++#ifndef CONFIG_XEN
++      /*
++       * local APIC has default address
++       */
++      mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
++#endif
++
++      /*
++       * 2 CPUs, numbered 0 & 1.
++       */
++      processor.type = MP_PROCESSOR;
++      /* Either an integrated APIC or a discrete 82489DX. */
++      processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
++      processor.cpuflag = CPU_ENABLED;
++      processor.cpufeature = (boot_cpu_data.x86 << 8) |
++          (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
++      processor.featureflag = boot_cpu_data.x86_capability[0];
++      processor.reserved[0] = 0;
++      processor.reserved[1] = 0;
++      for (i = 0; i < 2; i++) {
++              processor.apicid = i;
++              MP_processor_info(&processor);
++      }
++
++      construct_ioapic_table(mpc_default_type);
++
++      lintsrc.type = MP_LINTSRC;
++      lintsrc.irqflag = 0;            /* conforming */
++      lintsrc.srcbusid = 0;
++      lintsrc.srcbusirq = 0;
++      lintsrc.destapic = MP_APIC_ALL;
++      for (i = 0; i < 2; i++) {
++              lintsrc.irqtype = linttypes[i];
++              lintsrc.destapiclint = i;
++              MP_lintsrc_info(&lintsrc);
++      }
++}
++
++static struct mpf_intel *mpf_found;    /* set by smp_scan_config() on a match */
++
++/*
++ * Return the length field of the MP configuration table located at
++ * physical address physptr.  The table is mapped temporarily (one
++ * page) just to read the field, and the resulting range is reported
++ * at APIC_VERBOSE level.
++ */
++static unsigned long __init get_mpc_size(unsigned long physptr)
++{
++      struct mpc_table *table = early_ioremap(physptr, PAGE_SIZE);
++      unsigned long len = table->length;
++
++      early_iounmap(table, PAGE_SIZE);
++      apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + len);
++
++      return len;
++}
++/*
++ * Map the MP configuration table referenced by mpf->physptr and parse
++ * it with smp_read_mpc().
++ *
++ * Returns -1 when the table is rejected (smp_found_config is then
++ * cleared under CONFIG_X86_LOCAL_APIC) and also whenever early is set;
++ * returns 0 after a full parse.  If the table supplied no interrupt
++ * entries, an ISA bus 0 and default ISA interrupt routing are
++ * constructed instead (CONFIG_X86_IO_APIC only).
++ */
++static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
++{
++      struct mpc_table *mpc;
++      unsigned long size;
++
++      size = get_mpc_size(mpf->physptr);
++      mpc = early_ioremap(mpf->physptr, size);
++      /*
++       * Read the physical hardware table.  Anything here will
++       * override the defaults.
++       */
++      if (!smp_read_mpc(mpc, early)) {
++#ifdef CONFIG_X86_LOCAL_APIC
++              smp_found_config = 0;
++#endif
++              printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
++                      "... disabling SMP support. (tell your hw vendor)\n");
++              early_iounmap(mpc, size);
++              return -1;
++      }
++      early_iounmap(mpc, size);
++
++      if (early)
++              return -1;
++
++#ifdef CONFIG_X86_IO_APIC
++      /*
++       * If there are no explicit MP IRQ entries, then we are
++       * broken.  We set up most of the low 16 IO-APIC pins to
++       * ISA defaults and hope it will work.
++       */
++      if (!mp_irq_entries) {
++              struct mpc_bus bus;
++
++              printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
++                     "using default mptable. (tell your hw vendor)\n");
++
++              bus.type = MP_BUS;
++              bus.busid = 0;
++              memcpy(bus.bustype, "ISA   ", 6);
++              MP_bus_info(&bus);
++
++              construct_default_ioirq_mptable(0);
++      }
++#endif
++
++      return 0;
++}
++
++/*
++ * Scan the memory blocks for an SMP configuration block.
++ *
++ * Acts on the floating pointer structure recorded in mpf_found and
++ * returns at once if there is none, or if both acpi_lapic and
++ * acpi_ioapic are set.  A non-zero feature1 selects one of the default
++ * configurations; otherwise the table at physptr is parsed through
++ * check_physptr().
++ *
++ * Under CONFIG_XEN early must be 0 (enforced with BUG_ON), and the
++ * name is then redefined as the constant 0 for the rest of the
++ * function, so every later test of early is decided at compile time.
++ */
++void __init default_get_smp_config(unsigned int early)
++{
++      struct mpf_intel *mpf = mpf_found;
++
++      if (!mpf)
++              return;
++
++#ifdef CONFIG_XEN
++      BUG_ON(early);
++#define early 0
++#endif
++
++      if (acpi_lapic && early)
++              return;
++
++      /*
++       * MPS doesn't support hyperthreading, aka only have
++       * thread 0 apic id in MPS table
++       */
++      if (acpi_lapic && acpi_ioapic)
++              return;
++
++      printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
++             mpf->specification);
++#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
++      if (mpf->feature2 & (1 << 7)) {
++              printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
++              pic_mode = 1;
++      } else {
++              printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
++              pic_mode = 0;
++      }
++#endif
++      /*
++       * Now see if we need to read further.
++       */
++      if (mpf->feature1 != 0) {
++#ifndef CONFIG_XEN
++              if (early) {
++                      /*
++                       * local APIC has default address
++                       */
++                      mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
++                      return;
++              }
++#endif
++
++              printk(KERN_INFO "Default MP configuration #%d\n",
++                     mpf->feature1);
++              construct_default_ISA_mptable(mpf->feature1);
++
++      } else if (mpf->physptr) {
++              if (check_physptr(mpf, early))
++                      return;
++      } else
++              BUG();
++
++      if (!early)
++              printk(KERN_INFO "Processors: %d\n", num_processors);
++      /*
++       * Only use the first configuration found.
++       */
++#undef early
++}
++
++#ifndef CONFIG_XEN
++/*
++ * Reserve the memory occupied by the MP configuration table that
++ * mpf->physptr points to; its size comes from get_mpc_size().
++ */
++static void __init smp_reserve_memory(struct mpf_intel *mpf)
++{
++      unsigned long mpc_len = get_mpc_size(mpf->physptr);
++
++      memblock_x86_reserve_range(mpf->physptr, mpf->physptr + mpc_len,
++                                 "* MP-table mpc");
++}
++#endif
++
++/*
++ * Scan length bytes starting at bus address base for an MP floating
++ * pointer structure, probing one candidate every 16 bytes.
++ *
++ * A candidate matches when it carries the SMP_MAGIC_IDENT signature,
++ * has length 1, checksums to zero over its 16 bytes and declares spec
++ * revision 1 or 4.  On a match the structure is remembered in
++ * mpf_found (and smp_found_config is set under CONFIG_X86_LOCAL_APIC).
++ * Without CONFIG_XEN the structure and the table it points to are
++ * also reserved in memblock.
++ *
++ * Returns 1 if a structure was found, 0 otherwise.
++ */
++static int __init smp_scan_config(unsigned long base, unsigned long length)
++{
++      unsigned int *bp = _bus_to_virt(base);
++      struct mpf_intel *mpf;
++#ifndef CONFIG_XEN
++      unsigned long mem;
++#endif
++
++      apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
++                      bp, length);
++      BUILD_BUG_ON(sizeof(*mpf) != 16);
++
++      /*
++       * length is unsigned: only probe while a whole 16-byte structure
++       * still fits, so that a length which is not a multiple of 16 can
++       * neither read past the window nor wrap around below zero.
++       */
++      while (length >= sizeof(*mpf)) {
++              mpf = (struct mpf_intel *)bp;
++              if ((*bp == SMP_MAGIC_IDENT) &&
++                  (mpf->length == 1) &&
++                  !mpf_checksum((unsigned char *)bp, 16) &&
++                  ((mpf->specification == 1)
++                   || (mpf->specification == 4))) {
++#ifdef CONFIG_X86_LOCAL_APIC
++                      smp_found_config = 1;
++#endif
++                      mpf_found = mpf;
++
++#ifndef CONFIG_XEN
++                      printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
++                             mpf, (u64)virt_to_phys(mpf));
++
++                      mem = virt_to_phys(mpf);
++                      memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
++                      if (mpf->physptr)
++                              smp_reserve_memory(mpf);
++#else
++                      printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
++                             mpf, ((void *)bp - _bus_to_virt(base)) + base);
++#endif
++                      return 1;
++              }
++              bp += 4;        /* four unsigned ints: the next 16-byte slot */
++              length -= sizeof(*mpf);
++      }
++      return 0;
++}
++/*
++ * Look for the MP floating pointer structure in the conventional
++ * low-memory locations listed below, stopping at the first match.
++ * The EBDA scan is compiled out under CONFIG_XEN.
++ */
++void __init default_find_smp_config(void)
++{
++#ifndef CONFIG_XEN
++      unsigned int address;
++#endif
++
++      /*
++       * FIXME: Linux assumes you have 640K of base ram..
++       * this continues the error...
++       *
++       * 1) Scan the bottom 1K for a signature
++       * 2) Scan the top 1K of base RAM
++       * 3) Scan the 64K of bios
++       */
++      if (smp_scan_config(0x0, 0x400) ||
++          smp_scan_config(639 * 0x400, 0x400) ||
++          smp_scan_config(0xF0000, 0x10000))
++              return;
++      /*
++       * If it is an SMP machine we should know now, unless the
++       * configuration is in an EISA/MCA bus machine with an
++       * extended bios data area.
++       *
++       * there is a real-mode segmented pointer pointing to the
++       * 4K EBDA area at 0x40E, calculate and scan it here.
++       *
++       * NOTE! There are Linux loaders that will corrupt the EBDA
++       * area, and as such this kind of SMP config may be less
++       * trustworthy, simply because the SMP table may have been
++       * stomped on during early boot. These loaders are buggy and
++       * should be fixed.
++       *
++       * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
++       */
++
++#ifndef CONFIG_XEN
++      address = get_bios_ebda();
++      if (address)
++              smp_scan_config(address, 0x400);
++#endif
++}
++
++#ifdef CONFIG_X86_IO_APIC
++static u8 __initdata irq_used[MAX_IRQ_SOURCES];
++
++/*
++ * Find the mp_irqs[] entry that corresponds to MP table entry m.
++ *
++ * Returns 0 when m is not an mp_INT entry with irqflag 0x0f (treated
++ * as legacy), the index of the matching mp_irqs[] entry (which is then
++ * marked in irq_used[]), -2 when the matching entry was already
++ * claimed, and -1 when nothing matches.
++ */
++static int  __init get_MP_intsrc_index(struct mpc_intsrc *m)
++{
++      int idx;
++
++      if (m->irqtype != mp_INT || m->irqflag != 0x0f)
++              return 0;
++
++      /* not legacy */
++
++      for (idx = 0; idx < mp_irq_entries; idx++) {
++              if (mp_irqs[idx].irqtype != mp_INT ||
++                  mp_irqs[idx].irqflag != 0x0f ||
++                  mp_irqs[idx].srcbus != m->srcbus ||
++                  mp_irqs[idx].srcbusirq != m->srcbusirq)
++                      continue;
++
++              /* already claimed */
++              if (irq_used[idx])
++                      return -2;
++
++              irq_used[idx] = 1;
++              return idx;
++      }
++
++      /* not found */
++      return -1;
++}
++
++#define SPARE_SLOT_NUM 20
++
++static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
++
++/*
++ * Reconcile MP table entry m with mp_irqs[].
++ *
++ * A positive index from get_MP_intsrc_index() overwrites m with the
++ * matching mp_irqs[] entry; a result of 0 leaves m untouched.  For the
++ * negative results m is remembered in m_spare[] (as long as there is
++ * room) and *nr_m_spare is bumped, so the slot can be reused later.
++ */
++static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
++{
++      int idx;
++
++      apic_printk(APIC_VERBOSE, "OLD ");
++      print_mp_irq_info(m);
++
++      idx = get_MP_intsrc_index(m);
++      if (idx == 0) {
++              /* legacy, do nothing */
++              return;
++      }
++
++      if (idx > 0) {
++              memcpy(m, &mp_irqs[idx], sizeof(*m));
++              apic_printk(APIC_VERBOSE, "NEW ");
++              print_mp_irq_info(&mp_irqs[idx]);
++              return;
++      }
++
++      /*
++       * not found (-1), or duplicated (-2) are invalid entries,
++       * we need to use the slot later
++       */
++      if (*nr_m_spare >= SPARE_SLOT_NUM)
++              return;
++
++      m_spare[*nr_m_spare] = m;
++      *nr_m_spare += 1;
++}
++
++/*
++ * Check whether the MP table may grow to count bytes.
++ *
++ * mpc_new_phys is the physical address of the replacement buffer
++ * (0 when the table is patched in place) and mpc_new_length the size
++ * of that buffer.
++ *
++ * Returns 0 when the grown table fits into the replacement buffer;
++ * warns and returns -1 when there is no replacement buffer or the
++ * buffer is too small.
++ */
++static int
++check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
++{
++      /*
++       * count must not exceed the buffer size, otherwise the caller
++       * would copy the appended entry past the end of the buffer.
++       */
++      if (!mpc_new_phys || count > mpc_new_length) {
++              WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
++              return -1;
++      }
++
++      return 0;
++}
++#else /* CONFIG_X86_IO_APIC */
++static
++inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} /* no-op stub */
++#endif /* CONFIG_X86_IO_APIC */
++/*
++ * Rewrite the interrupt source entries of the MP table at mpc from
++ * the mp_irqs[] array.
++ *
++ * First pass: every MP_INTSRC entry goes through check_irq_src(); an
++ * entry of unknown type dumps the table and skips the second pass.
++ * Second pass (CONFIG_X86_IO_APIC): each mp_INT entry of mp_irqs[]
++ * with irqflag 0x0f that is not marked in irq_used[] is written into
++ * a spare slot collected by the first pass or, when none is left,
++ * appended to the table if check_slot() allows it.  The table
++ * checksum is recomputed on every exit path.  Always returns 0.
++ */
++static int  __init replace_intsrc_all(struct mpc_table *mpc,
++                                      unsigned long mpc_new_phys,
++                                      unsigned long mpc_new_length)
++{
++#ifdef CONFIG_X86_IO_APIC
++      int i;
++#endif
++      int count = sizeof(*mpc);
++      int nr_m_spare = 0;
++      unsigned char *mpt = ((unsigned char *)mpc) + count;
++
++      printk(KERN_INFO "mpc_length %x\n", mpc->length);
++      while (count < mpc->length) {
++              switch (*mpt) {
++              case MP_PROCESSOR:
++                      skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
++                      break;
++              case MP_BUS:
++                      skip_entry(&mpt, &count, sizeof(struct mpc_bus));
++                      break;
++              case MP_IOAPIC:
++                      skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
++                      break;
++              case MP_INTSRC:
++                      check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
++                      skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
++                      break;
++              case MP_LINTSRC:
++                      skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
++                      break;
++              default:
++                      /* wrong mptable */
++                      smp_dump_mptable(mpc, mpt);
++                      goto out;
++              }
++      }
++
++#ifdef CONFIG_X86_IO_APIC
++      for (i = 0; i < mp_irq_entries; i++) {
++              if (irq_used[i])
++                      continue;
++
++              if (mp_irqs[i].irqtype != mp_INT)
++                      continue;
++
++              if (mp_irqs[i].irqflag != 0x0f)
++                      continue;
++
++              if (nr_m_spare > 0) {
++                      apic_printk(APIC_VERBOSE, "*NEW* found\n");
++                      nr_m_spare--;
++                      memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
++                      m_spare[nr_m_spare] = NULL;
++              } else {
++                      struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
++                      count += sizeof(struct mpc_intsrc);
++                      if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
++                              goto out;
++                      memcpy(m, &mp_irqs[i], sizeof(*m));
++                      mpc->length = count;
++                      mpt += sizeof(struct mpc_intsrc);
++              }
++              print_mp_irq_info(&mp_irqs[i]);
++      }
++#endif
++out:
++      /* update checksum */
++      mpc->checksum = 0;
++      mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length);
++
++      return 0;
++}
++
++int enable_update_mptable;
++/*
++ * Handler for the update_mptable early parameter: turns on
++ * enable_update_mptable and, with CONFIG_PCI, pci_routeirq.  The
++ * parameter value in str is not looked at.
++ */
++static int __init update_mptable_setup(char *str)
++{
++      enable_update_mptable = 1;
++#ifdef CONFIG_PCI
++      pci_routeirq = 1;
++#endif
++      return 0;
++}
++early_param("update_mptable", update_mptable_setup);
++
++static unsigned long __initdata mpc_new_phys;
++static unsigned long mpc_new_length __initdata = 4096;
++
++/* alloc_mptable or alloc_mptable=4k */
++static int __initdata alloc_mptable;
++/*
++ * Handler for the alloc_mptable early parameter.  Enables the MP table
++ * update (and pci_routeirq with CONFIG_PCI), requests a replacement
++ * buffer via alloc_mptable and, when a value was given, parses it with
++ * memparse() into mpc_new_length.
++ */
++static int __init parse_alloc_mptable_opt(char *p)
++{
++      enable_update_mptable = 1;
++#ifdef CONFIG_PCI
++      pci_routeirq = 1;
++#endif
++      alloc_mptable = 1;
++
++      if (p)
++              mpc_new_length = memparse(p, &p);
++
++      return 0;
++}
++early_param("alloc_mptable", parse_alloc_mptable_opt);
++
++/*
++ * Reserve mpc_new_length bytes through early_reserve_e820() for a
++ * replacement MP table and remember the address in mpc_new_phys.
++ * Does nothing unless both enable_update_mptable and alloc_mptable
++ * are set.
++ */
++void __init early_reserve_e820_mpc_new(void)
++{
++      u64 from = 0;
++
++      if (!enable_update_mptable || !alloc_mptable)
++              return;
++
++      mpc_new_phys = early_reserve_e820(from, mpc_new_length, 4);
++}
++/*
++ * Late initcall that rewrites the firmware MP table when
++ * enable_update_mptable is set.
++ *
++ * Bails out when no floating pointer structure was found, when a
++ * default configuration is in use (feature1 != 0), when there is no
++ * table pointer, or when smp_check_mpc() rejects the table.  If a
++ * replacement buffer was reserved (mpc_new_phys) and is large enough,
++ * the table is copied there and mpf->physptr redirected; otherwise the
++ * table is patched in place, provided a test write to its checksum
++ * byte takes effect.  replace_intsrc_all() then performs the actual
++ * update.  Always returns 0.
++ */
++static int __init update_mp_table(void)
++{
++      char str[16];
++      char oem[10];
++      struct mpf_intel *mpf;
++      struct mpc_table *mpc, *mpc_new;
++
++      if (!enable_update_mptable)
++              return 0;
++
++      mpf = mpf_found;
++      if (!mpf)
++              return 0;
++
++      /*
++       * Now see if we need to go further.
++       */
++      if (mpf->feature1 != 0)
++              return 0;
++
++      if (!mpf->physptr)
++              return 0;
++
++      mpc = _bus_to_virt(mpf->physptr);
++
++      if (!smp_check_mpc(mpc, oem, str))
++              return 0;
++
++      printk(KERN_INFO "mpf: %llx\n", (u64)arbitrary_virt_to_machine(mpf));
++      printk(KERN_INFO "physptr: %x\n", mpf->physptr);
++
++      if (mpc_new_phys && mpc->length > mpc_new_length) {
++              mpc_new_phys = 0;
++              printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
++                       mpc_new_length);
++      }
++
++      if (!mpc_new_phys) {
++              unsigned char old, new;
++              /* check if we can change the position */
++              mpc->checksum = 0;
++              old = mpf_checksum((unsigned char *)mpc, mpc->length);
++              mpc->checksum = 0xff;
++              new = mpf_checksum((unsigned char *)mpc, mpc->length);
++              if (old == new) {
++                      printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
++                      return 0;
++              }
++              printk(KERN_INFO "use in-position replacing\n");
++      } else {
++              maddr_t mpc_new_bus;
++
++              mpc_new_bus = phys_to_machine(mpc_new_phys);
++              mpf->physptr = mpc_new_bus;
++              mpc_new = phys_to_virt(mpc_new_phys);
++              memcpy(mpc_new, mpc, mpc->length);
++              mpc = mpc_new;
++              /* check if we can modify that */
++              if (mpc_new_bus - mpf->physptr) {
++                      struct mpf_intel *mpf_new;
++                      /* steal 16 bytes from [0, 1k) */
++                      printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
++                      mpf_new = isa_bus_to_virt(0x400 - 16);
++                      memcpy(mpf_new, mpf, 16);
++                      mpf = mpf_new;
++                      mpf->physptr = mpc_new_bus;
++              }
++              mpf->checksum = 0;
++              mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
++              printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
++      }
++
++      /*
++       * only replace the one with mp_INT and
++       *       MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
++       * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
++       * may need pci=routeirq for all coverage
++       */
++      replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
++
++      return 0;
++}
++
++late_initcall(update_mp_table);
diff --cc arch/x86/kernel/msr-xen.c

index 0000000,0000000..c6adfc2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/msr-xen.c
@@@ -1,0 -1,0 +1,339 @@@
++#ifndef CONFIG_XEN_PRIVILEGED_GUEST
++#include "msr.c"
++#else
++/* ----------------------------------------------------------------------- *
++ *
++ *   Copyright 2010 Novell, Inc.
++ *
++ *   This program is free software; you can redistribute it and/or modify
++ *   it under the terms of the GNU General Public License as published by
++ *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
++ *   USA; either version 2 of the License, or (at your option) any later
++ *   version; incorporated herein by reference.
++ *
++ * ----------------------------------------------------------------------- */
++
++/*
++ * x86 MSR access device
++ *
++ * This device is accessed by lseek() to the appropriate register number
++ * and then read/write in chunks of 8 bytes.  A larger size means multiple
++ * reads or writes of the same register.
++ *
++ * This driver uses /dev/xen/cpu/%d/msr where %d correlates to the minor
++ * number, and on an SMP box will direct the access to pCPU %d.
++ */
++
++static int msr_init(void);
++static void msr_exit(void);
++
++#define msr_init(args...) _msr_init(args)
++#define msr_exit(args...) _msr_exit(args)
++#include "msr.c"
++#undef msr_exit
++#undef msr_init
++
++#include <linux/slab.h>
++#include <xen/pcpu.h>
++
++static struct class *pmsr_class;
++static unsigned int minor_bias = 10;
++static unsigned int nr_xen_cpu_ids;
++static unsigned long *xen_cpu_online_map;
++
++#define PMSR_DEV(cpu) MKDEV(MSR_MAJOR, (cpu) + minor_bias)
++
++/* Translate a pmsr device inode into the physical CPU number it addresses. */
++static unsigned int pmsr_minor(struct inode *inode)
++{
++      /* Device minors are offset by minor_bias; undo that offset here. */
++      unsigned int minor = iminor(inode);
++
++      return minor - minor_bias;
++}
++
++/*
++ * Read the MSR selected by the file position on the physical CPU selected
++ * by the device minor.  The register number is not advanced, so a request
++ * larger than 8 bytes reads the same register repeatedly.
++ *
++ * Returns the number of bytes copied to user space, or a negative error
++ * when nothing could be transferred.
++ */
++static ssize_t pmsr_read(struct file *file, char __user *buf,
++                       size_t count, loff_t *ppos)
++{
++      unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode);
++      u32 __user *dst = (u32 __user *) buf;
++      u32 reg = *ppos;
++      u32 data[2];
++      ssize_t done = 0;
++      int err = 0;
++
++      /* Transfers happen in whole 8-byte units (two u32 words) only. */
++      if (count % 8)
++              return -EINVAL; /* Invalid chunk size */
++
++      while (count) {
++              err = rdmsr_safe_on_pcpu(cpu, reg, &data[0], &data[1]);
++              if (err)
++                      break;
++              if (copy_to_user(dst, &data, 8)) {
++                      err = -EFAULT;
++                      break;
++              }
++              dst += 2;
++              done += 8;
++              count -= 8;
++      }
++
++      /* A partial transfer is reported as success with its byte count. */
++      if (done)
++              return done;
++      return err;
++}
++
++/*
++ * Write the MSR selected by the file position on the physical CPU selected
++ * by the device minor.  The register number is not advanced, so a request
++ * larger than 8 bytes writes the same register repeatedly.
++ *
++ * Returns the number of bytes consumed from user space, or a negative
++ * error when nothing could be transferred.
++ */
++static ssize_t pmsr_write(struct file *file, const char __user *buf,
++                        size_t count, loff_t *ppos)
++{
++      unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode);
++      const u32 __user *src = (const u32 __user *)buf;
++      u32 reg = *ppos;
++      u32 data[2];
++      ssize_t done = 0;
++      int err = 0;
++
++      /* Transfers happen in whole 8-byte units (two u32 words) only. */
++      if (count % 8)
++              return -EINVAL; /* Invalid chunk size */
++
++      while (count) {
++              if (copy_from_user(&data, src, 8)) {
++                      err = -EFAULT;
++                      break;
++              }
++              err = wrmsr_safe_on_pcpu(cpu, reg, data[0], data[1]);
++              if (err)
++                      break;
++              src += 2;
++              done += 8;
++              count -= 8;
++      }
++
++      /* A partial transfer is reported as success with its byte count. */
++      if (done)
++              return done;
++      return err;
++}
++
++/*
++ * ioctl handler for the physical-CPU MSR device.
++ *
++ * X86_IOC_RDMSR_REGS and X86_IOC_WRMSR_REGS copy an array of eight u32
++ * values in from user space, hand it to the matching *_regs_on_pcpu()
++ * helper for the CPU selected by the device minor, and copy the array
++ * back out.  The file must have been opened for reading (RDMSR) or for
++ * writing (WRMSR), otherwise -EBADF is returned.  A failed user copy
++ * yields -EFAULT and any other command -ENOTTY.
++ */
++static long pmsr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
++{
++      u32 __user *uregs = (u32 __user *)arg;
++      u32 regs[8];
++      unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode);
++      int err;
++
++      switch (ioc) {
++      case X86_IOC_RDMSR_REGS:
++              if (!(file->f_mode & FMODE_READ)) {
++                      err = -EBADF;
++                      break;
++              }
++              if (copy_from_user(&regs, uregs, sizeof regs)) {
++                      err = -EFAULT;
++                      break;
++              }
++              err = rdmsr_safe_regs_on_pcpu(cpu, regs);
++              if (err)
++                      break;
++              if (copy_to_user(uregs, &regs, sizeof regs))
++                      err = -EFAULT;
++              break;
++
++      case X86_IOC_WRMSR_REGS:
++              if (!(file->f_mode & FMODE_WRITE)) {
++                      err = -EBADF;
++                      break;
++              }
++              if (copy_from_user(&regs, uregs, sizeof regs)) {
++                      err = -EFAULT;
++                      break;
++              }
++              err = wrmsr_safe_regs_on_pcpu(cpu, regs);
++              if (err)
++                      break;
++              /* The register array is copied back after a write as well. */
++              if (copy_to_user(uregs, &regs, sizeof regs))
++                      err = -EFAULT;
++              break;
++
++      default:
++              err = -ENOTTY;
++              break;
++      }
++
++      return err;
++}
++
++/*
++ * Allow the open only when the minor maps to a physical CPU that is inside
++ * the tracked range and currently marked online.
++ */
++static int pmsr_open(struct inode *inode, struct file *file)
++{
++      unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode);
++
++      if (cpu < nr_xen_cpu_ids && test_bit(cpu, xen_cpu_online_map))
++              return 0;
++
++      return -ENXIO;  /* No such CPU */
++}
++
++/*
++ * File operations we support
++ */
++static const struct file_operations pmsr_fops = {
++      .owner = THIS_MODULE,
++      /* msr_seek is not defined in this file; presumably from the included msr.c */
++      .llseek = msr_seek,
++      .read = pmsr_read,
++      .write = pmsr_write,
++      .open = pmsr_open,
++      /* the same handler serves both native and compat ioctl entry points */
++      .unlocked_ioctl = pmsr_ioctl,
++      .compat_ioctl = pmsr_ioctl,
++};
++
++/*
++ * Mark physical CPU @cpu online and create its pmsr class device.
++ *
++ * When @cpu lies beyond the currently tracked range the online bitmap is
++ * reallocated to cover it.  Returns 0 on success, -EDOM when the resulting
++ * minor number would not fit into MINORBITS, -ENOMEM when the bitmap
++ * cannot be grown, or the error reported by device_create().
++ */
++static int pmsr_device_create(unsigned int cpu)
++{
++      struct device *dev;
++
++      if (cpu >= nr_xen_cpu_ids) {
++              /* Each of the two warnings below is emitted at most once overall. */
++              static bool warned;
++              unsigned long *map;
++
++              if ((minor_bias + cpu) >> MINORBITS) {
++                      if (!warned) {
++                              warned = true;
++                              pr_warn("Physical MSRs of CPUs beyond %u"
++                                      " will not be accessible\n",
++                                      MINORMASK - minor_bias);
++                      }
++                      return -EDOM;
++              }
++
++              map = kzalloc(BITS_TO_LONGS(cpu + 1) * sizeof(*map),
++                            GFP_KERNEL);
++              if (!map) {
++                      if (!warned) {
++                              warned = true;
++                              pr_warn("Physical MSRs of CPUs beyond %u"
++                                      " may not be accessible\n",
++                                      nr_xen_cpu_ids - 1);
++                      }
++                      return -ENOMEM;
++              }
++
++              /* Carry over the existing bits, then publish the larger map. */
++              memcpy(map, xen_cpu_online_map,
++                     BITS_TO_LONGS(nr_xen_cpu_ids)
++                     * sizeof(*xen_cpu_online_map));
++              nr_xen_cpu_ids = min_t(unsigned int,
++                                   BITS_TO_LONGS(cpu + 1) * BITS_PER_LONG,
++                                   MINORMASK + 1 - minor_bias);
++              /* xchg() installs the new map and hands back the old one to free. */
++              kfree(xchg(&xen_cpu_online_map, map));
++      }
++      /* NOTE(review): the bit stays set even if device_create() fails below. */
++      set_bit(cpu, xen_cpu_online_map);
++      dev = device_create(pmsr_class, NULL, PMSR_DEV(cpu), NULL,
++                          "pmsr%d", cpu);
++      return IS_ERR(dev) ? PTR_ERR(dev) : 0;
++}
++
++/*
++ * Undo pmsr_device_create(): clear the CPU's online bit first so that new
++ * opens are rejected, then remove its class device.
++ */
++static void pmsr_device_destroy(unsigned int cpu)
++{
++      clear_bit(cpu, xen_cpu_online_map);
++      device_destroy(pmsr_class, PMSR_DEV(cpu));
++}
++
++/*
++ * Physical-CPU notifier: create the device node when a CPU comes online
++ * and remove it once the CPU is dead.  Other actions are ignored, and the
++ * result of device creation is not propagated.
++ */
++static int pmsr_cpu_callback(struct notifier_block *nfb,
++                           unsigned long action, void *hcpu)
++{
++      unsigned int cpu = (unsigned long)hcpu;
++
++      if (action == CPU_ONLINE)
++              pmsr_device_create(cpu);
++      else if (action == CPU_DEAD)
++              pmsr_device_destroy(cpu);
++
++      return NOTIFY_OK;
++}
++
++static struct notifier_block pmsr_cpu_notifier = {
++      .notifier_call = pmsr_cpu_callback,
++};
++
++/* Build the device node path "xen/cpu/<cpu>/msr" for a pmsr device. */
++static char *pmsr_devnode(struct device *dev, mode_t *mode)
++{
++      unsigned int cpu = MINOR(dev->devt) - minor_bias;
++
++      return kasprintf(GFP_KERNEL, "xen/cpu/%u/msr", cpu);
++}
++
++/*
++ * Module init: bring up the native MSR driver via _msr_init() and, when
++ * running as the initial Xen domain, additionally register the devices
++ * giving access to the MSRs of physical CPUs.
++ *
++ * A failure in the physical-CPU part is logged but never fails the module
++ * load; only an error from _msr_init() is returned to the caller.
++ */
++static int __init msr_init(void)
++{
++      int err;
++      xen_platform_op_t op = {
++              .cmd                   = XENPF_get_cpuinfo,
++              .interface_version     = XENPF_INTERFACE_VERSION,
++              .u.pcpu_info.xen_cpuid = 0
++      };
++
++      err = _msr_init();
++      if (err || !is_initial_xendomain())
++              return err;
++
++      /* Query the hypervisor, retrying for as long as it reports -EBUSY. */
++      do {
++              err = HYPERVISOR_platform_op(&op);
++      } while (err == -EBUSY);
++      if (err)
++              goto out;
++      /* Round the CPU count up to a whole number of bitmap words. */
++      nr_xen_cpu_ids = BITS_TO_LONGS(op.u.pcpu_info.max_present + 1)
++                       * BITS_PER_LONG;
++
++      /*
++       * Use the smallest power of ten not below NR_CPUS as minor offset.
++       * If the physical CPUs then do not fit into the minor space, fall
++       * back to NR_CPUS as offset, and as a last resort cap the number of
++       * physical CPUs being tracked.
++       */
++      while (minor_bias < NR_CPUS)
++              minor_bias *= 10;
++      if ((minor_bias + nr_xen_cpu_ids - 1) >> MINORBITS)
++              minor_bias = NR_CPUS;
++      if ((minor_bias + nr_xen_cpu_ids - 1) >> MINORBITS)
++              nr_xen_cpu_ids = MINORMASK + 1 - NR_CPUS;
++
++      xen_cpu_online_map = kzalloc(BITS_TO_LONGS(nr_xen_cpu_ids)
++                                   * sizeof(*xen_cpu_online_map),
++                                   GFP_KERNEL);
++      if (!xen_cpu_online_map) {
++              err = -ENOMEM;
++              goto out;
++      }
++
++      /*
++       * Keep the error code so that the common exit path reports it like
++       * every other failure instead of seeing err == 0.
++       */
++      err = __register_chrdev(MSR_MAJOR, minor_bias,
++                              MINORMASK + 1 - minor_bias,
++                              "pcpu/msr", &pmsr_fops);
++      if (err) {
++              pr_err("msr: unable to get minors for pmsr\n");
++              goto out;
++      }
++      pmsr_class = class_create(THIS_MODULE, "pmsr");
++      if (IS_ERR(pmsr_class)) {
++              err = PTR_ERR(pmsr_class);
++              goto out_chrdev;
++      }
++      pmsr_class->devnode = pmsr_devnode;
++      err = register_pcpu_notifier(&pmsr_cpu_notifier);
++
++      if (!err && !nr_xen_cpu_ids)
++              err = -ENODEV;
++      if (!err)
++              return 0;
++
++      class_destroy(pmsr_class);
++
++out_chrdev:
++      __unregister_chrdev(MSR_MAJOR, minor_bias,
++                          MINORMASK + 1 - minor_bias, "pcpu/msr");
++out:
++      if (err)
++              pr_warn("msr: can't initialize physical MSR access (%d)\n",
++                      err);
++      /* A zero CPU count makes pmsr_open() and msr_exit() skip the pmsr part. */
++      nr_xen_cpu_ids = 0;
++      kfree(xen_cpu_online_map);
++      /* Do not leave a dangling pointer to the freed bitmap behind. */
++      xen_cpu_online_map = NULL;
++      return 0;
++}
++
++/*
++ * Module exit: tear down the physical-CPU MSR devices (if they were set
++ * up) and then the native MSR driver via _msr_exit().
++ */
++static void __exit msr_exit(void)
++{
++      if (nr_xen_cpu_ids) {
++              unsigned int cpu = 0;
++
++              unregister_pcpu_notifier(&pmsr_cpu_notifier);
++              /*
++               * Remove the pmsr devices created by pmsr_device_create().
++               * This must use the pmsr helper: the bitmap is indexed by
++               * physical CPU and the devices live in pmsr_class at minors
++               * offset by minor_bias.
++               */
++              for_each_set_bit(cpu, xen_cpu_online_map, nr_xen_cpu_ids)
++                      pmsr_device_destroy(cpu);
++              class_destroy(pmsr_class);
++              __unregister_chrdev(MSR_MAJOR, minor_bias,
++                                  MINORMASK + 1 - minor_bias, "pcpu/msr");
++              kfree(xen_cpu_online_map);
++      }
++      _msr_exit();
++}
++#endif /* CONFIG_XEN_PRIVILEGED_GUEST */
diff --cc arch/x86/kernel/pci-dma-xen.c

index 0000000,0000000..35e62f1

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/pci-dma-xen.c
@@@ -1,0 -1,0 +1,414 @@@
++#include <linux/dma-mapping.h>
++#include <linux/dma-debug.h>
++#include <linux/dmar.h>
++#include <linux/bootmem.h>
++#include <linux/gfp.h>
++#include <linux/pci.h>
++#include <linux/kmemleak.h>
++
++#include <asm/proto.h>
++#include <asm/dma.h>
++#include <asm/iommu.h>
++#include <asm/gart.h>
++#include <asm/calgary.h>
++#include <asm/x86_init.h>
++#include <asm/iommu_table.h>
++
++static int forbid_dac __read_mostly;
++
++struct dma_map_ops *dma_ops = &nommu_dma_ops;
++EXPORT_SYMBOL(dma_ops);
++
++static int iommu_sac_force __read_mostly;
++
++#ifdef CONFIG_IOMMU_DEBUG
++int panic_on_overflow __read_mostly = 1;
++int force_iommu __read_mostly = 1;
++#else
++int panic_on_overflow __read_mostly = 0;
++int force_iommu __read_mostly = 0;
++#endif
++
++int iommu_merge __read_mostly = 0;
++
++int no_iommu __read_mostly;
++/* Set this to 1 if there is a HW IOMMU in the system */
++int iommu_detected __read_mostly = 0;
++
++/*
++ * This variable becomes 1 if iommu=pt is passed on the kernel command line.
++ * If this variable is 1, IOMMU implementations do no DMA translation for
++ * devices and allow every device to access the whole physical memory. This is
++ * useful if a user wants to use an IOMMU only for KVM device assignment to
++ * guests and not for driver dma translation.
++ */
++int iommu_pass_through __read_mostly;
++
++extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
++
++/* Dummy device used for NULL arguments (normally ISA). */
++struct device x86_dma_fallback_dev = {
++      .init_name = "fallback device",
++      .coherent_dma_mask = ISA_DMA_BIT_MASK,
++      .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
++};
++EXPORT_SYMBOL(x86_dma_fallback_dev);
++
++/* Number of entries preallocated for DMA-API debugging */
++#define PREALLOC_DMA_DEBUG_ENTRIES       32768
++
++/*
++ * Set the streaming DMA mask of @dev.  Returns -EIO when the device has no
++ * dma_mask storage or when dma_supported() rejects @mask, 0 otherwise.
++ */
++int dma_set_mask(struct device *dev, u64 mask)
++{
++      if (dev->dma_mask && dma_supported(dev, mask)) {
++              *dev->dma_mask = mask;
++              return 0;
++      }
++
++      return -EIO;
++}
++EXPORT_SYMBOL(dma_set_mask);
++
++#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA) && !defined(CONFIG_XEN)
++static __initdata void *dma32_bootmem_ptr;
++static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
++
++/* Parse the "dma32_size=" early parameter into dma32_bootmem_size. */
++static int __init parse_dma32_size_opt(char *p)
++{
++      if (p) {
++              dma32_bootmem_size = memparse(p, &p);
++              return 0;
++      }
++
++      return -EINVAL;
++}
++early_param("dma32_size", parse_dma32_size_opt);
++
++/*
++ * Reserve a block of bootmem, rounded up to a 64M multiple, when the
++ * system has memory above MAX_DMA32_PFN.  On failure the recorded size is
++ * reset to 0 so that dma32_free_bootmem() has nothing to release.
++ */
++void __init dma32_reserve_bootmem(void)
++{
++      unsigned long size, align;
++      if (max_pfn <= MAX_DMA32_PFN)
++              return;
++
++      /*
++       * check aperture_64.c allocate_aperture() for reason about
++       * using 512M as goal
++       */
++      align = 64ULL<<20;
++      size = roundup(dma32_bootmem_size, align);
++      dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
++                               512ULL<<20);
++      /*
++       * Kmemleak should not scan this block as it may not be mapped via the
++       * kernel direct mapping.
++       */
++      kmemleak_ignore(dma32_bootmem_ptr);
++      /* Remember the rounded-up size actually reserved, or 0 on failure. */
++      if (dma32_bootmem_ptr)
++              dma32_bootmem_size = size;
++      else
++              dma32_bootmem_size = 0;
++}
++/* Hand the block reserved by dma32_reserve_bootmem() back to bootmem. */
++static void __init dma32_free_bootmem(void)
++{
++      /* Nothing was reserved in either of these cases. */
++      if (max_pfn <= MAX_DMA32_PFN || !dma32_bootmem_ptr)
++              return;
++
++      free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
++
++      dma32_bootmem_size = 0;
++      dma32_bootmem_ptr = NULL;
++}
++#else
++void __init dma32_reserve_bootmem(void)
++{
++}
++static void __init dma32_free_bootmem(void)
++{
++}
++
++#endif
++
++/*
++ * DMA operations installed as dma_ops by pci_xen_swiotlb_init() when the
++ * software bounce buffer is in use.  Coherent allocations go through the
++ * generic helpers; all mapping and syncing is delegated to swiotlb.
++ */
++static struct dma_map_ops swiotlb_dma_ops = {
++      .alloc_coherent = dma_generic_alloc_coherent,
++      .free_coherent = dma_generic_free_coherent,
++      .mapping_error = swiotlb_dma_mapping_error,
++      .map_page = swiotlb_map_page,
++      .unmap_page = swiotlb_unmap_page,
++      .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
++      .sync_single_for_device = swiotlb_sync_single_for_device,
++      .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
++      .sync_sg_for_device = swiotlb_sync_sg_for_device,
++      .map_sg = swiotlb_map_sg_attrs,
++      .unmap_sg = swiotlb_unmap_sg_attrs,
++      .dma_supported = swiotlb_dma_supported
++};
++
++/* Detection hook for the IOMMU table: unconditionally reports "detected". */
++static int __init pci_xen_swiotlb_detect(void)
++{
++      return 1;
++}
++
++/*
++ * Early-init hook: initialise swiotlb and, if it ended up enabled, switch
++ * the global dma_ops over to the swiotlb implementation.
++ */
++static void __init pci_xen_swiotlb_init(void)
++{
++      swiotlb_init(1);
++      if (!swiotlb)
++              return;
++
++      printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
++      dma_ops = &swiotlb_dma_ops;
++}
++
++IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, NULL, pci_xen_swiotlb_init, NULL);
++
++/*
++ * Walk the sorted IOMMU table, run each entry's detect hook and call the
++ * early_init hook of every entry that reports a hit.  The walk stops after
++ * the first detected entry flagged IOMMU_FINISH_IF_DETECTED.
++ */
++void __init pci_iommu_alloc(void)
++{
++      struct iommu_table_entry *entry;
++
++      /* free the range so iommu could get some range less than 4G */
++      dma32_free_bootmem();
++
++      sort_iommu_table(__iommu_table, __iommu_table_end);
++      check_iommu_entries(__iommu_table, __iommu_table_end);
++
++      for (entry = __iommu_table; entry < __iommu_table_end; entry++) {
++              if (!entry || !entry->detect || entry->detect() <= 0)
++                      continue;
++
++              entry->flags |= IOMMU_DETECTED;
++              if (entry->early_init)
++                      entry->early_init();
++              if (entry->flags & IOMMU_FINISH_IF_DETECTED)
++                      break;
++      }
++}
++/*
++ * Allocate a zeroed, page-order sized buffer for coherent DMA on the
++ * device's node and report its DMA address through @dma_addr.
++ *
++ * Without CONFIG_XEN the pages are requested with __GFP_ZERO and checked
++ * against the coherent mask; if they lie too high and the mask is below
++ * 32 bits, the allocation is retried once with GFP_DMA.
++ *
++ * With CONFIG_XEN the zone modifiers are dropped, the region is exchanged
++ * through xen_create_contiguous_region() using the bit width of the mask
++ * as address limit, and the buffer is cleared with memset() afterwards.
++ *
++ * Returns the kernel virtual address, or NULL on failure.
++ */
++void *dma_generic_alloc_coherent(struct device *dev, size_t size,
++                               dma_addr_t *dma_addr, gfp_t flag)
++{
++      unsigned long dma_mask;
++      struct page *page;
++#ifndef CONFIG_XEN
++      dma_addr_t addr;
++#else
++      void *memory;
++#endif
++      unsigned int order = get_order(size);
++
++      dma_mask = dma_alloc_coherent_mask(dev, flag);
++
++#ifndef CONFIG_XEN
++      flag |= __GFP_ZERO;
++again:
++#else
++      flag &= ~(__GFP_DMA | __GFP_DMA32);
++#endif
++      page = alloc_pages_node(dev_to_node(dev), flag, order);
++      if (!page)
++              return NULL;
++
++#ifndef CONFIG_XEN
++      addr = page_to_phys(page);
++      if (addr + size > dma_mask) {
++              __free_pages(page, order);
++
++              /* Retry from the DMA zone unless that was already requested. */
++              if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
++                      flag = (flag & ~GFP_DMA32) | GFP_DMA;
++                      goto again;
++              }
++
++              return NULL;
++      }
++
++      *dma_addr = addr;
++      return page_address(page);
++#else
++      memory = page_address(page);
++      if (xen_create_contiguous_region((unsigned long)memory, order,
++                                       fls64(dma_mask))) {
++              __free_pages(page, order);
++              return NULL;
++      }
++
++      *dma_addr = virt_to_bus(memory);
++      return memset(memory, 0, size);
++#endif
++}
++
++#ifdef CONFIG_XEN
++/*
++ * Release a buffer obtained from dma_generic_alloc_coherent(): undo the
++ * contiguous-region setup first, then free the pages.  @dev and @dma_addr
++ * are not used.
++ */
++void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
++                             dma_addr_t dma_addr)
++{
++      unsigned int order = get_order(size);
++      unsigned long va = (unsigned long)vaddr;
++
++      xen_destroy_contiguous_region(va, order);
++      free_pages(va, order);
++}
++#endif
++
++/*
++ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
++ * documentation.
++ */
++/*
++ * Parse the comma-separated "iommu=" early parameter.
++ *
++ * iommu_merge is set to 1 before anything else, even when no argument
++ * string was supplied (in which case -EINVAL is returned).  Each option is
++ * matched by prefix with strncmp(), and every token is also passed to
++ * gart_parse_options().  Returns 0 after the whole string was processed,
++ * or 1 as soon as "usedac" is seen.
++ */
++static __init int iommu_setup(char *p)
++{
++      iommu_merge = 1;
++
++      if (!p)
++              return -EINVAL;
++
++      while (*p) {
++              /*
++               * NOTE(review): the comparisons are prefix matches, so a token
++               * such as "forcesac" also satisfies the "force" test below.
++               */
++              if (!strncmp(p, "off", 3))
++                      no_iommu = 1;
++              /* gart_parse_options has more force support */
++              if (!strncmp(p, "force", 5))
++                      force_iommu = 1;
++              if (!strncmp(p, "noforce", 7)) {
++                      iommu_merge = 0;
++                      force_iommu = 0;
++              }
++
++              if (!strncmp(p, "biomerge", 8)) {
++                      iommu_merge = 1;
++                      force_iommu = 1;
++              }
++              if (!strncmp(p, "panic", 5))
++                      panic_on_overflow = 1;
++              if (!strncmp(p, "nopanic", 7))
++                      panic_on_overflow = 0;
++              if (!strncmp(p, "merge", 5)) {
++                      iommu_merge = 1;
++                      force_iommu = 1;
++              }
++              if (!strncmp(p, "nomerge", 7))
++                      iommu_merge = 0;
++              if (!strncmp(p, "forcesac", 8))
++                      iommu_sac_force = 1;
++              if (!strncmp(p, "allowdac", 8))
++                      forbid_dac = 0;
++              if (!strncmp(p, "nodac", 5))
++                      forbid_dac = 1;
++              /* "usedac" ends parsing; any later options are not looked at. */
++              if (!strncmp(p, "usedac", 6)) {
++                      forbid_dac = -1;
++                      return 1;
++              }
++#ifdef CONFIG_SWIOTLB
++              if (!strncmp(p, "soft", 4))
++                      swiotlb = 1;
++#endif
++              if (!strncmp(p, "pt", 2))
++                      iommu_pass_through = 1;
++
++              gart_parse_options(p);
++
++#ifdef CONFIG_CALGARY_IOMMU
++              if (!strncmp(p, "calgary", 7))
++                      use_calgary = 1;
++#endif /* CONFIG_CALGARY_IOMMU */
++
++              /* Advance to the next comma-separated token. */
++              p += strcspn(p, ",");
++              if (*p == ',')
++                      ++p;
++      }
++      return 0;
++}
++early_param("iommu", iommu_setup);
++
++/*
++ * Return 1 when the pages covering @length bytes starting @offset bytes
++ * into page frame @pfn map to consecutive machine frames, 0 otherwise.
++ */
++static int check_pages_physically_contiguous(unsigned long pfn,
++                                           unsigned int offset,
++                                           size_t length)
++{
++      unsigned long expected_mfn = pfn_to_mfn(pfn);
++      int nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
++      int i;
++
++      /* The first page is the reference; compare each following one. */
++      for (i = 1; i < nr_pages; i++) {
++              pfn++;
++              expected_mfn++;
++              if (pfn_to_mfn(pfn) != expected_mfn)
++                      return 0;
++      }
++      return 1;
++}
++
++/*
++ * Return non-zero when [p, p + size) crosses a page boundary and the pages
++ * involved are not backed by consecutive machine frames.
++ */
++int range_straddles_page_boundary(paddr_t p, size_t size)
++{
++      unsigned long pfn = p >> PAGE_SHIFT;
++      unsigned int offset = p & ~PAGE_MASK;
++
++      /* A range that stays within one page never straddles. */
++      if (offset + size <= PAGE_SIZE)
++              return 0;
++
++      return !check_pages_physically_contiguous(pfn, offset, size);
++}
++
++/*
++ * Report whether @dev can do DMA with the given @mask.
++ *
++ * With CONFIG_PCI, masks wider than 32 bits are refused while DAC is
++ * forbidden.  Otherwise the decision is left to the dma_supported hook of
++ * the device's DMA ops when one exists; without a hook, masks narrower
++ * than 24 bits are rejected, as are masks of 40 bits or more when
++ * iommu_sac_force is set.  Returns 1 when supported, 0 when not, or the
++ * hook's result.
++ */
++int dma_supported(struct device *dev, u64 mask)
++{
++      struct dma_map_ops *ops = get_dma_ops(dev);
++
++#ifdef CONFIG_PCI
++      if (mask > 0xffffffff && forbid_dac > 0) {
++              dev_info(dev, "PCI: Disallowing DAC for device\n");
++              return 0;
++      }
++#endif
++
++      if (ops->dma_supported)
++              return ops->dma_supported(dev, mask);
++
++      /* Copied from i386. Doesn't make much sense, because it will
++         only work for pci_alloc_coherent.
++         The caller just has to use GFP_DMA in this case. */
++      if (mask < DMA_BIT_MASK(24))
++              return 0;
++
++      /* Tell the device to use SAC when IOMMU force is on.  This
++         allows the driver to use cheaper accesses in some cases.
++
++         Problem with this is that if we overflow the IOMMU area and
++         return DAC as fallback address the device may not handle it
++         correctly.
++
++         As a special case some controllers have a 39bit address
++         mode that is as efficient as 32bit (aic79xx). Don't force
++         SAC for these.  Assume all masks <= 40 bits are of this
++         type. Normally this doesn't make any difference, but gives
++         more gentle handling of IOMMU overflow. */
++      if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {
++              dev_info(dev, "Force SAC with mask %Lx\n", mask);
++              return 0;
++      }
++
++      return 1;
++}
++EXPORT_SYMBOL(dma_supported);
++
++/*
++ * Late IOMMU setup: initialise DMA-API debugging, run the platform
++ * iommu_init hook and then the late_init hook of every IOMMU table entry
++ * that was marked as detected.  Always returns 0.
++ */
++static int __init pci_iommu_init(void)
++{
++      struct iommu_table_entry *entry;
++
++      dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
++
++#ifdef CONFIG_PCI
++      dma_debug_add_bus(&pci_bus_type);
++#endif
++      x86_init.iommu.iommu_init();
++
++      for (entry = __iommu_table; entry < __iommu_table_end; entry++) {
++              if (!entry || !(entry->flags & IOMMU_DETECTED))
++                      continue;
++              if (entry->late_init)
++                      entry->late_init();
++      }
++
++      return 0;
++}
++/* Must execute after PCI subsystem */
++rootfs_initcall(pci_iommu_init);
++
++#ifdef CONFIG_PCI
++/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
++
++/*
++ * PCI fixup: forbid DAC when a VIA PCI-to-PCI bridge is found, unless the
++ * DAC policy has already been changed from its default of 0.
++ */
++static __devinit void via_no_dac(struct pci_dev *dev)
++{
++      if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI || forbid_dac != 0)
++              return;
++
++      dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
++      forbid_dac = 1;
++}
++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
++#endif
diff --cc arch/x86/kernel/pci-nommu-xen.c

index 0000000,0000000..9dc9d8e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/pci-nommu-xen.c
@@@ -1,0 -1,0 +1,114 @@@
++#include <linux/dma-mapping.h>
++#include <linux/dmar.h>
++#include <linux/bootmem.h>
++#include <linux/pci.h>
++
++#include <xen/gnttab.h>
++
++#include <asm/iommu.h>
++#include <asm/proto.h>
++#include <asm/dma.h>
++#include <asm/swiotlb.h>
++#include <asm/tlbflush.h>
++#include <asm/gnttab_dma.h>
++#include <asm/bug.h>
++
++#define IOMMU_BUG_ON(test)                            \
++do {                                                  \
++      if (unlikely(test)) {                           \
++              printk(KERN_ALERT "Fatal DMA error! "   \
++                     "Please use 'swiotlb=force'\n"); \
++              BUG();                                  \
++      }                                               \
++} while (0)
++
++/*
++ * Map a scatterlist for DMA without an IOMMU.
++ *
++ * Every entry gets its DMA address from gnttab_dma_map_page() plus the
++ * entry's offset.  The kernel BUGs when an entry has no page, when the
++ * device cannot reach the resulting address, or when the entry straddles
++ * a page boundary (see range_straddles_page_boundary()).  Returns @nents.
++ */
++static int
++gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
++            enum dma_data_direction dir, struct dma_attrs *attrs)
++{
++      unsigned int i;
++      struct scatterlist *sg;
++
++      WARN_ON(nents == 0 || sgl->length == 0);
++
++      for_each_sg(sgl, sg, nents, i) {
++              BUG_ON(!sg_page(sg));
++              sg->dma_address =
++                      gnttab_dma_map_page(sg_page(sg)) + sg->offset;
++              sg->dma_length  = sg->length;
++              IOMMU_BUG_ON(!dma_capable(
++                      hwdev, sg->dma_address, sg->length));
++              IOMMU_BUG_ON(range_straddles_page_boundary(
++                      page_to_pseudophys(sg_page(sg)) + sg->offset,
++                      sg->length));
++      }
++
++      return nents;
++}
++
++/*
++ * Undo gnttab_map_sg(): pass the DMA address of every scatterlist entry to
++ * gnttab_dma_unmap_page().
++ */
++static void
++gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
++              enum dma_data_direction dir, struct dma_attrs *attrs)
++{
++      unsigned int i;
++      struct scatterlist *sg;
++
++      for_each_sg(sgl, sg, nents, i)
++              gnttab_dma_unmap_page(sg->dma_address);
++}
++
++/*
++ * Map @size bytes at @offset within @page for DMA and return the DMA
++ * address.  The kernel BUGs when the range straddles a page boundary or
++ * when the device cannot reach the resulting address.
++ */
++static dma_addr_t
++gnttab_map_page(struct device *dev, struct page *page, unsigned long offset,
++              size_t size, enum dma_data_direction dir,
++              struct dma_attrs *attrs)
++{
++      dma_addr_t dma;
++
++      WARN_ON(size == 0);
++
++      dma = gnttab_dma_map_page(page) + offset;
++      IOMMU_BUG_ON(range_straddles_page_boundary(page_to_pseudophys(page) +
++                                                 offset, size));
++      IOMMU_BUG_ON(!dma_capable(dev, dma, size));
++
++      return dma;
++}
++
++/* Undo gnttab_map_page(); only the DMA address is needed for that. */
++static void
++gnttab_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
++                enum dma_data_direction dir, struct dma_attrs *attrs)
++{
++      gnttab_dma_unmap_page(dma_addr);
++}
++
++/* Sync for device access: only calls flush_write_buffers(); arguments unused. */
++static void nommu_sync_single_for_device(struct device *dev,
++                      dma_addr_t addr, size_t size,
++                      enum dma_data_direction dir)
++{
++      flush_write_buffers();
++}
++
++
++/* Scatterlist variant of the sync above: only calls flush_write_buffers(). */
++static void nommu_sync_sg_for_device(struct device *dev,
++                      struct scatterlist *sg, int nelems,
++                      enum dma_data_direction dir)
++{
++      flush_write_buffers();
++}
++
++/* Accept every DMA mask; neither the device nor the mask is examined. */
++static int nommu_dma_supported(struct device *hwdev, u64 mask)
++{
++      return 1;
++}
++
++/*
++ * DMA operations for the no-IOMMU case.  Page and scatterlist mapping go
++ * through the grant-table helpers in this file, while coherent allocations
++ * use the generic alloc/free helpers.
++ */
++struct dma_map_ops nommu_dma_ops = {
++      .alloc_coherent         = dma_generic_alloc_coherent,
++      .free_coherent          = dma_generic_free_coherent,
++      .map_page               = gnttab_map_page,
++      .unmap_page             = gnttab_unmap_page,
++      .map_sg                 = gnttab_map_sg,
++      .unmap_sg               = gnttab_unmap_sg,
++      .sync_single_for_device = nommu_sync_single_for_device,
++      .sync_sg_for_device     = nommu_sync_sg_for_device,
++      .dma_supported          = nommu_dma_supported,
++};
diff --cc arch/x86/kernel/pcspeaker.c

index a311ffc,a311ffc..965c549
--- 1/arch/x86/kernel/pcspeaker.c
--- 2/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@@ -6,6 -6,6 +6,11 @@@ static __init int add_pcspkr(void
   {
         struct platform_device *pd;
   
++#ifdef CONFIG_XEN
++      if (!is_initial_xendomain())
++              return 0;
++#endif
++
         pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
   
         return IS_ERR(pd) ? PTR_ERR(pd) : 0;
diff --cc arch/x86/kernel/probe_roms_32.c

index 0000000,071e7fe..3ff0781

mode 000000,100644..100644
--- /dev/null
--- 2/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms_32.c
@@@ -1,0 -1,166 +1,166 @@@
+ #include <linux/sched.h>
+ #include <linux/mm.h>
+ #include <linux/uaccess.h>
+ #include <linux/mmzone.h>
+ #include <linux/ioport.h>
+ #include <linux/seq_file.h>
+ #include <linux/console.h>
+ #include <linux/init.h>
+ #include <linux/edd.h>
+ #include <linux/dmi.h>
+ #include <linux/pfn.h>
+ #include <linux/pci.h>
+ #include <asm/pci-direct.h>
+ 
+ 
+ #include <asm/e820.h>
+ #include <asm/mmzone.h>
+ #include <asm/setup.h>
+ #include <asm/sections.h>
+ #include <asm/io.h>
+ #include <asm/setup_arch.h>
+ 
+ static struct resource system_rom_resource = {
+       .name   = "System ROM",
+       .start  = 0xf0000,
+       .end    = 0xfffff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+ };
+ 
+ static struct resource extension_rom_resource = {
+       .name   = "Extension ROM",
+       .start  = 0xe0000,
+       .end    = 0xeffff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+ };
+ 
+ static struct resource adapter_rom_resources[] = { {
+       .name   = "Adapter ROM",
+       .start  = 0xc8000,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+ }, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+ }, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+ }, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+ }, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+ }, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+ } };
+ 
+ static struct resource video_rom_resource = {
+       .name   = "Video ROM",
+       .start  = 0xc0000,
+       .end    = 0xc7fff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+ };
+ 
+ #define ROMSIGNATURE 0xaa55
+ 
+ /*
+  * Return non-zero when the 16-bit word at @rom can be read safely and
+  * equals ROMSIGNATURE.
+  */
+ static int __init romsignature(const unsigned char *rom)
+ {
+       const unsigned short * const ptr = (const unsigned short *)rom;
+       unsigned short sig;
+ 
+       if (probe_kernel_address(ptr, sig) != 0)
+               return 0;
+ 
+       return sig == ROMSIGNATURE;
+ }
+ 
+ /*
+  * Return non-zero when all @length bytes at @rom could be read and their
+  * 8-bit sum is zero.
+  */
+ static int __init romchecksum(const unsigned char *rom, unsigned long length)
+ {
+       unsigned char sum = 0, c;
+ 
+       /* Stop early when a byte cannot be read; length then stays non-zero. */
+       while (length && probe_kernel_address(rom++, c) == 0) {
+               sum += c;
+               length--;
+       }
+ 
+       return !length && !sum;
+ }
+ 
+ /*
+  * Scan the legacy ROM area and register what is found with iomem_resource:
+  * the video ROM (searched in 2048-byte steps below the first adapter ROM
+  * slot), the system ROM (always), the extension ROM (when its signature
+  * and checksum are valid) and up to ARRAY_SIZE(adapter_rom_resources)
+  * adapter ROMs on 2k boundaries.
+  */
+ void __init probe_roms(void)
+ {
+       const unsigned char *rom;
+       unsigned long start, length, upper;
+       unsigned char c;
+       int i;
+ 
+       /* video rom */
+       upper = adapter_rom_resources[0].start;
+       for (start = video_rom_resource.start; start < upper; start += 2048) {
+               rom = isa_bus_to_virt(start);
+               if (!romsignature(rom))
+                       continue;
+ 
+               video_rom_resource.start = start;
+ 
+               if (probe_kernel_address(rom + 2, c) != 0)
+                       continue;
+ 
+               /* 0 < length <= 0x7f * 512, historically */
+               length = c * 512;
+ 
+               /* if checksum okay, trust length byte */
+               if (length && romchecksum(rom, length))
+                       video_rom_resource.end = start + length - 1;
+ 
+               request_resource(&iomem_resource, &video_rom_resource);
+               break;
+       }
+ 
+       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+       if (start < upper)
+               start = upper;
+ 
+       /* system rom */
+       request_resource(&iomem_resource, &system_rom_resource);
+       upper = system_rom_resource.start;
+ 
+       /* check for extension rom (ignore length byte!) */
- -      rom = isa_bus_to_virt(extension_rom_resource.start);
++      rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
+       if (romsignature(rom)) {
+               length = extension_rom_resource.end - extension_rom_resource.start + 1;
+               if (romchecksum(rom, length)) {
+                       request_resource(&iomem_resource, &extension_rom_resource);
+                       upper = extension_rom_resource.start;
+               }
+       }
+ 
+       /* check for adapter roms on 2k boundaries */
+       for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+               rom = isa_bus_to_virt(start);
+               if (!romsignature(rom))
+                       continue;
+ 
+               if (probe_kernel_address(rom + 2, c) != 0)
+                       continue;
+ 
+               /* 0 < length <= 0x7f * 512, historically */
+               length = c * 512;
+ 
+               /* but accept any length that fits if checksum okay */
+               if (!length || start + length > upper || !romchecksum(rom, length))
+                       continue;
+ 
+               adapter_rom_resources[i].start = start;
+               adapter_rom_resources[i].end = start + length - 1;
+               request_resource(&iomem_resource, &adapter_rom_resources[i]);
+ 
+               start = adapter_rom_resources[i++].end & ~2047UL;
+       }
+ }
+ 
diff --cc arch/x86/kernel/process-xen.c

index 0000000,0000000..417adbb

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/process-xen.c
@@@ -1,0 -1,0 +1,642 @@@
++#include <linux/errno.h>
++#include <linux/kernel.h>
++#include <linux/mm.h>
++#include <linux/smp.h>
++#include <linux/prctl.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <linux/module.h>
++#include <linux/pm.h>
++#include <linux/clockchips.h>
++#include <linux/random.h>
++#include <linux/user-return-notifier.h>
++#include <linux/dmi.h>
++#include <linux/utsname.h>
++#include <trace/events/power.h>
++#include <linux/hw_breakpoint.h>
++#include <asm/cpu.h>
++#include <asm/system.h>
++#include <asm/apic.h>
++#include <asm/syscalls.h>
++#include <asm/idle.h>
++#include <asm/uaccess.h>
++#include <asm/i387.h>
++#include <asm/debugreg.h>
++#include <xen/evtchn.h>
++
++struct kmem_cache *task_xstate_cachep;
++EXPORT_SYMBOL_GPL(task_xstate_cachep);
++
++int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
++{
++      int ret;
++
++      *dst = *src;
++      if (fpu_allocated(&src->thread.fpu)) {
++              memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
++              ret = fpu_alloc(&dst->thread.fpu);
++              if (ret)
++                      return ret;
++              fpu_copy(&dst->thread.fpu, &src->thread.fpu);
++      }
++      return 0;
++}
++
++void free_thread_xstate(struct task_struct *tsk)
++{
++      fpu_free(&tsk->thread.fpu);
++}
++
++void free_thread_info(struct thread_info *ti)
++{
++      free_thread_xstate(ti->task);
++      free_pages((unsigned long)ti, get_order(THREAD_SIZE));
++}
++
++void arch_task_cache_init(void)
++{
++        task_xstate_cachep =
++              kmem_cache_create("task_xstate", xstate_size,
++                                __alignof__(union thread_xstate),
++                                SLAB_PANIC | SLAB_NOTRACK, NULL);
++}
++
++/*
++ * Free current thread data structures etc..
++ */
++void exit_thread(void)
++{
++      struct task_struct *me = current;
++      struct thread_struct *t = &me->thread;
++      unsigned long *bp = t->io_bitmap_ptr;
++
++      if (bp) {
++              struct physdev_set_iobitmap set_iobitmap;
++
++              t->io_bitmap_ptr = NULL;
++              clear_thread_flag(TIF_IO_BITMAP);
++              /*
++               * Careful, clear this in the hypervisor too (zeroed request):
++               */
++              memset(&set_iobitmap, 0, sizeof(set_iobitmap));
++              WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
++                                            &set_iobitmap));
++              t->io_bitmap_max = 0;
++              kfree(bp);
++      }
++}
++
++void show_regs(struct pt_regs *regs)
++{
++      show_registers(regs);
++      show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
++}
++
++void show_regs_common(void)
++{
++      const char *vendor, *product, *board;
++
++      vendor = dmi_get_system_info(DMI_SYS_VENDOR);
++      if (!vendor)
++              vendor = "";
++      product = dmi_get_system_info(DMI_PRODUCT_NAME);
++      if (!product)
++              product = "";
++
++      /* Board Name is optional */
++      board = dmi_get_system_info(DMI_BOARD_NAME);
++
++      printk(KERN_CONT "\n");
++      printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s",
++              current->pid, current->comm, print_tainted(),
++              init_utsname()->release,
++              (int)strcspn(init_utsname()->version, " "),
++              init_utsname()->version);
++      printk(KERN_CONT " %s %s", vendor, product);
++      if (board)
++              printk(KERN_CONT "/%s", board);
++      printk(KERN_CONT "\n");
++}
++
++void flush_thread(void)
++{
++      struct task_struct *tsk = current;
++
++      flush_ptrace_hw_breakpoint(tsk);
++      memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
++      /*
++       * Forget coprocessor state..
++       */
++      tsk->fpu_counter = 0;
++      clear_fpu(tsk);
++      clear_used_math();
++}
++
++static void hard_disable_TSC(void)
++{
++      write_cr4(read_cr4() | X86_CR4_TSD);
++}
++
++void disable_TSC(void)
++{
++      preempt_disable();
++      if (!test_and_set_thread_flag(TIF_NOTSC))
++              /*
++               * Must flip the CPU state synchronously with
++               * TIF_NOTSC in the current running context.
++               */
++              hard_disable_TSC();
++      preempt_enable();
++}
++
++static void hard_enable_TSC(void)
++{
++      write_cr4(read_cr4() & ~X86_CR4_TSD);
++}
++
++static void enable_TSC(void)
++{
++      preempt_disable();
++      if (test_and_clear_thread_flag(TIF_NOTSC))
++              /*
++               * Must flip the CPU state synchronously with
++               * TIF_NOTSC in the current running context.
++               */
++              hard_enable_TSC();
++      preempt_enable();
++}
++
++int get_tsc_mode(unsigned long adr)
++{
++      unsigned int val;
++
++      if (test_thread_flag(TIF_NOTSC))
++              val = PR_TSC_SIGSEGV;
++      else
++              val = PR_TSC_ENABLE;
++
++      return put_user(val, (unsigned int __user *)adr);
++}
++
++int set_tsc_mode(unsigned int val)
++{
++      if (val == PR_TSC_SIGSEGV)
++              disable_TSC();
++      else if (val == PR_TSC_ENABLE)
++              enable_TSC();
++      else
++              return -EINVAL;
++
++      return 0;
++}
++
++void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
++{
++      struct thread_struct *prev, *next;
++
++      prev = &prev_p->thread;
++      next = &next_p->thread;
++
++      if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
++          test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
++              unsigned long debugctl = get_debugctlmsr();
++
++              debugctl &= ~DEBUGCTLMSR_BTF;
++              if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
++                      debugctl |= DEBUGCTLMSR_BTF;
++
++              update_debugctlmsr(debugctl);
++      }
++
++      if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
++          test_tsk_thread_flag(next_p, TIF_NOTSC)) {
++              /* prev and next are different */
++              if (test_tsk_thread_flag(next_p, TIF_NOTSC))
++                      hard_disable_TSC();
++              else
++                      hard_enable_TSC();
++      }
++      propagate_user_return_notify(prev_p, next_p);
++}
++
++int sys_fork(struct pt_regs *regs)
++{
++      return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
++}
++
++/*
++ * This is trivial, and on the face of it looks like it
++ * could equally well be done in user mode.
++ *
++ * Not so, for quite unobvious reasons - register pressure.
++ * In user mode vfork() cannot have a stack frame, and if
++ * done by calling the "clone()" system call directly, you
++ * do not have enough call-clobbered registers to hold all
++ * the information you need.
++ */
++int sys_vfork(struct pt_regs *regs)
++{
++      return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
++                     NULL, NULL);
++}
++
++long
++sys_clone(unsigned long clone_flags, unsigned long newsp,
++        void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
++{
++      if (!newsp)
++              newsp = regs->sp;
++      return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
++}
++
++/*
++ * This gets run with %si containing the
++ * function to call, and %di containing
++ * the "args".
++ */
++extern void kernel_thread_helper(void);
++
++/*
++ * Create a kernel thread
++ */
++int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
++{
++      struct pt_regs regs;
++
++      memset(&regs, 0, sizeof(regs));
++
++      regs.si = (unsigned long) fn;
++      regs.di = (unsigned long) arg;
++
++#ifdef CONFIG_X86_32
++      regs.ds = __USER_DS;
++      regs.es = __USER_DS;
++      regs.fs = __KERNEL_PERCPU;
++      regs.gs = __KERNEL_STACK_CANARY;
++#else
++      regs.ss = __KERNEL_DS;
++#endif
++
++      regs.orig_ax = -1;
++      regs.ip = (unsigned long) kernel_thread_helper;
++      regs.cs = __KERNEL_CS | get_kernel_rpl();
++      regs.flags = X86_EFLAGS_IF | 0x2;
++
++      /* Ok, create the new process.. */
++      return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
++}
++EXPORT_SYMBOL(kernel_thread);
++
++/*
++ * sys_execve() executes a new program.
++ */
++long sys_execve(const char __user *name,
++              const char __user *const __user *argv,
++              const char __user *const __user *envp, struct pt_regs *regs)
++{
++      long error;
++      char *filename;
++
++      filename = getname(name);
++      error = PTR_ERR(filename);
++      if (IS_ERR(filename))
++              return error;
++      error = do_execve(filename, argv, envp, regs);
++
++#ifdef CONFIG_X86_32
++      if (error == 0) {
++              /* Make sure we don't return using sysenter.. */
++                set_thread_flag(TIF_IRET);
++        }
++#endif
++
++      putname(filename);
++      return error;
++}
++
++/*
++ * Idle related variables and functions
++ */
++unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
++EXPORT_SYMBOL(boot_option_idle_override);
++
++/*
++ * Power management idle function, if any..
++ */
++void (*pm_idle)(void);
++EXPORT_SYMBOL(pm_idle);
++
++/*
++ * We use this if we don't have any better
++ * idle routine..
++ */
++void xen_idle(void)
++{
++      trace_power_start(POWER_CSTATE, 1, smp_processor_id());
++      trace_cpu_idle(1, smp_processor_id());
++      current_thread_info()->status &= ~TS_POLLING;
++      /*
++       * TS_POLLING-cleared state must be visible before we
++       * test NEED_RESCHED:
++       */
++      smp_mb();
++
++      if (!need_resched())
++              safe_halt();    /* enables interrupts racelessly */
++      else
++              local_irq_enable();
++      current_thread_info()->status |= TS_POLLING;
++      trace_power_end(smp_processor_id());
++      trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
++}
++#ifdef CONFIG_APM_MODULE
++EXPORT_SYMBOL(default_idle);
++#endif
++
++void stop_this_cpu(void *dummy)
++{
++      local_irq_disable();
++      /*
++       * Remove this CPU:
++       */
++      set_cpu_online(smp_processor_id(), false);
++      disable_all_local_evtchn();
++
++      for (;;) {
++              if (hlt_works(smp_processor_id()))
++                      halt();
++      }
++}
++
++static void do_nothing(void *unused)
++{
++}
++
++/*
++ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
++ * pm_idle and update to new pm_idle value. Required while changing pm_idle
++ * handler on SMP systems.
++ *
++ * Caller must have changed pm_idle to the new value before the call. Old
++ * pm_idle value will not be used by any CPU after the return of this function.
++ */
++void cpu_idle_wait(void)
++{
++      smp_mb();
++      /* kick all the CPUs so that they exit out of pm_idle */
++      smp_call_function(do_nothing, NULL, 1);
++}
++EXPORT_SYMBOL_GPL(cpu_idle_wait);
++
++#ifndef CONFIG_XEN
++/*
++ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
++ * which can obviate IPI to trigger checking of need_resched.
++ * We execute MONITOR against need_resched and enter optimized wait state
++ * through MWAIT. Whenever someone changes need_resched, we would be woken
++ * up from MWAIT (without an IPI).
++ *
++ * New with Core Duo processors, MWAIT can take some hints based on CPU
++ * capability.
++ */
++void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
++{
++      if (!need_resched()) {
++              if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
++                      clflush((void *)&current_thread_info()->flags);
++
++              __monitor((void *)&current_thread_info()->flags, 0, 0);
++              smp_mb();
++              if (!need_resched())
++                      __mwait(ax, cx);
++      }
++}
++
++/* Default MONITOR/MWAIT with no hints, used for default C1 state */
++static void mwait_idle(void)
++{
++      if (!need_resched()) {
++              trace_power_start(POWER_CSTATE, 1, smp_processor_id());
++              trace_cpu_idle(1, smp_processor_id());
++              if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
++                      clflush((void *)&current_thread_info()->flags);
++
++              __monitor((void *)&current_thread_info()->flags, 0, 0);
++              smp_mb();
++              if (!need_resched())
++                      __sti_mwait(0, 0);
++              else
++                      local_irq_enable();
++              trace_power_end(smp_processor_id());
++              trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
++      } else
++              local_irq_enable();
++}
++#endif
++
++/*
++ * On SMP it's slightly faster (but much more power-consuming!)
++ * to poll the ->work.need_resched flag instead of waiting for the
++ * cross-CPU IPI to arrive. Use this option with caution.
++ */
++static void poll_idle(void)
++{
++      trace_power_start(POWER_CSTATE, 0, smp_processor_id());
++      trace_cpu_idle(0, smp_processor_id());
++      local_irq_enable();
++      while (!need_resched())
++              cpu_relax();
++      trace_power_end(smp_processor_id());
++      trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
++}
++
++#ifndef CONFIG_XEN
++/*
++ * mwait selection logic:
++ *
++ * It depends on the CPU. For AMD CPUs that support MWAIT this is
++ * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
++ * then depend on a clock divisor and current Pstate of the core. If
++ * all cores of a processor are in halt state (C1) the processor can
++ * enter the C1E (C1 enhanced) state. If mwait is used this will never
++ * happen.
++ *
++ * idle=mwait overrides this decision and forces the usage of mwait.
++ */
++
++#define MWAIT_INFO                    0x05
++#define MWAIT_ECX_EXTENDED_INFO               0x01
++#define MWAIT_EDX_C1                  0xf0
++
++int mwait_usable(const struct cpuinfo_x86 *c)
++{
++      u32 eax, ebx, ecx, edx;
++
++      if (boot_option_idle_override == IDLE_FORCE_MWAIT)
++              return 1;
++
++      if (c->cpuid_level < MWAIT_INFO)
++              return 0;
++
++      cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
++      /* Check, whether EDX has extended info about MWAIT */
++      if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
++              return 1;
++
++      /*
++       * edx enumerates MONITOR/MWAIT extensions. Check whether
++       * C1 supports MWAIT
++       */
++      return (edx & MWAIT_EDX_C1);
++}
++
++bool c1e_detected;
++EXPORT_SYMBOL(c1e_detected);
++
++static cpumask_var_t c1e_mask;
++
++void c1e_remove_cpu(int cpu)
++{
++      if (c1e_mask != NULL)
++              cpumask_clear_cpu(cpu, c1e_mask);
++}
++
++/*
++ * C1E aware idle routine. We check for C1E active in the interrupt
++ * pending message MSR. If we detect C1E, then we handle it the same
++ * way as C3 power states (local apic timer and TSC stop)
++ */
++static void c1e_idle(void)
++{
++      if (need_resched())
++              return;
++
++      if (!c1e_detected) {
++              u32 lo, hi;
++
++              rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
++
++              if (lo & K8_INTP_C1E_ACTIVE_MASK) {
++                      c1e_detected = true;
++                      if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
++                              mark_tsc_unstable("TSC halt in AMD C1E");
++                      printk(KERN_INFO "System has AMD C1E enabled\n");
++              }
++      }
++
++      if (c1e_detected) {
++              int cpu = smp_processor_id();
++
++              if (!cpumask_test_cpu(cpu, c1e_mask)) {
++                      cpumask_set_cpu(cpu, c1e_mask);
++                      /*
++                       * Force broadcast so ACPI can not interfere.
++                       */
++                      clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
++                                         &cpu);
++                      printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
++                             cpu);
++              }
++              clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
++
++              default_idle();
++
++              /*
++               * The switch back from broadcast mode needs to be
++               * called with interrupts disabled.
++               */
++               local_irq_disable();
++               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
++               local_irq_enable();
++      } else
++              default_idle();
++}
++#endif
++
++void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
++{
++#ifndef CONFIG_XEN
++#ifdef CONFIG_SMP
++      if (pm_idle == poll_idle && smp_num_siblings > 1) {
++              printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
++                      " performance may degrade.\n");
++      }
++#endif
++      if (pm_idle)
++              return;
++
++      if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
++              /*
++               * One CPU supports mwait => All CPUs support mwait
++               */
++              printk(KERN_INFO "using mwait in idle threads.\n");
++              pm_idle = mwait_idle;
++      } else if (cpu_has_amd_erratum(amd_erratum_400)) {
++              /* E400: APIC timer interrupt does not wake up CPU from C1e */
++              printk(KERN_INFO "using C1E aware idle routine\n");
++              pm_idle = c1e_idle;
++      } else
++              pm_idle = default_idle;
++#endif
++}
++
++void __init init_c1e_mask(void)
++{
++#ifndef CONFIG_XEN
++      /* If we're using c1e_idle, we need to allocate c1e_mask. */
++      if (pm_idle == c1e_idle)
++              zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
++#endif
++}
++
++static int __init idle_setup(char *str)
++{
++      if (!str)
++              return -EINVAL;
++
++      if (!strcmp(str, "poll")) {
++              printk("using polling idle threads.\n");
++              pm_idle = poll_idle;
++              boot_option_idle_override = IDLE_POLL;
++#ifndef CONFIG_XEN
++      } else if (!strcmp(str, "mwait")) {
++              boot_option_idle_override = IDLE_FORCE_MWAIT;
++      } else if (!strcmp(str, "halt")) {
++              /*
++               * When the boot option of idle=halt is added, halt is
++               * forced to be used for CPU idle. In such case CPU C2/C3
++               * won't be used again.
++               * The choice is recorded in boot_option_idle_override
++               * as IDLE_HALT.
++               */
++              pm_idle = default_idle;
++              boot_option_idle_override = IDLE_HALT;
++      } else if (!strcmp(str, "nomwait")) {
++              /*
++               * If the boot option of "idle=nomwait" is added,
++               * it means that mwait will be disabled for CPU C2/C3
++               * states. pm_idle is left untouched here; only
++               * boot_option_idle_override is set (to IDLE_NOMWAIT).
++               */
++              boot_option_idle_override = IDLE_NOMWAIT;
++#endif
++      } else
++              return -1;
++
++      return 0;
++}
++early_param("idle", idle_setup);
++
++unsigned long arch_align_stack(unsigned long sp)
++{
++      if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
++              sp -= get_random_int() % 8192;
++      return sp & ~0xf;
++}
++
++unsigned long arch_randomize_brk(struct mm_struct *mm)
++{
++      unsigned long range_end = mm->brk + 0x02000000;
++      return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
++}
++
diff --cc arch/x86/kernel/process_32-xen.c

index 0000000,0000000..eacd15d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/process_32-xen.c
@@@ -1,0 -1,0 +1,473 @@@
++/*
++ *  Copyright (C) 1995  Linus Torvalds
++ *
++ *  Pentium III FXSR, SSE support
++ *    Gareth Hughes <gareth@valinux.com>, May 2000
++ */
++
++/*
++ * This file handles the architecture-dependent parts of process handling..
++ */
++
++#include <linux/stackprotector.h>
++#include <linux/cpu.h>
++#include <linux/errno.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/kernel.h>
++#include <linux/mm.h>
++#include <linux/elfcore.h>
++#include <linux/smp.h>
++#include <linux/stddef.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/user.h>
++#include <linux/interrupt.h>
++#include <linux/delay.h>
++#include <linux/reboot.h>
++#include <linux/init.h>
++#include <linux/mc146818rtc.h>
++#include <linux/module.h>
++#include <linux/kallsyms.h>
++#include <linux/ptrace.h>
++#include <linux/personality.h>
++#include <linux/tick.h>
++#include <linux/percpu.h>
++#include <linux/prctl.h>
++#include <linux/ftrace.h>
++#include <linux/uaccess.h>
++#include <linux/io.h>
++#include <linux/kdebug.h>
++
++#include <asm/pgtable.h>
++#include <asm/system.h>
++#include <asm/ldt.h>
++#include <asm/processor.h>
++#include <asm/i387.h>
++#include <asm/desc.h>
++#ifdef CONFIG_MATH_EMULATION
++#include <asm/math_emu.h>
++#endif
++
++#include <xen/interface/physdev.h>
++
++#include <linux/err.h>
++
++#include <asm/tlbflush.h>
++#include <asm/cpu.h>
++#include <asm/idle.h>
++#include <asm/syscalls.h>
++#include <asm/debugreg.h>
++
++asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
++asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
++
++/*
++ * Return saved PC of a blocked thread.
++ */
++unsigned long thread_saved_pc(struct task_struct *tsk)
++{
++      return ((unsigned long *)tsk->thread.sp)[3];
++}
++
++#ifndef CONFIG_SMP
++static inline void play_dead(void)
++{
++      BUG();
++}
++#endif
++
++/*
++ * The idle thread. There's no useful work to be
++ * done, so just try to conserve power and have a
++ * low exit latency (ie sit in a loop waiting for
++ * somebody to say that they'd like to reschedule)
++ */
++void cpu_idle(void)
++{
++      int cpu = smp_processor_id();
++
++      /*
++       * If we're the non-boot CPU, nothing set the stack canary up
++       * for us.  CPU0 already has it initialized but no harm in
++       * doing it again.  This is a good place for updating it, as
++       * we won't ever return from this function (so the invalid
++       * canaries already on the stack won't ever trigger).
++       */
++      boot_init_stack_canary();
++
++      current_thread_info()->status |= TS_POLLING;
++
++      /* endless idle loop with no priority at all */
++      while (1) {
++              tick_nohz_stop_sched_tick(1);
++              while (!need_resched()) {
++
++                      check_pgt_cache();
++                      rmb();
++
++                      if (cpu_is_offline(cpu))
++                              play_dead();
++
++                      local_irq_disable();
++                      /* Don't trace irqs off for idle */
++                      stop_critical_timings();
++                      xen_idle();
++                      start_critical_timings();
++              }
++              tick_nohz_restart_sched_tick();
++              preempt_enable_no_resched();
++              schedule();
++              preempt_disable();
++      }
++}
++
++void __show_regs(struct pt_regs *regs, int all)
++{
++      unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
++      unsigned long d0, d1, d2, d3, d6, d7;
++      unsigned long sp;
++      unsigned short ss, gs;
++
++      if (user_mode_vm(regs)) {
++              sp = regs->sp;
++              ss = regs->ss & 0xffff;
++              gs = get_user_gs(regs);
++      } else {
++              sp = kernel_stack_pointer(regs);
++              savesegment(ss, ss);
++              savesegment(gs, gs);
++      }
++
++      show_regs_common();
++
++      printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
++                      (u16)regs->cs, regs->ip, regs->flags,
++                      smp_processor_id());
++      print_symbol("EIP is at %s\n", regs->ip);
++
++      printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
++              regs->ax, regs->bx, regs->cx, regs->dx);
++      printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
++              regs->si, regs->di, regs->bp, sp);
++      printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
++             (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
++
++      if (!all)
++              return;
++
++      cr0 = read_cr0();
++      cr2 = read_cr2();
++      cr3 = read_cr3();
++      cr4 = read_cr4_safe();
++      printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
++                      cr0, cr2, cr3, cr4);
++
++      get_debugreg(d0, 0);
++      get_debugreg(d1, 1);
++      get_debugreg(d2, 2);
++      get_debugreg(d3, 3);
++      printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
++                      d0, d1, d2, d3);
++
++      get_debugreg(d6, 6);
++      get_debugreg(d7, 7);
++      printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
++                      d6, d7);
++}
++
++void release_thread(struct task_struct *dead_task)
++{
++      BUG_ON(dead_task->mm);
++      release_vm86_irqs(dead_task);
++}
++
++/*
++ * This gets called before we allocate a new thread and copy
++ * the current task into it.
++ */
++void prepare_to_copy(struct task_struct *tsk)
++{
++      unlazy_fpu(tsk);
++}
++
++int copy_thread(unsigned long clone_flags, unsigned long sp,
++      unsigned long unused,
++      struct task_struct *p, struct pt_regs *regs)
++{
++      struct pt_regs *childregs;
++      struct task_struct *tsk;
++      int err;
++
++      childregs = task_pt_regs(p);
++      *childregs = *regs;
++      childregs->ax = 0;
++      childregs->sp = sp;
++
++      p->thread.sp = (unsigned long) childregs;
++      p->thread.sp0 = (unsigned long) (childregs+1);
++
++      p->thread.ip = (unsigned long) ret_from_fork;
++
++      task_user_gs(p) = get_user_gs(regs);
++
++      p->thread.io_bitmap_ptr = NULL;
++      tsk = current;
++      err = -ENOMEM;
++
++      memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
++
++      if (test_tsk_thread_flag(tsk, TIF_CSTAR))
++              p->thread.ip = (unsigned long) cstar_ret_from_fork;
++      if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
++              p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
++                                              IO_BITMAP_BYTES, GFP_KERNEL);
++              if (!p->thread.io_bitmap_ptr) {
++                      p->thread.io_bitmap_max = 0;
++                      return -ENOMEM;
++              }
++              set_tsk_thread_flag(p, TIF_IO_BITMAP);
++      }
++
++      err = 0;
++
++      /*
++       * Set a new TLS for the child thread?
++       */
++      if (clone_flags & CLONE_SETTLS)
++              err = do_set_thread_area(p, -1,
++                      (struct user_desc __user *)childregs->si, 0);
++
++      p->thread.iopl = current->thread.iopl;
++
++      if (err && p->thread.io_bitmap_ptr) {
++              kfree(p->thread.io_bitmap_ptr);
++              p->thread.io_bitmap_max = 0;
++      }
++      return err;
++}
++
++void
++start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
++{
++      set_user_gs(regs, 0);
++      regs->fs                = 0;
++      set_fs(USER_DS);
++      regs->ds                = __USER_DS;
++      regs->es                = __USER_DS;
++      regs->ss                = __USER_DS;
++      regs->cs                = __USER_CS;
++      regs->ip                = new_ip;
++      regs->sp                = new_sp;
++      /*
++       * Free the old FP and other extended state
++       */
++      free_thread_xstate(current);
++}
++EXPORT_SYMBOL_GPL(start_thread);
++
++/*
++ *    switch_to(x,y) should switch tasks from x to y.
++ *
++ * We fsave/fwait so that an exception goes off at the right time
++ * (as a call from the fsave or fwait in effect) rather than to
++ * the wrong process. Lazy FP saving no longer makes any sense
++ * with modern CPUs, and this simplifies a lot of things (SMP
++ * and UP become the same).
++ *
++ * NOTE! We used to use the x86 hardware context switching. The
++ * reason for not using it any more becomes apparent when you
++ * try to recover gracefully from saved state that is no longer
++ * valid (stale segment register values in particular). With the
++ * hardware task-switch, there is no way to fix up bad state in
++ * a reasonable manner.
++ *
++ * The fact that Intel documents the hardware task-switching to
++ * be slow is a fairly red herring - this code is not noticeably
++ * faster. However, there _is_ some room for improvement here,
++ * so the performance issues may eventually be a valid point.
++ * More important, however, is the fact that this allows us much
++ * more flexibility.
++ *
++ * The return value (in %ax) will be the "prev" task after
++ * the task-switch, and shows up in ret_from_fork in entry.S,
++ * for example.
++ */
++__notrace_funcgraph struct task_struct *
++__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
++{
++      struct thread_struct *prev = &prev_p->thread,
++                               *next = &next_p->thread;
++      int cpu = smp_processor_id();
++#ifndef CONFIG_X86_NO_TSS
++      struct tss_struct *tss = &per_cpu(init_tss, cpu);
++#endif
++      bool preload_fpu;
++#if CONFIG_XEN_COMPAT > 0x030002
++      struct physdev_set_iopl iopl_op;
++      struct physdev_set_iobitmap iobmp_op;
++#else
++      struct physdev_op _pdo[2], *pdo = _pdo;
++#define iopl_op pdo->u.set_iopl
++#define iobmp_op pdo->u.set_iobitmap
++#endif
++      multicall_entry_t _mcl[8], *mcl = _mcl;
++
++      /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
++
++      /*
++       * If the task has used fpu the last 5 timeslices, just do a full
++       * restore of the math state immediately to avoid the trap; the
++       * chances of needing FPU soon are obviously high now
++       */
++      preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
++
++      /*
++       * This is basically '__unlazy_fpu', except that we queue a
++       * multicall to indicate FPU task switch, rather than
++       * synchronously trapping to Xen.
++       */
++      if (task_thread_info(prev_p)->status & TS_USEDFPU) {
++              __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
++              if (!preload_fpu) {
++                      mcl->op      = __HYPERVISOR_fpu_taskswitch;
++                      mcl->args[0] = 1;
++                      mcl++;
++              }
++      }
++#if 0 /* lazy fpu sanity check */
++      else BUG_ON(!(read_cr0() & 8));
++#endif
++
++      /*
++       * Reload sp0.
++       * This is load_sp0(tss, next) with a multicall.
++       */
++      mcl->op      = __HYPERVISOR_stack_switch;
++      mcl->args[0] = __KERNEL_DS;
++      mcl->args[1] = next->sp0;
++      mcl++;
++
++      /*
++       * Load the per-thread Thread-Local Storage descriptor.
++       * This is load_TLS(next, cpu) with multicalls.
++       */
++#define C(i) do {                                                     \
++      if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||    \
++                   next->tls_array[i].b != prev->tls_array[i].b)) {   \
++              mcl->op = __HYPERVISOR_update_descriptor;               \
++              *(u64 *)&mcl->args[0] = arbitrary_virt_to_machine(      \
++                      &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
++              *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i];    \
++              mcl++;                                                  \
++      }                                                               \
++} while (0)
++      C(0); C(1); C(2);
++#undef C
++
++      if (unlikely(prev->iopl != next->iopl)) {
++              iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
++#if CONFIG_XEN_COMPAT > 0x030002
++              mcl->op      = __HYPERVISOR_physdev_op;
++              mcl->args[0] = PHYSDEVOP_set_iopl;
++              mcl->args[1] = (unsigned long)&iopl_op;
++#else
++              mcl->op      = __HYPERVISOR_physdev_op_compat;
++              pdo->cmd     = PHYSDEVOP_set_iopl;
++              mcl->args[0] = (unsigned long)pdo++;
++#endif
++              mcl++;
++      }
++
++      /* If we're going to preload the fpu context, make sure clts
++         is run while we're batching the cpu state updates. */
++      if (preload_fpu) {
++              mcl->op      = __HYPERVISOR_fpu_taskswitch;
++              mcl->args[0] = 0;
++              mcl++;
++      }
++
++      if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
++              set_xen_guest_handle(iobmp_op.bitmap,
++                                   (char *)next->io_bitmap_ptr);
++              iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
++#if CONFIG_XEN_COMPAT > 0x030002
++              mcl->op      = __HYPERVISOR_physdev_op;
++              mcl->args[0] = PHYSDEVOP_set_iobitmap;
++              mcl->args[1] = (unsigned long)&iobmp_op;
++#else
++              mcl->op      = __HYPERVISOR_physdev_op_compat;
++              pdo->cmd     = PHYSDEVOP_set_iobitmap;
++              mcl->args[0] = (unsigned long)pdo++;
++#endif
++              mcl++;
++      }
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++      BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
++#endif
++      BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
++      if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
++              BUG();
++
++      /* we're going to use this soon, after a few expensive things */
++      if (preload_fpu)
++              prefetch(next->fpu.state);
++
++      /*
++       * Now maybe handle debug registers
++       */
++      if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
++                   task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
++              __switch_to_xtra(prev_p, next_p);
++
++      /*
++       * Leave lazy mode, flushing any hypercalls made here.
++       * This must be done before restoring TLS segments so
++       * the GDT and LDT are properly updated, and must be
++       * done before math_state_restore, so the TS bit is up
++       * to date.
++       */
++      arch_end_context_switch(next_p);
++
++      if (preload_fpu)
++              __math_state_restore();
++
++      /*
++       * Restore %gs if needed (which is common)
++       */
++      if (prev->gs | next->gs)
++              lazy_load_gs(next->gs);
++
++      percpu_write(current_task, next_p);
++
++      return prev_p;
++}
++
++#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
++#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
++
++unsigned long get_wchan(struct task_struct *p)
++{
++      unsigned long bp, sp, ip;
++      unsigned long stack_page;
++      int count = 0;
++      if (!p || p == current || p->state == TASK_RUNNING)
++              return 0;
++      stack_page = (unsigned long)task_stack_page(p);
++      sp = p->thread.sp;
++      if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
++              return 0;
++      /* switch_to() (presumably in <asm/system.h>) pushes bp last. */
++      bp = *(unsigned long *) sp;
++      do {
++              if (bp < stack_page || bp > top_ebp+stack_page)
++                      return 0;
++              ip = *(unsigned long *) (bp+4);
++              if (!in_sched_functions(ip))
++                      return ip;
++              bp = *(unsigned long *) bp;
++      } while (count++ < 16);
++      return 0;
++}
++
diff --cc arch/x86/kernel/process_64-xen.c

index 0000000,0000000..db271d8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/process_64-xen.c
@@@ -1,0 -1,0 +1,724 @@@
++/*
++ *  Copyright (C) 1995  Linus Torvalds
++ *
++ *  Pentium III FXSR, SSE support
++ *    Gareth Hughes <gareth@valinux.com>, May 2000
++ *
++ *  X86-64 port
++ *    Andi Kleen.
++ *
++ *    CPU hotplug support - ashok.raj@intel.com
++ * 
++ *  Jun Nakajima <jun.nakajima@intel.com> 
++ *     Modified for Xen
++ */
++
++/*
++ * This file handles the architecture-dependent parts of process handling.
++ */
++
++#include <linux/stackprotector.h>
++#include <linux/cpu.h>
++#include <linux/errno.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/kernel.h>
++#include <linux/mm.h>
++#include <linux/elfcore.h>
++#include <linux/smp.h>
++#include <linux/slab.h>
++#include <linux/user.h>
++#include <linux/interrupt.h>
++#include <linux/delay.h>
++#include <linux/module.h>
++#include <linux/ptrace.h>
++#include <linux/notifier.h>
++#include <linux/kprobes.h>
++#include <linux/kdebug.h>
++#include <linux/tick.h>
++#include <linux/prctl.h>
++#include <linux/uaccess.h>
++#include <linux/io.h>
++#include <linux/ftrace.h>
++
++#include <asm/pgtable.h>
++#include <asm/system.h>
++#include <asm/processor.h>
++#include <asm/i387.h>
++#include <asm/mmu_context.h>
++#include <asm/prctl.h>
++#include <xen/interface/physdev.h>
++#include <asm/desc.h>
++#include <asm/proto.h>
++#include <asm/hardirq.h>
++#include <asm/ia32.h>
++#include <asm/idle.h>
++#include <asm/syscalls.h>
++#include <asm/debugreg.h>
++
++/* Declared here; the definition is not in this file. */
++asmlinkage extern void ret_from_fork(void);
++
++/* Per-CPU flag: set to 1 by enter_idle(), test-and-cleared by __exit_idle(). */
++static DEFINE_PER_CPU(unsigned char, is_idle);
++
++/* Notifier chain that is called with IDLE_START and IDLE_END. */
++static ATOMIC_NOTIFIER_HEAD(idle_notifier);
++
++/*
++ * Add notifier block @n to the idle notifier chain, which is called
++ * with IDLE_START from enter_idle() and IDLE_END from __exit_idle().
++ */
++void idle_notifier_register(struct notifier_block *n)
++{
++      atomic_notifier_chain_register(&idle_notifier, n);
++}
++EXPORT_SYMBOL_GPL(idle_notifier_register);
++
++/*
++ * Remove notifier block @n from the idle notifier chain.
++ */
++void idle_notifier_unregister(struct notifier_block *n)
++{
++      atomic_notifier_chain_unregister(&idle_notifier, n);
++}
++EXPORT_SYMBOL_GPL(idle_notifier_unregister);
++
++/*
++ * Mark this CPU as idle (per-CPU is_idle = 1), then run the idle
++ * notifier chain with IDLE_START.
++ */
++void enter_idle(void)
++{
++      percpu_write(is_idle, 1);
++      atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
++}
++
++/*
++ * Clear this CPU's is_idle flag and, only if it was actually set,
++ * run the idle notifier chain with IDLE_END.
++ */
++static void __exit_idle(void)
++{
++      if (x86_test_and_clear_bit_percpu(0, is_idle))
++              atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
++}
++
++/* Called from interrupts to signify idle end */
++void exit_idle(void)
++{
++      /* The idle loop has pid 0; any other task was never marked idle. */
++      if (!current->pid)
++              __exit_idle();
++}
++
++#ifndef CONFIG_SMP
++/*
++ * Non-SMP stub: cpu_idle() calls play_dead() when the current CPU is
++ * reported offline; without CONFIG_SMP reaching this is a BUG().
++ */
++static inline void play_dead(void)
++{
++      BUG();
++}
++#endif
++
++/*
++ * The idle thread. There's no useful work to be
++ * done, so just try to conserve power and have a
++ * low exit latency (ie sit in a loop waiting for
++ * somebody to say that they'd like to reschedule)
++ *
++ * Never returns: the outer loop alternates between idling through
++ * xen_idle() while need_resched() is false and calling schedule().
++ */
++void cpu_idle(void)
++{
++      current_thread_info()->status |= TS_POLLING;
++
++      /*
++       * If we're the non-boot CPU, nothing set the stack canary up
++       * for us.  CPU0 already has it initialized but no harm in
++       * doing it again.  This is a good place for updating it, as
++       * we won't ever return from this function (so the invalid
++       * canaries already on the stack won't ever trigger).
++       */
++      boot_init_stack_canary();
++
++      /* endless idle loop with no priority at all */
++      while (1) {
++              tick_nohz_stop_sched_tick(1);
++              while (!need_resched()) {
++
++                      rmb();
++
++                      if (cpu_is_offline(smp_processor_id()))
++                              play_dead();
++                      /*
++                       * Idle routines should keep interrupts disabled
++                       * from here on, until they go to idle.
++                       * Otherwise, idle callbacks can misfire.
++                       */
++                      local_irq_disable();
++                      enter_idle();
++                      /* Don't trace irqs off for idle */
++                      stop_critical_timings();
++                      xen_idle();
++                      start_critical_timings();
++
++                      /* In many cases the interrupt that ended idle
++                         has already called exit_idle. But some idle
++                         loops can be woken up without interrupt. */
++                      __exit_idle();
++              }
++
++              /* need_resched() is now true: restart the tick and reschedule. */
++              tick_nohz_restart_sched_tick();
++              preempt_enable_no_resched();
++              schedule();
++              preempt_disable();
++      }
++}
++
++/*
++ * Prints also some state that isn't saved in the pt_regs.
++ *
++ * @regs: saved register frame to dump.
++ * @all:  when zero, only the common header and the general purpose
++ *        registers from @regs are printed; when non-zero the segment,
++ *        control and debug registers are printed as well.
++ */
++void __show_regs(struct pt_regs *regs, int all)
++{
++      unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
++      unsigned long d0, d1, d2, d3, d6, d7;
++      unsigned int fsindex, gsindex;
++      unsigned int ds, cs, es;
++
++      show_regs_common();
++      printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
++      printk_address(regs->ip, 1);
++      printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
++                      regs->sp, regs->flags);
++      printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
++             regs->ax, regs->bx, regs->cx);
++      printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
++             regs->dx, regs->si, regs->di);
++      printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
++             regs->bp, regs->r8, regs->r9);
++      printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
++             regs->r10, regs->r11, regs->r12);
++      printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
++             regs->r13, regs->r14, regs->r15);
++
++      /*
++       * The live segment selectors and the FS/GS base MSRs are read
++       * unconditionally, but are only printed when @all is set.
++       */
++      asm("movl %%ds,%0" : "=r" (ds));
++      asm("movl %%cs,%0" : "=r" (cs));
++      asm("movl %%es,%0" : "=r" (es));
++      asm("mov %%fs,%0" : "=r" (fsindex));
++      asm("mov %%gs,%0" : "=r" (gsindex));
++
++      rdmsrl(MSR_FS_BASE, fs);
++      rdmsrl(MSR_GS_BASE, gs);
++      rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
++
++      if (!all)
++              return;
++
++      cr0 = read_cr0();
++      cr2 = read_cr2();
++      cr3 = read_cr3();
++      cr4 = read_cr4();
++
++      printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
++             fs, fsindex, gs, gsindex, shadowgs);
++      printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
++                      es, cr0);
++      printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
++                      cr4);
++
++      get_debugreg(d0, 0);
++      get_debugreg(d1, 1);
++      get_debugreg(d2, 2);
++      printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
++      get_debugreg(d3, 3);
++      get_debugreg(d6, 6);
++      get_debugreg(d7, 7);
++      printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
++}
++
++/*
++ * Pass selector @gs to the set_segment_base hypercall with
++ * SEGBASE_GS_USER_SEL; warns if the hypercall returns non-zero.
++ */
++void xen_load_gs_index(unsigned gs)
++{
++      WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
++}
++EXPORT_SYMBOL(xen_load_gs_index);
++
++/*
++ * Sanity check on a dead task: if it still has an mm whose
++ * context.size is non-zero, report the leftover LDT and BUG().
++ */
++void release_thread(struct task_struct *dead_task)
++{
++      struct mm_struct *mm = dead_task->mm;
++
++      if (!mm || !mm->context.size)
++              return;
++
++      printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
++             dead_task->comm, mm->context.ldt, mm->context.size);
++      BUG();
++}
++
++/*
++ * Fill slot @tls of @t's TLS descriptor array with a 32-bit segment
++ * based at @addr, with limit 0xfffff counted in pages.
++ */
++static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
++{
++      struct user_desc info = {
++              .base_addr = addr,
++              .limit = 0xfffff,
++              .seg_32bit = 1,
++              .limit_in_pages = 1,
++              .useable = 1,
++      };
++
++      fill_ldt(&t->thread.tls_array[tls], &info);
++}
++
++/*
++ * Return the base address stored in slot @tls of @t's TLS descriptor
++ * array.
++ */
++static inline u32 read_32bit_tls(struct task_struct *t, int tls)
++{
++      return get_desc_base(&t->thread.tls_array[tls]);
++}
++
++/*
++ * This gets called before we allocate a new thread and copy
++ * the current task into it.
++ *
++ * It only calls unlazy_fpu() on @tsk.
++ * NOTE(review): presumably so the FPU state saved in the task is up to
++ * date before it is copied - confirm against unlazy_fpu().
++ */
++void prepare_to_copy(struct task_struct *tsk)
++{
++      unlazy_fpu(tsk);
++}
++
++/*
++ * Set up the thread state of the new task @p from the current task.
++ *
++ * @clone_flags: clone flags; only CLONE_SETTLS is examined here.
++ * @sp:          stack pointer given to the child when @regs is a
++ *               user-mode frame.
++ * @unused:      not used.
++ * @p:           the new task.
++ * @regs:        register frame that is copied to the top of @p's stack.
++ *
++ * Returns 0 on success, -ENOMEM if the I/O bitmap copy cannot be
++ * allocated, or the error returned by the TLS setup.
++ */
++int copy_thread(unsigned long clone_flags, unsigned long sp,
++              unsigned long unused,
++      struct task_struct *p, struct pt_regs *regs)
++{
++      int err;
++      struct pt_regs *childregs;
++      struct task_struct *me = current;
++
++      /* The child's pt_regs occupy the very top of its stack. */
++      childregs = ((struct pt_regs *)
++                      (THREAD_SIZE + task_stack_page(p))) - 1;
++      *childregs = *regs;
++
++      childregs->ax = 0;
++      if (user_mode(regs))
++              childregs->sp = sp;
++      else
++              childregs->sp = (unsigned long)childregs;
++
++      p->thread.sp = (unsigned long) childregs;
++      p->thread.sp0 = (unsigned long) (childregs+1);
++
++      set_tsk_thread_flag(p, TIF_FORK);
++
++      p->thread.io_bitmap_ptr = NULL;
++
++      /*
++       * Record the live selectors; a base is inherited from the parent
++       * only when the corresponding selector is zero.
++       */
++      savesegment(gs, p->thread.gsindex);
++      p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
++      savesegment(fs, p->thread.fsindex);
++      p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
++      savesegment(es, p->thread.es);
++      savesegment(ds, p->thread.ds);
++
++      err = -ENOMEM;
++      memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
++
++      /* Give the child its own copy of the parent's I/O bitmap. */
++      if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
++              p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
++              if (!p->thread.io_bitmap_ptr) {
++                      p->thread.io_bitmap_max = 0;
++                      return -ENOMEM;
++              }
++              memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
++                              IO_BITMAP_BYTES);
++              set_tsk_thread_flag(p, TIF_IO_BITMAP);
++      }
++
++      /*
++       * Set a new TLS for the child thread?
++       */
++      if (clone_flags & CLONE_SETTLS) {
++#ifdef CONFIG_IA32_EMULATION
++              if (test_thread_flag(TIF_IA32))
++                      err = do_set_thread_area(p, -1,
++                              (struct user_desc __user *)childregs->si, 0);
++              else
++#endif
++                      err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
++              if (err)
++                      goto out;
++      }
++        p->thread.iopl = current->thread.iopl;
++
++      err = 0;
++out:
++      /* On failure, release the I/O bitmap copy allocated above. */
++      if (err && p->thread.io_bitmap_ptr) {
++              kfree(p->thread.io_bitmap_ptr);
++              p->thread.io_bitmap_max = 0;
++      }
++
++      return err;
++}
++
++/*
++ * Shared worker for start_thread() and start_thread_ia32().
++ *
++ * Clears FS and GS, loads @_ds into ES and DS, and rewrites @regs so
++ * that it holds @new_ip, @new_sp, @_cs, @_ss and a flags value of
++ * X86_EFLAGS_IF only.  It then sets the address limit to USER_DS and
++ * frees the current task's extended (FP) state.
++ */
++static void
++start_thread_common(struct pt_regs *regs, unsigned long new_ip,
++                  unsigned long new_sp,
++                  unsigned int _cs, unsigned int _ss, unsigned int _ds)
++{
++      loadsegment(fs, 0);
++      loadsegment(es, _ds);
++      loadsegment(ds, _ds);
++      load_gs_index(0);
++      regs->ip                = new_ip;
++      regs->sp                = new_sp;
++      regs->cs                = _cs;
++      regs->ss                = _ss;
++      regs->flags             = X86_EFLAGS_IF;
++      set_fs(USER_DS);
++      /*
++       * Free the old FP and other extended state
++       */
++      free_thread_xstate(current);
++}
++
++/*
++ * Start a thread at @new_ip / @new_sp with __USER_CS and __USER_DS as
++ * code and stack segments and null DS/ES selectors.
++ */
++void
++start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
++{
++      start_thread_common(regs, new_ip, new_sp,
++                          __USER_CS, __USER_DS, 0);
++}
++
++#ifdef CONFIG_IA32_EMULATION
++/*
++ * IA32-emulation counterpart of start_thread(): uses __USER32_CS as
++ * code segment and __USER32_DS for SS, DS and ES.
++ */
++void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
++{
++      start_thread_common(regs, new_ip, new_sp,
++                          __USER32_CS, __USER32_DS, __USER32_DS);
++}
++#endif
++
++/*
++ *    switch_to(x,y) should switch tasks from x to y.
++ *
++ * This could still be optimized:
++ * - fold all the options into a flag word and test it with a single test.
++ * - could test fs/gs bitsliced
++ *
++ * Kprobes not supported here. Set the probe on schedule instead.
++ * Function graph tracer not supported either.
++ *
++ * Xen variant: the FPU task-switch, the sp0 reload, the TLS descriptor
++ * updates and the iopl / I/O bitmap changes are queued as multicall
++ * entries and issued through one HYPERVISOR_multicall_check() call; a
++ * failure there is a BUG().  Returns @prev_p.
++ */
++__notrace_funcgraph struct task_struct *
++__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
++{
++      struct thread_struct *prev = &prev_p->thread;
++      struct thread_struct *next = &next_p->thread;
++      int cpu = smp_processor_id();
++#ifndef CONFIG_X86_NO_TSS
++      struct tss_struct *tss = &per_cpu(init_tss, cpu);
++#endif
++      bool preload_fpu;
++#if CONFIG_XEN_COMPAT > 0x030002
++      struct physdev_set_iopl iopl_op;
++      struct physdev_set_iobitmap iobmp_op;
++#else
++      struct physdev_op _pdo[2], *pdo = _pdo;
++#define iopl_op pdo->u.set_iopl
++#define iobmp_op pdo->u.set_iobitmap
++#endif
++      /*
++       * At most 7 entries are queued below: one fpu_taskswitch (the two
++       * sites are mutually exclusive), one stack_switch, up to three
++       * update_descriptor, one set_iopl and one set_iobitmap.
++       */
++      multicall_entry_t _mcl[8], *mcl = _mcl;
++
++      /*
++       * If the task has used fpu the last 5 timeslices, just do a full
++       * restore of the math state immediately to avoid the trap; the
++       * chances of needing FPU soon are obviously high now
++       */
++      preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
++
++      /* we're going to use this soon, after a few expensive things */
++      if (preload_fpu)
++              prefetch(next->fpu.state);
++
++      /*
++       * This is basically '__unlazy_fpu', except that we queue a
++       * multicall to indicate FPU task switch, rather than
++       * synchronously trapping to Xen.
++       * The AMD workaround requires it to be after DS reload, or
++       * after DS has been cleared, which we do in __prepare_arch_switch.
++       */
++      if (task_thread_info(prev_p)->status & TS_USEDFPU) {
++              __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
++              if (!preload_fpu) {
++                      mcl->op      = __HYPERVISOR_fpu_taskswitch;
++                      mcl->args[0] = 1;
++                      mcl++;
++              }
++      } else
++              prev_p->fpu_counter = 0;
++
++      /* Make sure cpu is ready for new context */
++      if (preload_fpu) {
++              mcl->op      = __HYPERVISOR_fpu_taskswitch;
++              mcl->args[0] = 0;
++              mcl++;
++      }
++
++      /*
++       * Reload sp0.
++       * This is load_sp0(tss, next) with a multicall.
++       */
++      mcl->op      = __HYPERVISOR_stack_switch;
++      mcl->args[0] = __KERNEL_DS;
++      mcl->args[1] = next->sp0;
++      mcl++;
++
++      /*
++       * Load the per-thread Thread-Local Storage descriptor.
++       * This is load_TLS(next, cpu) with multicalls.
++       * Only slots whose descriptor differs between prev and next are
++       * queued for update.
++       */
++#define C(i) do {                                                     \
++      if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||    \
++                   next->tls_array[i].b != prev->tls_array[i].b)) {   \
++              mcl->op      = __HYPERVISOR_update_descriptor;          \
++              mcl->args[0] = arbitrary_virt_to_machine(               \
++                      &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
++              mcl->args[1] = *(u64 *)&next->tls_array[i];             \
++              mcl++;                                                  \
++      }                                                               \
++} while (0)
++      C(0); C(1); C(2);
++#undef C
++
++      /* A stored iopl of 0 is passed to Xen as 1; otherwise bits 12-13. */
++      if (unlikely(prev->iopl != next->iopl)) {
++              iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
++#if CONFIG_XEN_COMPAT > 0x030002
++              mcl->op      = __HYPERVISOR_physdev_op;
++              mcl->args[0] = PHYSDEVOP_set_iopl;
++              mcl->args[1] = (unsigned long)&iopl_op;
++#else
++              mcl->op      = __HYPERVISOR_physdev_op_compat;
++              pdo->cmd     = PHYSDEVOP_set_iopl;
++              mcl->args[0] = (unsigned long)pdo++;
++#endif
++              mcl++;
++      }
++
++      /* nr_ports is 0 when the next task has no I/O bitmap. */
++      if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
++              set_xen_guest_handle(iobmp_op.bitmap,
++                                   (char *)next->io_bitmap_ptr);
++              iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
++#if CONFIG_XEN_COMPAT > 0x030002
++              mcl->op      = __HYPERVISOR_physdev_op;
++              mcl->args[0] = PHYSDEVOP_set_iobitmap;
++              mcl->args[1] = (unsigned long)&iobmp_op;
++#else
++              mcl->op      = __HYPERVISOR_physdev_op_compat;
++              pdo->cmd     = PHYSDEVOP_set_iobitmap;
++              mcl->args[0] = (unsigned long)pdo++;
++#endif
++              mcl++;
++      }
++
++      /* Check the queues did not overrun, then issue the whole batch. */
++#if CONFIG_XEN_COMPAT <= 0x030002
++      BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
++#endif
++      BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
++      if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
++              BUG();
++
++      /*
++       * Switch DS and ES.
++       * This won't pick up thread selector changes, but I guess that is ok.
++       */
++      if (unlikely(next->es))
++              loadsegment(es, next->es);
++
++      if (unlikely(next->ds))
++              loadsegment(ds, next->ds);
++
++      /*
++       * Leave lazy mode, flushing any hypercalls made here.
++       * This must be done before restoring TLS segments so
++       * the GDT and LDT are properly updated, and must be
++       * done before math_state_restore, so the TS bit is up
++       * to date.
++       */
++      arch_end_context_switch(next_p);
++
++      /*
++       * Switch FS and GS.
++       *
++       * Segment register != 0 always requires a reload.  Also
++       * reload when it has changed.  When prev process used 64bit
++       * base always reload to avoid an information leak.
++       */
++      if (unlikely(next->fsindex))
++              loadsegment(fs, next->fsindex);
++
++      if (next->fs)
++              WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs));
++      
++      if (unlikely(next->gsindex))
++              load_gs_index(next->gsindex);
++
++      if (next->gs)
++              WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
++
++      /*
++       * Switch the PDA context.
++       */
++      percpu_write(current_task, next_p);
++
++      percpu_write(kernel_stack,
++                (unsigned long)task_stack_page(next_p) +
++                THREAD_SIZE - KERNEL_STACK_OFFSET);
++
++      /*
++       * Now maybe reload the debug registers
++       */
++      if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
++                   task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
++              __switch_to_xtra(prev_p, next_p);
++
++      /*
++       * Preload the FPU context, now that we've determined that the
++       * task is likely to be using it.
++       */
++      if (preload_fpu)
++              __math_state_restore();
++
++      return prev_p;
++}
++
++/*
++ * Switch the current task to 64-bit personality: clear TIF_IA32,
++ * clear the ia32_compat mark on its mm (if it has one) and drop
++ * READ_IMPLIES_EXEC from its personality.
++ */
++void set_personality_64bit(void)
++{
++      /* inherit personality from parent */
++
++      /* Make sure to be in 64bit mode */
++      clear_thread_flag(TIF_IA32);
++
++      /* Ensure the corresponding mm is not marked. */
++      if (current->mm)
++              current->mm->context.ia32_compat = 0;
++
++      /* TBD: overwrites user setup. Should have two bits.
++         But 64bit processes have always behaved this way,
++         so it's not too bad. The main problem is just that
++         32bit children are affected again. */
++      current->personality &= ~READ_IMPLIES_EXEC;
++}
++
++/*
++ * Switch the current task to 32-bit personality: set TIF_IA32, OR
++ * force_personality32 into its personality, mark its mm (if it has
++ * one) as ia32_compat and set TS_COMPAT in the thread status.
++ */
++void set_personality_ia32(void)
++{
++      /* inherit personality from parent */
++
++      /* Make sure to be in 32bit mode */
++      set_thread_flag(TIF_IA32);
++      current->personality |= force_personality32;
++
++      /* Mark the associated mm as containing 32-bit tasks. */
++      if (current->mm)
++              current->mm->context.ia32_compat = 1;
++
++      /* Prepare the first "return" to user space */
++      current_thread_info()->status |= TS_COMPAT;
++}
++
++/*
++ * Walk the saved frame-pointer chain of a task that is not running and
++ * return the first return address that is not inside the scheduler
++ * functions.  Returns 0 for a NULL, current or TASK_RUNNING task, when
++ * a pointer falls outside the task's stack, or when no such address is
++ * found within the bounded number of frames.
++ */
++unsigned long get_wchan(struct task_struct *p)
++{
++      unsigned long base, limit;
++      u64 frame, ret_addr;
++      int depth;
++
++      if (!p || p == current || p->state == TASK_RUNNING)
++              return 0;
++
++      base = (unsigned long)task_stack_page(p);
++      limit = base + THREAD_SIZE;
++      if (p->thread.sp < base || p->thread.sp >= limit)
++              return 0;
++
++      frame = *(u64 *)(p->thread.sp);
++      for (depth = 0; depth <= 16; depth++) {
++              if (frame < base || frame >= limit)
++                      return 0;
++              ret_addr = *(u64 *)(frame + 8);
++              if (!in_sched_functions(ret_addr))
++                      return ret_addr;
++              frame = *(u64 *)frame;
++      }
++      return 0;
++}
++
++/*
++ * Get or set the FS/GS base of @task.
++ *
++ * @task: task to operate on; segment registers and hypercalls are only
++ *        used when it is the current task.
++ * @code: ARCH_SET_GS, ARCH_SET_FS, ARCH_GET_FS or ARCH_GET_GS.
++ * @addr: new base for the SET codes; user pointer that receives the
++ *        base for the GET codes.
++ *
++ * Returns 0 on success, -EPERM if a new base is at or above
++ * TASK_SIZE_OF(@task), -EINVAL for an unknown @code, otherwise the
++ * result of the set_segment_base hypercall or of put_user().
++ */
++long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
++{
++      int ret = 0;
++      int doit = task == current;
++      int cpu;
++
++      switch (code) {
++      case ARCH_SET_GS:
++              if (addr >= TASK_SIZE_OF(task))
++                      return -EPERM;
++              cpu = get_cpu();
++              /* handle small bases via the GDT because that's faster to
++                 switch. */
++              if (addr <= 0xffffffff) {
++                      set_32bit_tls(task, GS_TLS, addr);
++                      if (doit) {
++                              load_TLS(&task->thread, cpu);
++                              load_gs_index(GS_TLS_SEL);
++                      }
++                      task->thread.gsindex = GS_TLS_SEL;
++                      task->thread.gs = 0;
++              } else {
++                      task->thread.gsindex = 0;
++                      task->thread.gs = addr;
++                      if (doit) {
++                              load_gs_index(0);
++                              ret = HYPERVISOR_set_segment_base(
++                                      SEGBASE_GS_USER, addr);
++                      }
++              }
++              put_cpu();
++              break;
++      case ARCH_SET_FS:
++              /* Not strictly needed for fs, but do it for symmetry
++                 with gs */
++              if (addr >= TASK_SIZE_OF(task))
++                      return -EPERM;
++              cpu = get_cpu();
++              /* handle small bases via the GDT because that's faster to
++                 switch. */
++              if (addr <= 0xffffffff) {
++                      set_32bit_tls(task, FS_TLS, addr);
++                      if (doit) {
++                              load_TLS(&task->thread, cpu);
++                              loadsegment(fs, FS_TLS_SEL);
++                      }
++                      task->thread.fsindex = FS_TLS_SEL;
++                      task->thread.fs = 0;
++              } else {
++                      task->thread.fsindex = 0;
++                      task->thread.fs = addr;
++                      if (doit) {
++                              /* set the selector to 0 to not confuse
++                                 __switch_to */
++                              loadsegment(fs, 0);
++                                ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
++                                                                addr);
++                      }
++              }
++              put_cpu();
++              break;
++      case ARCH_GET_FS: {
++              /* TLS slot if one is in use, else the live MSR for the
++                 current task, else the saved base. */
++              unsigned long base;
++              if (task->thread.fsindex == FS_TLS_SEL)
++                      base = read_32bit_tls(task, FS_TLS);
++              else if (doit)
++                      rdmsrl(MSR_FS_BASE, base);
++              else
++                      base = task->thread.fs;
++              ret = put_user(base, (unsigned long __user *)addr);
++              break;
++      }
++      case ARCH_GET_GS: {
++              /* As for FS, except that the MSR is only read when the
++                 live GS selector is non-zero. */
++              unsigned long base;
++              unsigned gsindex;
++              if (task->thread.gsindex == GS_TLS_SEL)
++                      base = read_32bit_tls(task, GS_TLS);
++              else if (doit) {
++                      savesegment(gs, gsindex);
++                      if (gsindex)
++                              rdmsrl(MSR_KERNEL_GS_BASE, base);
++                      else
++                              base = task->thread.gs;
++              } else
++                      base = task->thread.gs;
++              ret = put_user(base, (unsigned long __user *)addr);
++              break;
++      }
++
++      default:
++              ret = -EINVAL;
++              break;
++      }
++
++      return ret;
++}
++
++/*
++ * Apply do_arch_prctl() with @code and @addr to the current task and
++ * return its result.
++ */
++long sys_arch_prctl(int code, unsigned long addr)
++{
++      return do_arch_prctl(current, code, addr);
++}
++
diff --cc arch/x86/kernel/quirks.c

index 8bbe8c5,8bbe8c5..d1e5747
--- 1/arch/x86/kernel/quirks.c
--- 2/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@@ -4,9 -4,9 +4,7 @@@
   #include <linux/pci.h>
   #include <linux/irq.h>
   
--#include <asm/hpet.h>
--
--#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
++#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
   
   static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
   {
@@@ -35,10 -35,10 +33,21 @@@
         if (!(word & (1 << 13))) {
                 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
                         "disabling irq balancing and affinity\n");
++#ifndef CONFIG_XEN
                 noirqdebug_setup("");
   #ifdef CONFIG_PROC_FS
                 no_irq_affinity = 1;
   #endif
++#else
++              {
++                      struct xen_platform_op op = {
++                              .cmd = XENPF_platform_quirk,
++                              .u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING
++                      };
++
++                      WARN_ON(HYPERVISOR_platform_op(&op));
++              }
++#endif
         }
   
         /* put back the original value for config space*/
@@@ -54,6 -54,6 +63,8 @@@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_I
   #endif
   
   #if defined(CONFIG_HPET_TIMER)
++#include <asm/hpet.h>
++
   unsigned long force_hpet_address;
   
   static enum {
diff --cc arch/x86/kernel/relocate_kernel_32.S

index 4123553,4123553..fe0fbfb
--- 1/arch/x86/kernel/relocate_kernel_32.S
--- 2/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@@ -87,14 -87,14 +87,32 @@@ relocate_kernel
         movl    PTR(PA_PGD)(%ebp), %eax
         movl    %eax, %cr3
   
++      /* setup idt */
++      lidtl   idt_48 - relocate_kernel(%edi)
++
++      /* setup gdt */
++      leal    gdt - relocate_kernel(%edi), %eax
++      movl    %eax, (gdt_48 - relocate_kernel) + 2(%edi)
++      lgdtl   gdt_48 - relocate_kernel(%edi)
++
++      /* setup data segment registers */
++      mov     $(gdt_ds - gdt), %eax
++      mov     %eax, %ds
++      mov     %eax, %es
++      mov     %eax, %fs
++      mov     %eax, %gs
++      mov     %eax, %ss
++
         /* setup a new stack at the end of the physical control page */
         lea     PAGE_SIZE(%edi), %esp
   
--      /* jump to identity mapped page */
++      /* load new code segment and jump to identity mapped page */
++      pushl   $0
++      pushl   $(gdt_cs - gdt)
         movl    %edi, %eax
         addl    $(identity_mapped - relocate_kernel), %eax
         pushl   %eax
--      ret
++      iretl
   
   identity_mapped:
         /* store the start address on the stack */
@@@ -271,5 -271,5 +289,22 @@@ swap_pages
         popl    %ebp
         ret
   
++      .align  16
++gdt:
++      .quad   0x0000000000000000      /* NULL descriptor */
++gdt_cs:
++      .quad   0x00cf9a000000ffff      /* kernel 4GB code at 0x00000000 */
++gdt_ds:
++      .quad   0x00cf92000000ffff      /* kernel 4GB data at 0x00000000 */
++gdt_end:
++
++gdt_48:
++      .word   gdt_end - gdt - 1       /* limit */
++      .long   0                       /* base - filled in by code above */
++
++idt_48:
++      .word   0                       /* limit */
++      .long   0                       /* base */
++
         .globl kexec_control_code_size
   .set kexec_control_code_size, . - relocate_kernel
diff --cc arch/x86/kernel/relocate_kernel_64.S

index 4de8f5b,4de8f5b..bb0455d
--- 1/arch/x86/kernel/relocate_kernel_64.S
--- 2/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@@ -91,13 -91,13 +91,30 @@@ relocate_kernel
         /* Switch to the identity mapped page tables */
         movq    %r9, %cr3
   
++      /* setup idt */
++      lidtq   idt_80 - relocate_kernel(%r8)
++
++      /* setup gdt */
++      leaq    gdt - relocate_kernel(%r8), %rax
++      movq    %rax, (gdt_80 - relocate_kernel) + 2(%r8)
++      lgdtq   gdt_80 - relocate_kernel(%r8)
++
++      /* setup data segment registers */
++      xorl    %eax, %eax
++      movl    %eax, %ds
++      movl    %eax, %es
++      movl    %eax, %fs
++      movl    %eax, %gs
++      movl    %eax, %ss
++
         /* setup a new stack at the end of the physical control page */
         lea     PAGE_SIZE(%r8), %rsp
   
--      /* jump to identity mapped page */
++      /* load new code segment and jump to identity mapped page */
         addq    $(identity_mapped - relocate_kernel), %r8
++      pushq   $(gdt_cs - gdt)
         pushq   %r8
--      ret
++      lretq
   
   identity_mapped:
         /* store the start address on the stack */
@@@ -262,5 -262,5 +279,20 @@@ swap_pages
   3:
         ret
   
++      .align  16
++gdt:
++      .quad   0x0000000000000000      /* NULL descriptor */
++gdt_cs:
++      .quad   0x00af9a000000ffff
++gdt_end:
++
++gdt_80:
++      .word   gdt_end - gdt - 1       /* limit */
++      .quad   0                       /* base - filled in by code above */
++
++idt_80:
++      .word   0                       /* limit */
++      .quad   0                       /* base */
++
         .globl kexec_control_code_size
   .set kexec_control_code_size, . - relocate_kernel
diff --cc arch/x86/kernel/rtc.c

index 3f2ad26,3f2ad26..4d2111d
--- 1/arch/x86/kernel/rtc.c
--- 2/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@@ -172,6 -172,6 +172,11 @@@ int update_persistent_clock(struct time
         unsigned long flags;
         int retval;
   
++#ifdef CONFIG_XEN
++      if (xen_update_persistent_clock() < 0 || xen_independent_wallclock())
++              return 0;
++#endif
++
         spin_lock_irqsave(&rtc_lock, flags);
         retval = x86_platform.set_wallclock(now.tv_sec);
         spin_unlock_irqrestore(&rtc_lock, flags);
@@@ -184,6 -184,6 +189,12 @@@ void read_persistent_clock(struct times
   {
         unsigned long retval, flags;
   
++#ifdef CONFIG_XEN
++      if (!is_initial_xendomain()) {
++              xen_read_persistent_clock(ts);
++              return;
++      }
++#endif
         spin_lock_irqsave(&rtc_lock, flags);
         retval = x86_platform.get_wallclock();
         spin_unlock_irqrestore(&rtc_lock, flags);
diff --cc arch/x86/kernel/setup-xen.c

index 0000000,0000000..72b189e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/setup-xen.c
@@@ -1,0 -1,0 +1,1390 @@@
++/*
++ *  Copyright (C) 1995  Linus Torvalds
++ *
++ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
++ *
++ *  Memory region support
++ *    David Parsons <orc@pell.chi.il.us>, July-August 1999
++ *
++ *  Added E820 sanitization routine (removes overlapping memory regions);
++ *  Brian Moyle <bmoyle@mvista.com>, February 2001
++ *
++ * Moved CPU detection code to cpu/${cpu}.c
++ *    Patrick Mochel <mochel@osdl.org>, March 2002
++ *
++ *  Provisions for empty E820 memory regions (reported by certain BIOSes).
++ *  Alex Achenbach <xela@slit.de>, December 2002.
++ *
++ */
++
++/*
++ * This file handles the architecture-dependent parts of initialization
++ */
++
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/mmzone.h>
++#include <linux/screen_info.h>
++#include <linux/ioport.h>
++#include <linux/acpi.h>
++#include <linux/sfi.h>
++#include <linux/apm_bios.h>
++#include <linux/initrd.h>
++#include <linux/bootmem.h>
++#include <linux/memblock.h>
++#include <linux/seq_file.h>
++#include <linux/console.h>
++#include <linux/mca.h>
++#include <linux/root_dev.h>
++#include <linux/highmem.h>
++#include <linux/module.h>
++#include <linux/efi.h>
++#include <linux/init.h>
++#include <linux/edd.h>
++#include <linux/iscsi_ibft.h>
++#include <linux/nodemask.h>
++#include <linux/kexec.h>
++#include <linux/dmi.h>
++#include <linux/pfn.h>
++#include <linux/pci.h>
++#include <asm/pci-direct.h>
++#include <linux/init_ohci1394_dma.h>
++#include <linux/kvm_para.h>
++
++#include <linux/errno.h>
++#include <linux/kernel.h>
++#include <linux/stddef.h>
++#include <linux/unistd.h>
++#include <linux/ptrace.h>
++#include <linux/user.h>
++#include <linux/delay.h>
++
++#include <linux/kallsyms.h>
++#include <linux/cpufreq.h>
++#include <linux/dma-mapping.h>
++#include <linux/ctype.h>
++#include <linux/uaccess.h>
++
++#include <linux/percpu.h>
++#include <linux/crash_dump.h>
++#include <linux/tboot.h>
++
++#include <video/edid.h>
++
++#include <asm/mtrr.h>
++#include <asm/apic.h>
++#include <asm/trampoline.h>
++#include <asm/e820.h>
++#include <asm/mpspec.h>
++#include <asm/setup.h>
++#include <asm/efi.h>
++#include <asm/timer.h>
++#include <asm/i8259.h>
++#include <asm/sections.h>
++#include <asm/dmi.h>
++#include <asm/io_apic.h>
++#include <asm/ist.h>
++#include <asm/setup_arch.h>
++#include <asm/bios_ebda.h>
++#include <asm/cacheflush.h>
++#include <asm/processor.h>
++#include <asm/bugs.h>
++
++#include <asm/system.h>
++#include <asm/vsyscall.h>
++#include <asm/cpu.h>
++#include <asm/desc.h>
++#include <asm/dma.h>
++#include <asm/iommu.h>
++#include <asm/gart.h>
++#include <asm/mmu_context.h>
++#include <asm/proto.h>
++
++#include <asm/paravirt.h>
++#include <asm/hypervisor.h>
++#include <asm/olpc_ofw.h>
++
++#include <asm/percpu.h>
++#include <asm/topology.h>
++#include <asm/apicdef.h>
++#include <asm/amd_nb.h>
++#ifdef CONFIG_X86_64
++#include <asm/numa_64.h>
++#endif
++#include <asm/mce.h>
++#include <asm/alternative.h>
++#include <asm/prom.h>
++
++#ifdef CONFIG_XEN
++#include <asm/hypervisor.h>
++#include <xen/interface/kexec.h>
++#include <xen/interface/memory.h>
++#include <xen/interface/nmi.h>
++#include <xen/interface/physdev.h>
++#include <xen/features.h>
++#include <xen/firmware.h>
++#include <xen/xencons.h>
++
++static int xen_panic_event(struct notifier_block *, unsigned long, void *);
++static struct notifier_block xen_panic_block = {
++      .notifier_call = xen_panic_event, .priority = 0 /* try to go last */
++};
++
++unsigned long *phys_to_machine_mapping;
++EXPORT_SYMBOL(phys_to_machine_mapping);
++
++unsigned long *pfn_to_mfn_frame_list_list, **pfn_to_mfn_frame_list;
++
++/* Raw start-of-day parameters from the hypervisor. */
++start_info_t *xen_start_info;
++EXPORT_SYMBOL(xen_start_info);
++#endif
++
++/*
++ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
++ * The direct mapping extends to max_pfn_mapped, so that we can directly access
++ * apertures, ACPI and other tables without having to play with fixmaps.
++ */
++unsigned long max_low_pfn_mapped;
++unsigned long max_pfn_mapped;
++
++#ifdef CONFIG_DMI
++RESERVE_BRK(dmi_alloc, 65536);
++#endif
++
++
++static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
++unsigned long _brk_end = (unsigned long)__brk_base;
++
++#ifndef CONFIG_XEN
++#ifdef CONFIG_X86_64
++int default_cpu_present_to_apicid(int mps_cpu)
++{
++      return __default_cpu_present_to_apicid(mps_cpu);
++}
++
++int default_check_phys_apicid_present(int phys_apicid)
++{
++      return __default_check_phys_apicid_present(phys_apicid);
++}
++#endif
++
++#ifndef CONFIG_DEBUG_BOOT_PARAMS
++struct boot_params __initdata boot_params;
++#else
++struct boot_params boot_params;
++#endif
++#endif
++
++/*
++ * Machine setup..
++ */
++static struct resource data_resource = {
++      .name   = "Kernel data",
++      .start  = 0,
++      .end    = 0,
++      .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++static struct resource code_resource = {
++      .name   = "Kernel code",
++      .start  = 0,
++      .end    = 0,
++      .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++static struct resource bss_resource = {
++      .name   = "Kernel bss",
++      .start  = 0,
++      .end    = 0,
++      .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++
++#ifdef CONFIG_X86_32
++/* cpu data as detected by the assembly code in head.S */
++struct cpuinfo_x86 new_cpu_data __cpuinitdata = { .wp_works_ok = 1, .hard_math = 1 };
++/* common cpu data for all cpus */
++struct cpuinfo_x86 boot_cpu_data __read_mostly = { .wp_works_ok = 1, .hard_math = 1 };
++EXPORT_SYMBOL(boot_cpu_data);
++#ifndef CONFIG_XEN
++static void set_mca_bus(int x)
++{
++#ifdef CONFIG_MCA
++      MCA_bus = x;
++#endif
++}
++
++unsigned int def_to_bigsmp;
++
++/* for MCA, but anyone else can use it if they want */
++unsigned int machine_id;
++unsigned int machine_submodel_id;
++unsigned int BIOS_revision;
++
++struct apm_info apm_info;
++EXPORT_SYMBOL(apm_info);
++#endif
++
++#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
++struct ist_info ist_info;
++EXPORT_SYMBOL(ist_info);
++#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
++struct ist_info ist_info;
++#endif
++
++#else
++struct cpuinfo_x86 boot_cpu_data __read_mostly = {
++      .x86_phys_bits = MAX_PHYSMEM_BITS,
++};
++EXPORT_SYMBOL(boot_cpu_data);
++#endif
++
++
++#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
++unsigned long mmu_cr4_features;
++#else
++unsigned long mmu_cr4_features = X86_CR4_PAE;
++#endif
++
++/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
++int bootloader_type, bootloader_version;
++
++/*
++ * Setup options
++ */
++struct screen_info screen_info;
++EXPORT_SYMBOL(screen_info);
++struct edid_info edid_info;
++EXPORT_SYMBOL_GPL(edid_info);
++
++extern int root_mountflags;
++
++unsigned long saved_video_mode;
++
++#define RAMDISK_IMAGE_START_MASK      0x07FF
++#define RAMDISK_PROMPT_FLAG           0x8000
++#define RAMDISK_LOAD_FLAG             0x4000
++
++static char __initdata command_line[COMMAND_LINE_SIZE];
++#ifdef CONFIG_CMDLINE_BOOL
++static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
++#endif
++
++#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
++struct edd edd;
++#ifdef CONFIG_EDD_MODULE
++EXPORT_SYMBOL(edd);
++#endif
++#ifndef CONFIG_XEN
++/**
++ * copy_edd() - Copy the BIOS EDD information
++ *              from boot_params into a safe place.
++ *
++ */
++static inline void __init copy_edd(void)
++{
++     memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
++          sizeof(edd.mbr_signature));
++     memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
++     edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
++     edd.edd_info_nr = boot_params.eddbuf_entries;
++}
++#endif
++#else
++static inline void __init copy_edd(void)
++{
++}
++#endif
++
++/*
++ * extend_brk - hand out a zero-filled chunk of the early brk area.
++ * @size:  number of bytes to allocate
++ * @align: alignment of the returned address; BUGs if more than one bit
++ *         is set (note: 0 is not rejected by that check)
++ *
++ * Rounds _brk_end up to @align, advances it by @size and returns the start
++ * of the new region after clearing it.  BUGs if _brk_start is 0 (which is
++ * what reserve_brk() sets it to once the area is locked down) or if the
++ * request would extend past __brk_limit.
++ */
++void * __init extend_brk(size_t size, size_t align)
++{
++      size_t mask = align - 1;
++      void *ret;
++
++      BUG_ON(_brk_start == 0);
++      BUG_ON(align & mask);
++
++      _brk_end = (_brk_end + mask) & ~mask;
++      BUG_ON((char *)(_brk_end + size) > __brk_limit);
++
++      ret = (void *)_brk_end;
++      _brk_end += size;
++
++      memset(ret, 0, size);
++
++      return ret;
++}
++
++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
++static void __init init_gbpages(void)
++{
++      if (direct_gbpages && cpu_has_gbpages)
++              printk(KERN_INFO "Using GB pages for direct mapping\n");
++      else
++              direct_gbpages = 0;
++}
++#else
++static inline void init_gbpages(void)
++{
++}
++static void __init cleanup_highmap(void)
++{
++}
++#endif
++
++static void __init reserve_brk(void)
++{
++      if (_brk_end > _brk_start)
++              memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
++
++      /* Mark brk area as locked down and no longer taking any
++         new allocations */
++      _brk_start = 0;
++}
++
++#ifdef CONFIG_BLK_DEV_INITRD
++
++#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
++static void __init relocate_initrd(void)
++{
++#ifndef CONFIG_XEN
++      /* Assume only end is not page aligned */
++      u64 ramdisk_image = boot_params.hdr.ramdisk_image;
++      u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
++      u64 area_size     = PAGE_ALIGN(ramdisk_size);
++      u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
++      u64 ramdisk_here;
++      unsigned long slop, clen, mapaddr;
++      char *p, *q;
++
++      /* We need to move the initrd down into lowmem */
++      ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
++                                       PAGE_SIZE);
++
++      if (ramdisk_here == MEMBLOCK_ERROR)
++              panic("Cannot find place for new RAMDISK of size %lld\n",
++                       ramdisk_size);
++
++      /* Note: this includes all the lowmem currently occupied by
++         the initrd, we rely on that fact to keep the data intact. */
++      memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
++      initrd_start = ramdisk_here + PAGE_OFFSET;
++      initrd_end   = initrd_start + ramdisk_size;
++      printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
++                       ramdisk_here, ramdisk_here + ramdisk_size);
++
++      q = (char *)initrd_start;
++
++      /* Copy any lowmem portion of the initrd */
++      if (ramdisk_image < end_of_lowmem) {
++              clen = end_of_lowmem - ramdisk_image;
++              p = (char *)__va(ramdisk_image);
++              memcpy(q, p, clen);
++              q += clen;
++              ramdisk_image += clen;
++              ramdisk_size  -= clen;
++      }
++
++      /* Copy the highmem portion of the initrd */
++      while (ramdisk_size) {
++              slop = ramdisk_image & ~PAGE_MASK;
++              clen = ramdisk_size;
++              if (clen > MAX_MAP_CHUNK-slop)
++                      clen = MAX_MAP_CHUNK-slop;
++              mapaddr = ramdisk_image & PAGE_MASK;
++              p = early_memremap(mapaddr, clen+slop);
++              memcpy(q, p+slop, clen);
++              early_iounmap(p, clen+slop);
++              q += clen;
++              ramdisk_image += clen;
++              ramdisk_size  -= clen;
++      }
++      /* high pages are not converted by early_res_to_bootmem */
++      ramdisk_image = boot_params.hdr.ramdisk_image;
++      ramdisk_size  = boot_params.hdr.ramdisk_size;
++      printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
++              " %08llx - %08llx\n",
++              ramdisk_image, ramdisk_image + ramdisk_size - 1,
++              ramdisk_here, ramdisk_here + ramdisk_size - 1);
++#else
++      printk(KERN_ERR "initrd extends beyond end of memory "
++             "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
++             xen_initrd_start + xen_start_info->mod_len,
++             max_low_pfn_mapped << PAGE_SHIFT);
++      initrd_start = 0;
++#endif
++}
++
++static void __init reserve_initrd(void)
++{
++      /* Assume only end is not page aligned */
++#ifndef CONFIG_XEN
++      u64 ramdisk_image = boot_params.hdr.ramdisk_image;
++      u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
++      u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
++      u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
++
++      if (!boot_params.hdr.type_of_loader ||
++          !ramdisk_image || !ramdisk_size)
++              return;         /* No initrd provided by bootloader */
++#else
++      unsigned long ramdisk_image = xen_initrd_start;
++      unsigned long ramdisk_size  = xen_start_info->mod_len;
++      unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
++      unsigned long end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
++
++      if (!xen_start_info->mod_start || !ramdisk_size)
++              return;         /* No initrd provided by bootloader */
++#endif
++
++      initrd_start = 0;
++
++      if (ramdisk_size >= (end_of_lowmem>>1)) {
++              memblock_x86_free_range(ramdisk_image, ramdisk_end);
++              printk(KERN_ERR "initrd too large to handle, "
++                     "disabling initrd\n");
++              return;
++      }
++
++      printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
++                      ramdisk_end);
++
++
++      if (ramdisk_end <= end_of_lowmem) {
++              /* All in lowmem, easy case */
++              /*
++               * don't need to reserve again, already reserved early
++               * in i386_start_kernel
++               */
++              initrd_start = ramdisk_image + PAGE_OFFSET;
++              initrd_end = initrd_start + ramdisk_size;
++#ifdef CONFIG_X86_64_XEN
++              initrd_below_start_ok = 1;
++#endif
++              return;
++      }
++
++      relocate_initrd();
++
++      memblock_x86_free_range(ramdisk_image, ramdisk_end);
++}
++#else
++static void __init reserve_initrd(void)
++{
++}
++#endif /* CONFIG_BLK_DEV_INITRD */
++
++static void __init parse_setup_data(void)
++{
++#ifndef CONFIG_XEN
++      struct setup_data *data;
++      u64 pa_data;
++
++      if (boot_params.hdr.version < 0x0209)
++              return;
++      pa_data = boot_params.hdr.setup_data;
++      while (pa_data) {
++              u32 data_len, map_len;
++
++              map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
++                            (u64)sizeof(struct setup_data));
++              data = early_memremap(pa_data, map_len);
++              data_len = data->len + sizeof(struct setup_data);
++              if (data_len > map_len) {
++                      early_iounmap(data, map_len);
++                      data = early_memremap(pa_data, data_len);
++                      map_len = data_len;
++              }
++
++              switch (data->type) {
++              case SETUP_E820_EXT:
++                      parse_e820_ext(data);
++                      break;
++              case SETUP_DTB:
++                      add_dtb(pa_data);
++                      break;
++              default:
++                      break;
++              }
++              pa_data = data->next;
++              early_iounmap(data, map_len);
++      }
++#endif
++}
++
++static void __init e820_reserve_setup_data(void)
++{
++#ifndef CONFIG_XEN
++      struct setup_data *data;
++      u64 pa_data;
++      int found = 0;
++
++      if (boot_params.hdr.version < 0x0209)
++              return;
++      pa_data = boot_params.hdr.setup_data;
++      while (pa_data) {
++              data = early_memremap(pa_data, sizeof(*data));
++              e820_update_range(pa_data, sizeof(*data)+data->len,
++                       E820_RAM, E820_RESERVED_KERN);
++              found = 1;
++              pa_data = data->next;
++              early_iounmap(data, sizeof(*data));
++      }
++      if (!found)
++              return;
++
++      sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
++      memcpy(&e820_saved, &e820, sizeof(struct e820map));
++      printk(KERN_INFO "extended physical RAM map:\n");
++      e820_print_map("reserve setup_data");
++#endif
++}
++
++/*
++ * Reserve, in memblock, every entry of the boot_params setup_data list.
++ *
++ * Walks the chain of physical addresses starting at
++ * boot_params.hdr.setup_data, temporarily mapping each header with
++ * early_memremap() and reserving the header plus its data->len payload
++ * bytes under a "setup data <type>" label.  Returns without doing anything
++ * when boot_params.hdr.version is below 0x0209; with CONFIG_XEN the whole
++ * body is compiled out and the function is empty.
++ */
++static void __init memblock_x86_reserve_range_setup_data(void)
++{
++#ifndef CONFIG_XEN
++      struct setup_data *data;
++      u64 pa_data;
++      char buf[32];
++
++      if (boot_params.hdr.version < 0x0209)
++              return;
++      pa_data = boot_params.hdr.setup_data;
++      while (pa_data) {
++              data = early_memremap(pa_data, sizeof(*data));
++              /* bounded write: the label can never run past buf */
++              snprintf(buf, sizeof(buf), "setup data %x", data->type);
++              memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
++              /* read the link before the temporary mapping is dropped */
++              pa_data = data->next;
++              early_iounmap(data, sizeof(*data));
++      }
++#endif
++}
++
++#ifndef CONFIG_XEN
++/*
++ * --------- Crashkernel reservation ------------------------------
++ */
++
++#ifdef CONFIG_KEXEC
++
++static inline unsigned long long get_total_mem(void)
++{
++      unsigned long long total;
++
++      total = max_pfn - min_low_pfn;
++
++      return total << PAGE_SHIFT;
++}
++
++/*
++ * Keep the crash kernel below this limit.  On 32 bits earlier kernels
++ * would limit the kernel to the low 512 MiB due to mapping restrictions.
++ * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
++ * limit once kexec-tools are fixed.
++ */
++#ifdef CONFIG_X86_32
++# define CRASH_KERNEL_ADDR_MAX        (512 << 20)
++#else
++# define CRASH_KERNEL_ADDR_MAX        (896 << 20)
++#endif
++
++/*
++ * Reserve the memory region requested for a crash kernel.
++ *
++ * Size and (optional) base come from parse_crashkernel() applied to
++ * boot_command_line.  With no base given, a 16M-aligned area below
++ * CRASH_KERNEL_ADDR_MAX is searched for; with an explicit base, the
++ * request is honoured only if memblock_find_in_range() returns exactly
++ * that base.  On success the range is reserved in memblock and inserted
++ * into iomem_resource as crashk_res; on failure a message is logged and
++ * nothing is reserved.
++ */
++static void __init reserve_crashkernel(void)
++{
++      unsigned long long total_mem;
++      unsigned long long crash_size, crash_base;
++      int ret;
++
++      total_mem = get_total_mem();
++
++      ret = parse_crashkernel(boot_command_line, total_mem,
++                      &crash_size, &crash_base);
++      /* crash_size is unsigned: "nothing requested" is exactly zero */
++      if (ret != 0 || crash_size == 0)
++              return;
++
++      /* 0 means: find the address automatically */
++      if (crash_base == 0) {
++              const unsigned long long alignment = 16<<20;    /* 16M */
++
++              /*
++               * kexec wants the bzImage to be below CRASH_KERNEL_ADDR_MAX
++               */
++              crash_base = memblock_find_in_range(alignment,
++                             CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
++
++              if (crash_base == MEMBLOCK_ERROR) {
++                      pr_info("crashkernel reservation failed - No suitable area found.\n");
++                      return;
++              }
++      } else {
++              unsigned long long start;
++
++              start = memblock_find_in_range(crash_base,
++                               crash_base + crash_size, crash_size, 1<<20);
++              if (start != crash_base) {
++                      pr_info("crashkernel reservation failed - memory is in use.\n");
++                      return;
++              }
++      }
++      memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
++
++      /* arguments are cast to unsigned long, so print them with %lu */
++      printk(KERN_INFO "Reserving %luMB of memory at %luMB "
++                      "for crashkernel (System RAM: %luMB)\n",
++                      (unsigned long)(crash_size >> 20),
++                      (unsigned long)(crash_base >> 20),
++                      (unsigned long)(total_mem >> 20));
++
++      crashk_res.start = crash_base;
++      crashk_res.end   = crash_base + crash_size - 1;
++      insert_resource(&iomem_resource, &crashk_res);
++}
++#else
++static void __init reserve_crashkernel(void)
++{
++}
++#endif
++#endif /* CONFIG_XEN */
++
++static struct resource standard_io_resources[] = {
++      { .name = "dma1", .start = 0x00, .end = 0x1f,
++              .flags = IORESOURCE_BUSY | IORESOURCE_IO },
++      { .name = "pic1", .start = 0x20, .end = 0x21,
++              .flags = IORESOURCE_BUSY | IORESOURCE_IO },
++      { .name = "timer0", .start = 0x40, .end = 0x43,
++              .flags = IORESOURCE_BUSY | IORESOURCE_IO },
++      { .name = "timer1", .start = 0x50, .end = 0x53,
++              .flags = IORESOURCE_BUSY | IORESOURCE_IO },
++      { .name = "keyboard", .start = 0x60, .end = 0x60,
++              .flags = IORESOURCE_BUSY | IORESOURCE_IO },
++      { .name = "keyboard", .start = 0x64, .end = 0x64,
++              .flags = IORESOURCE_BUSY | IORESOURCE_IO },
++      { .name = "dma page reg", .start = 0x80, .end = 0x8f,
++              .flags = IORESOURCE_BUSY | IORESOURCE_IO },
++      { .name = "pic2", .start = 0xa0, .end = 0xa1,
++              .flags = IORESOURCE_BUSY | IORESOURCE_IO },
++      { .name = "dma2", .start = 0xc0, .end = 0xdf,
++              .flags = IORESOURCE_BUSY | IORESOURCE_IO },
++      { .name = "fpu", .start = 0xf0, .end = 0xff,
++              .flags = IORESOURCE_BUSY | IORESOURCE_IO }
++};
++
++/*
++ * Request every I/O port range listed in standard_io_resources[] from
++ * ioport_resource.  Only the initial Xen domain does this; in any other
++ * domain the function returns without requesting anything.  The return
++ * value of request_resource() is not checked.
++ */
++void __init reserve_standard_io_resources(void)
++{
++      int i;
++
++      /* Nothing to do if not running in dom0. */
++      if (!is_initial_xendomain())
++              return;
++
++      /* request I/O space for devices used on all i[345]86 PCs */
++      for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
++              request_resource(&ioport_resource, &standard_io_resources[i]);
++
++}
++
++/*
++ * Look up the ibft region with find_ibft_region().  On non-Xen builds a
++ * region of non-zero size is reserved in memblock; with CONFIG_XEN only
++ * the lookup is performed and nothing is reserved here.
++ */
++static __init void reserve_ibft_region(void)
++{
++      unsigned long addr, size = 0;
++
++      addr = find_ibft_region(&size);
++
++#ifndef CONFIG_XEN
++      if (size)
++              memblock_x86_reserve_range(addr, addr + size, "* ibft");
++#endif
++}
++
++#ifndef CONFIG_XEN
++static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
++
++static void __init trim_bios_range(void)
++{
++      /*
++       * A special case is the first 4Kb of memory;
++       * This is a BIOS owned area, not kernel ram, but generally
++       * not listed as such in the E820 table.
++       *
++       * This typically reserves additional memory (64KiB by default)
++       * since some BIOSes are known to corrupt low memory.  See the
++       * Kconfig help text for X86_RESERVE_LOW.
++       */
++      e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
++                        E820_RAM, E820_RESERVED);
++
++      /*
++       * special case: Some BIOSen report the PC BIOS
++       * area (640->1Mb) as ram even though it is not.
++       * take them out.
++       */
++      e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
++      sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
++}
++
++/*
++ * Handler for the "reservelow" early parameter.
++ *
++ * Parses a size from @p with memparse() and stores it in reserve_low,
++ * clamped to the range [4096, 640*1024] bytes.  Returns -EINVAL when no
++ * argument string was supplied, 0 otherwise.
++ */
++static int __init parse_reservelow(char *p)
++{
++      unsigned long long size;
++
++      if (!p)
++              return -EINVAL;
++
++      size = memparse(p, &p);
++
++      if (size < 4096)
++              size = 4096;
++
++      if (size > 640*1024)
++              size = 640*1024;
++
++      reserve_low = size;
++
++      return 0;
++}
++
++early_param("reservelow", parse_reservelow);
++#endif
++
++/*
++ * Determine if we were loaded by an EFI loader.  If so, then we have also been
++ * passed the efi memmap, systab, etc., so we should use these data structures
++ * for initialization.  Note, the efi init code path is determined by the
++ * global efi_enabled. This allows the same kernel image to be used on existing
++ * systems (with a traditional BIOS) as well as on EFI systems.
++ */
++/*
++ * setup_arch - architecture-specific boot-time initializations
++ *
++ * Note: On x86_64, fixmaps are ready for use even before this is called.
++ */
++
++void __init setup_arch(char **cmdline_p)
++{
++      unsigned long flags;
++#ifdef CONFIG_XEN
++      unsigned int i;
++      unsigned long p2m_pages;
++      struct physdev_set_iopl set_iopl;
++
++      if (!is_initial_xendomain()) {
++#ifdef CONFIG_X86_32
++              /* Force a quick death if the kernel panics (not domain 0). */
++              extern int panic_timeout;
++              if (!panic_timeout)
++                      panic_timeout = 1;
++#endif
++
++              /* Register a call for panic conditions. */
++              atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
++      }
++
++      set_iopl.iopl = 1;
++      WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
++#endif /* CONFIG_XEN */
++
++#ifdef CONFIG_X86_32
++      memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
++      visws_early_detect();
++
++#ifndef CONFIG_XEN
++      /*
++       * copy kernel address range established so far and switch
++       * to the proper swapper page table
++       */
++      clone_pgd_range(swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
++                      initial_page_table + KERNEL_PGD_BOUNDARY,
++                      KERNEL_PGD_PTRS);
++
++      load_cr3(swapper_pg_dir);
++      __flush_tlb_all();
++#endif
++#else
++      printk(KERN_INFO "Command line: %s\n", boot_command_line);
++#endif
++
++      /*
++       * If we have OLPC OFW, we might end up relocating the fixmap due to
++       * reserve_top(), so do this before touching the ioremap area.
++       */
++      olpc_ofw_detect();
++
++      early_trap_init();
++      early_cpu_init();
++      early_ioremap_init();
++
++      setup_olpc_ofw_pgd();
++
++#ifndef CONFIG_XEN
++      ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
++      screen_info = boot_params.screen_info;
++      edid_info = boot_params.edid_info;
++#ifdef CONFIG_X86_32
++      apm_info.bios = boot_params.apm_bios_info;
++      ist_info = boot_params.ist_info;
++      if (boot_params.sys_desc_table.length != 0) {
++              set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
++              machine_id = boot_params.sys_desc_table.table[0];
++              machine_submodel_id = boot_params.sys_desc_table.table[1];
++              BIOS_revision = boot_params.sys_desc_table.table[2];
++      }
++#endif
++      saved_video_mode = boot_params.hdr.vid_mode;
++      bootloader_type = boot_params.hdr.type_of_loader;
++      if ((bootloader_type >> 4) == 0xe) {
++              bootloader_type &= 0xf;
++              bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
++      }
++      bootloader_version  = bootloader_type & 0xf;
++      bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
++
++#ifdef CONFIG_BLK_DEV_RAM
++      rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
++      rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
++      rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
++#endif
++#ifdef CONFIG_EFI
++      if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
++#ifdef CONFIG_X86_32
++                   "EL32",
++#else
++                   "EL64",
++#endif
++       4)) {
++              efi_enabled = 1;
++              efi_memblock_x86_reserve_range();
++      }
++#endif
++#else /* CONFIG_XEN */
++#ifdef CONFIG_X86_32
++      /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
++         properly.  Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
++      */
++      ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
++#else
++      ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
++#endif
++      if (is_initial_xendomain()) {
++              const struct dom0_vga_console_info *info =
++                      (void *)((char *)xen_start_info +
++                               xen_start_info->console.dom0.info_off);
++
++              dom0_init_screen_info(info,
++                                    xen_start_info->console.dom0.info_size);
++              xen_start_info->console.domU.mfn = 0;
++              xen_start_info->console.domU.evtchn = 0;
++      } else
++              screen_info.orig_video_isVGA = 0;
++      copy_edid();
++#endif /* CONFIG_XEN */
++
++      x86_init.oem.arch_setup();
++
++      iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
++      setup_memory_map();
++      parse_setup_data();
++      /* update the e820_saved too */
++      e820_reserve_setup_data();
++
++      copy_edd();
++
++#ifndef CONFIG_XEN
++      if (!boot_params.hdr.root_flags)
++              root_mountflags &= ~MS_RDONLY;
++#endif
++      init_mm.start_code = (unsigned long) _text;
++      init_mm.end_code = (unsigned long) _etext;
++      init_mm.end_data = (unsigned long) _edata;
++      init_mm.brk = _brk_end;
++
++      code_resource.start = virt_to_phys(_text);
++      code_resource.end = virt_to_phys(_etext)-1;
++      data_resource.start = virt_to_phys(_etext);
++      data_resource.end = virt_to_phys(_edata)-1;
++      bss_resource.start = virt_to_phys(&__bss_start);
++      bss_resource.end = virt_to_phys(&__bss_stop)-1;
++
++#ifdef CONFIG_CMDLINE_BOOL
++#ifdef CONFIG_CMDLINE_OVERRIDE
++      strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
++#else
++      if (builtin_cmdline[0]) {
++              /* append boot loader cmdline to builtin */
++              strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
++              strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
++              strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
++      }
++#endif
++#endif
++
++      strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
++      *cmdline_p = command_line;
++
++      /*
++       * x86_configure_nx() is called before parse_early_param() to detect
++       * whether hardware doesn't support NX (so that the early EHCI debug
++       * console setup can safely call set_fixmap()). It may then be called
++       * again from within noexec_setup() during parsing early parameters
++       * to honor the respective command line option.
++       */
++      x86_configure_nx();
++
++      parse_early_param();
++
++      x86_report_nx();
++
++      /* after early param, so could get panic from serial */
++      memblock_x86_reserve_range_setup_data();
++
++      if (acpi_mps_check()) {
++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
++              disable_apic = 1;
++#endif
++              setup_clear_cpu_cap(X86_FEATURE_APIC);
++      }
++
++#ifdef CONFIG_PCI
++      if (pci_early_dump_regs)
++              early_dump_pci_devices();
++#endif
++
++      finish_e820_parsing();
++
++      if (efi_enabled)
++              efi_init();
++
++      if (is_initial_xendomain())
++              dmi_scan_machine();
++
++      /*
++       * VMware detection requires dmi to be available, so this
++       * needs to be done after dmi_scan_machine, for the BP.
++       */
++      init_hypervisor_platform();
++
++      x86_init.resources.probe_roms();
++
++#ifndef CONFIG_XEN
++      /* after parse_early_param, so could debug it */
++      insert_resource(&iomem_resource, &code_resource);
++      insert_resource(&iomem_resource, &data_resource);
++      insert_resource(&iomem_resource, &bss_resource);
++
++      trim_bios_range();
++#ifdef CONFIG_X86_32
++      if (ppro_with_ram_bug()) {
++              e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
++                                E820_RESERVED);
++              sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
++              printk(KERN_INFO "fixed physical RAM map:\n");
++              e820_print_map("bad_ppro");
++      }
++#else
++      early_gart_iommu_check();
++#endif
++#endif /* CONFIG_XEN */
++
++      /*
++       * partially used pages are not usable - thus
++       * we are rounding upwards:
++       */
++      max_pfn = e820_end_of_ram_pfn();
++
++      /* update e820 for memory not covered by WB MTRRs */
++      mtrr_bp_init();
++#ifndef CONFIG_XEN
++      if (mtrr_trim_uncached_memory(max_pfn))
++              max_pfn = e820_end_of_ram_pfn();
++#endif
++
++#ifdef CONFIG_X86_32
++      /* max_low_pfn gets updated here */
++      find_low_pfn_range();
++#else
++      num_physpages = max_pfn;
++      max_mapnr = max_pfn;
++
++#ifdef CONFIG_X86_LOCAL_APIC
++      check_x2apic();
++#endif
++
++      /* How many end-of-memory variables you have, grandma! */
++      /* need this before calling reserve_initrd */
++      if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
++              max_low_pfn = e820_end_of_low_ram_pfn();
++      else
++              max_low_pfn = max_pfn;
++
++      high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
++#endif
++
++      /*
++       * Find and reserve possible boot-time SMP configuration:
++       */
++      find_smp_config();
++
++      reserve_ibft_region();
++
++      /*
++       * Need to conclude brk, before memblock_x86_fill()
++       *  it could use memblock_find_in_range, could overlap with
++       *  brk area.
++       */
++      reserve_brk();
++
++      cleanup_highmap();
++
++      memblock.current_limit = get_max_mapped();
++      memblock_x86_fill();
++
++      /* preallocate 4k for mptable mpc */
++      early_reserve_e820_mpc_new();
++
++#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
++      setup_bios_corruption_check();
++#endif
++
++      printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
++                      max_pfn_mapped<<PAGE_SHIFT);
++
++#ifndef CONFIG_XEN
++      setup_trampolines();
++#endif
++
++      init_gbpages();
++
++      /* max_pfn_mapped is updated here */
++      max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
++      max_pfn_mapped = max_low_pfn_mapped;
++
++#ifdef CONFIG_X86_64
++      if (max_pfn > max_low_pfn) {
++              max_pfn_mapped = init_memory_mapping(1UL<<32,
++                                                   max_pfn<<PAGE_SHIFT);
++              /* can we preserve max_low_pfn ?*/
++              max_low_pfn = max_pfn;
++      }
++#endif
++      memblock.current_limit = get_max_mapped();
++
++      /*
++       * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
++       */
++
++#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
++      if (init_ohci1394_dma_early)
++              init_ohci1394_dma_on_all_controllers();
++#endif
++
++      reserve_initrd();
++
++#ifndef CONFIG_XEN
++      reserve_crashkernel();
++
++      vsmp_init();
++#endif
++
++      io_delay_init();
++
++#ifdef CONFIG_ACPI
++      if (!is_initial_xendomain()) {
++              printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
++              disable_acpi();
++      }
++#endif
++
++      /*
++       * Parse the ACPI tables for possible boot-time SMP configuration.
++       */
++      acpi_boot_table_init();
++
++      early_acpi_boot_init();
++
++      initmem_init();
++      memblock_find_dma_reserve();
++      dma32_reserve_bootmem();
++
++#ifdef CONFIG_KVM_CLOCK
++      kvmclock_init();
++#endif
++
++      x86_init.paging.pagetable_setup_start(swapper_pg_dir);
++      paging_init();
++      x86_init.paging.pagetable_setup_done(swapper_pg_dir);
++
++      if (boot_cpu_data.cpuid_level >= 0) {
++              /* A CPU has %cr4 if and only if it has CPUID */
++              mmu_cr4_features = read_cr4();
++      }
++
++#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
++      /* sync back kernel address range */
++      clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
++                      swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
++                      KERNEL_PGD_PTRS);
++#endif
++
++      tboot_probe();
++
++#ifdef CONFIG_X86_64
++      map_vsyscall();
++#endif
++
++#ifdef CONFIG_XEN
++#ifdef CONFIG_KEXEC
++      xen_machine_kexec_setup_resources();
++#endif
++      p2m_pages = max_pfn;
++      if (xen_start_info->nr_pages > max_pfn) {
++              /*
++               * the max_pfn was shrunk (probably by mem= or highmem=
++               * kernel parameter); shrink reservation with the HV
++               */
++              struct xen_memory_reservation reservation = {
++                      .address_bits = 0,
++                      .extent_order = 0,
++                      .domid = DOMID_SELF
++              };
++              unsigned int difference;
++              int ret;
++
++              difference = xen_start_info->nr_pages - max_pfn;
++
++              set_xen_guest_handle(reservation.extent_start,
++                                   phys_to_machine_mapping + max_pfn);
++              reservation.nr_extents = difference;
++              ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
++                                         &reservation);
++              BUG_ON(ret != difference);
++      }
++      else if (max_pfn > xen_start_info->nr_pages)
++              p2m_pages = xen_start_info->nr_pages;
++
++      if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++              unsigned long i, j, size;
++              unsigned int k, fpp;
++
++              /* Make sure we have a large enough P->M table. */
++              phys_to_machine_mapping = alloc_bootmem_pages(
++                      max_pfn * sizeof(unsigned long));
++              memcpy(phys_to_machine_mapping,
++                     __va(__pa(xen_start_info->mfn_list)),
++                     p2m_pages * sizeof(unsigned long));
++              memset(phys_to_machine_mapping + p2m_pages, ~0,
++                     (max_pfn - p2m_pages) * sizeof(unsigned long));
++
++#ifdef CONFIG_X86_64
++              if (xen_start_info->mfn_list == VMEMMAP_START) {
++                      /*
++                       * Since it is well isolated we can (and since it is
++                       * perhaps large we should) also free the page tables
++                       * mapping the initial P->M table.
++                       */
++                      unsigned long va = VMEMMAP_START, pa;
++                      pgd_t *pgd = pgd_offset_k(va);
++                      pud_t *pud_page = pud_offset(pgd, 0);
++
++                      BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
++                      xen_l4_entry_update(pgd, __pgd(0));
++                      for(;;) {
++                              pud_t *pud = pud_page + pud_index(va);
++
++                              if (pud_none(*pud))
++                                      va += PUD_SIZE;
++                              else if (pud_large(*pud)) {
++                                      pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
++                                      make_pages_writable(__va(pa),
++                                              PUD_SIZE >> PAGE_SHIFT,
++                                              XENFEAT_writable_page_tables);
++                                      free_bootmem(pa, PUD_SIZE);
++                                      va += PUD_SIZE;
++                              } else {
++                                      pmd_t *pmd = pmd_offset(pud, va);
++
++                                      if (pmd_large(*pmd)) {
++                                              pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
++                                              make_pages_writable(__va(pa),
++                                                      PMD_SIZE >> PAGE_SHIFT,
++                                                      XENFEAT_writable_page_tables);
++                                              free_bootmem(pa, PMD_SIZE);
++                                      } else if (!pmd_none(*pmd)) {
++                                              pte_t *pte = pte_offset_kernel(pmd, va);
++
++                                              for (i = 0; i < PTRS_PER_PTE; ++i) {
++                                                      if (pte_none(pte[i]))
++                                                              break;
++                                                      pa = pte_pfn(pte[i]) << PAGE_SHIFT;
++                                                      make_page_writable(__va(pa),
++                                                              XENFEAT_writable_page_tables);
++                                                      free_bootmem(pa, PAGE_SIZE);
++                                              }
++                                              ClearPagePinned(virt_to_page(pte));
++                                              make_page_writable(pte,
++                                                      XENFEAT_writable_page_tables);
++                                              free_bootmem(__pa(pte), PAGE_SIZE);
++                                      }
++                                      va += PMD_SIZE;
++                                      if (pmd_index(va))
++                                              continue;
++                                      ClearPagePinned(virt_to_page(pmd));
++                                      make_page_writable(pmd,
++                                              XENFEAT_writable_page_tables);
++                                      free_bootmem(__pa((unsigned long)pmd
++                                                        & PAGE_MASK),
++                                              PAGE_SIZE);
++                              }
++                              if (!pud_index(va))
++                                      break;
++                      }
++                      ClearPagePinned(virt_to_page(pud_page));
++                      make_page_writable(pud_page,
++                              XENFEAT_writable_page_tables);
++                      free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
++                              PAGE_SIZE);
++              } else if (!WARN_ON(xen_start_info->mfn_list
++                                  < __START_KERNEL_map))
++#endif
++                      free_bootmem(__pa(xen_start_info->mfn_list),
++                              PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
++                                              sizeof(unsigned long))));
++
++
++              /*
++               * Initialise the list of the frames that specify the list of
++               * frames that make up the p2m table. Used by save/restore.
++               */
++              fpp = PAGE_SIZE/sizeof(unsigned long);
++              size = (max_pfn + fpp - 1) / fpp;
++              size = (size + fpp - 1) / fpp;
++              ++size; /* include a zero terminator for crash tools */
++              size *= sizeof(unsigned long);
++              pfn_to_mfn_frame_list_list = alloc_bootmem_pages(size);
++              if (size > PAGE_SIZE
++                  && xen_create_contiguous_region((unsigned long)
++                                                  pfn_to_mfn_frame_list_list,
++                                                  get_order(size), 0))
++                      BUG();
++              size -= sizeof(unsigned long);
++              pfn_to_mfn_frame_list = alloc_bootmem(size);
++
++              for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) {
++                      if (j == fpp)
++                              j = 0;
++                      if (j == 0) {
++                              k++;
++                              BUG_ON(k * sizeof(unsigned long) >= size);
++                              pfn_to_mfn_frame_list[k] =
++                                      alloc_bootmem_pages(PAGE_SIZE);
++                              pfn_to_mfn_frame_list_list[k] =
++                                      virt_to_mfn(pfn_to_mfn_frame_list[k]);
++                      }
++                      pfn_to_mfn_frame_list[k][j] =
++                              virt_to_mfn(&phys_to_machine_mapping[i]);
++              }
++              HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
++              HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
++                      virt_to_mfn(pfn_to_mfn_frame_list_list);
++      }
++
++      /* Mark all ISA DMA channels in-use - using them wouldn't work. */
++      for (i = 0; i < MAX_DMA_CHANNELS; ++i)
++              if (i != 4 && request_dma(i, "xen") != 0)
++                      BUG();
++#else /* CONFIG_XEN */
++      generic_apic_probe();
++
++      early_quirks();
++#endif
++
++      /*
++       * Read APIC and some other early information from ACPI tables.
++       */
++      acpi_boot_init();
++      sfi_init();
++      x86_dtb_init();
++
++      /*
++       * get boot-time SMP configuration:
++       */
++      if (smp_found_config)
++              get_smp_config();
++
++      prefill_possible_map();
++
++      init_cpu_to_node();
++
++#ifndef CONFIG_XEN
++      init_apic_mappings();
++      ioapic_and_gsi_init();
++
++      kvm_guest_init();
++
++      e820_reserve_resources();
++      e820_mark_nosave_regions(max_low_pfn);
++#else
++      if (is_initial_xendomain())
++              e820_reserve_resources();
++#endif
++
++      x86_init.resources.reserve_resources();
++
++#ifndef CONFIG_XEN
++      e820_setup_gap();
++
++#ifdef CONFIG_VT
++#if defined(CONFIG_VGA_CONSOLE)
++      if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
++              conswitchp = &vga_con;
++#elif defined(CONFIG_DUMMY_CONSOLE)
++      conswitchp = &dummy_con;
++#endif
++#endif
++#else /* CONFIG_XEN */
++      if (is_initial_xendomain())
++              e820_setup_gap();
++
++#ifdef CONFIG_VT
++#ifdef CONFIG_DUMMY_CONSOLE
++      conswitchp = &dummy_con;
++#endif
++#ifdef CONFIG_VGA_CONSOLE
++      if (is_initial_xendomain())
++              conswitchp = &vga_con;
++#endif
++#endif
++#endif /* CONFIG_XEN */
++      x86_init.oem.banner();
++
++      x86_init.timers.wallclock_init();
++
++      mcheck_init();
++
++      local_irq_save(flags);
++      arch_init_ideal_nop5();
++      local_irq_restore(flags);
++}
++
++#ifdef CONFIG_X86_32
++
++static struct resource video_ram_resource = {
++      .name   = "Video RAM area",
++      .start  = 0xa0000,
++      .end    = 0xbffff,
++      .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
++};
++
++void __init i386_reserve_resources(void)
++{
++      if (is_initial_xendomain())     /* video_ram_resource is claimed in the initial domain only */
++              request_resource(&iomem_resource, &video_ram_resource); /* return value is ignored */
++      reserve_standard_io_resources();        /* done regardless of domain type */
++}
++
++#endif /* CONFIG_X86_32 */
++
++#ifdef CONFIG_XEN
++static int
++xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
++{
++      HYPERVISOR_shutdown(SHUTDOWN_crash);    /* none of the notifier arguments are used */
++      /* we're never actually going to get here, but a notifier must return a value */
++      return NOTIFY_DONE;
++}
++#endif /* CONFIG_XEN */
diff --cc arch/x86/kernel/setup_percpu.c

index 71f4727,71f4727..d748199
--- 1/arch/x86/kernel/setup_percpu.c
--- 2/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@@ -219,6 -219,6 +219,7 @@@ void __init setup_per_cpu_areas(void
                  * are zeroed indicating that the static arrays are
                  * gone.
                  */
++#ifndef CONFIG_XEN
   #ifdef CONFIG_X86_LOCAL_APIC
                 per_cpu(x86_cpu_to_apicid, cpu) =
                         early_per_cpu_map(x86_cpu_to_apicid, cpu);
@@@ -229,6 -229,6 +230,7 @@@
                 per_cpu(x86_cpu_to_logical_apicid, cpu) =
                         early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
   #endif
++#endif
   #ifdef CONFIG_X86_64
                 per_cpu(irq_stack_ptr, cpu) =
                         per_cpu(irq_stack_union.irq_stack, cpu) +
@@@ -256,6 -256,6 +258,7 @@@
         }
   
         /* indicate the early static arrays will soon be gone */
++#ifndef CONFIG_XEN
   #ifdef CONFIG_X86_LOCAL_APIC
         early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
         early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
@@@ -263,6 -263,6 +266,7 @@@
   #ifdef CONFIG_X86_32
         early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
   #endif
++#endif
   #ifdef CONFIG_NUMA
         early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
   #endif
diff --cc arch/x86/kernel/smp-xen.c

index 0000000,0000000..e4df2c7

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/smp-xen.c
@@@ -1,0 -1,0 +1,195 @@@
++/*
++ *    Intel SMP support routines.
++ *
++ *    (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
++ *    (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
++ *      (c) 2002,2003 Andi Kleen, SuSE Labs.
++ *
++ *    i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
++ *
++ *    This code is released under the GNU General Public License version 2 or
++ *    later.
++ */
++
++#include <linux/init.h>
++
++#include <linux/mm.h>
++#include <linux/delay.h>
++#include <linux/spinlock.h>
++#include <linux/kernel_stat.h>
++#include <linux/mc146818rtc.h>
++#include <linux/cache.h>
++#include <linux/interrupt.h>
++#include <linux/cpu.h>
++#include <linux/gfp.h>
++
++#include <asm/mtrr.h>
++#include <asm/tlbflush.h>
++#include <asm/mmu_context.h>
++#include <asm/proto.h>
++#include <asm/ipi.h>
++#include <xen/evtchn.h>
++/*
++ *    Some notes on x86 processor bugs affecting SMP operation:
++ *
++ *    Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
++ *    The Linux implications for SMP are handled as follows:
++ *
++ *    Pentium III / [Xeon]
++ *            None of the E1AP-E3AP errata are visible to the user.
++ *
++ *    E1AP.   see PII A1AP
++ *    E2AP.   see PII A2AP
++ *    E3AP.   see PII A3AP
++ *
++ *    Pentium II / [Xeon]
++ *            None of the A1AP-A3AP errata are visible to the user.
++ *
++ *    A1AP.   see PPro 1AP
++ *    A2AP.   see PPro 2AP
++ *    A3AP.   see PPro 7AP
++ *
++ *    Pentium Pro
++ *            None of 1AP-9AP errata are visible to the normal user,
++ *    except occasional delivery of 'spurious interrupt' as trap #15.
++ *    This is very rare and a non-problem.
++ *
++ *    1AP.    Linux maps APIC as non-cacheable
++ *    2AP.    worked around in hardware
++ *    3AP.    fixed in C0 and above steppings microcode update.
++ *            Linux does not use excessive STARTUP_IPIs.
++ *    4AP.    worked around in hardware
++ *    5AP.    symmetric IO mode (normal Linux operation) not affected.
++ *            'noapic' mode has vector 0xf filled out properly.
++ *    6AP.    'noapic' mode might be affected - fixed in later steppings
++ *    7AP.    We do not assume writes to the LVT deasserting IRQs
++ *    8AP.    We do not enable low power mode (deep sleep) during MP bootup
++ *    9AP.    We do not use mixed mode
++ *
++ *    Pentium
++ *            There is a marginal case where REP MOVS on 100MHz SMP
++ *    machines with B stepping processors can fail. XXX should provide
++ *    an L1cache=Writethrough or L1cache=off option.
++ *
++ *            B stepping CPUs may hang. There are hardware work arounds
++ *    for this. We warn about it in case your board doesn't have the work
++ *    arounds. Basically that's so I can tell anyone with a B stepping
++ *    CPU and SMP problems "tough".
++ *
++ *    Specific items [From Pentium Processor Specification Update]
++ *
++ *    1AP.    Linux doesn't use remote read
++ *    2AP.    Linux doesn't trust APIC errors
++ *    3AP.    We work around this
++ *    4AP.    Linux never generated 3 interrupts of the same priority
++ *            to cause a lost local interrupt.
++ *    5AP.    Remote read is never used
++ *    6AP.    not affected - worked around in hardware
++ *    7AP.    not affected - worked around in hardware
++ *    8AP.    worked around in hardware - we get explicit CS errors if not
++ *    9AP.    only 'noapic' mode affected. Might generate spurious
++ *            interrupts, we log only the first one and count the
++ *            rest silently.
++ *    10AP.   not affected - worked around in hardware
++ *    11AP.   Linux reads the APIC between writes to avoid this, as per
++ *            the documentation. Make sure you preserve this as it affects
++ *            the C stepping chips too.
++ *    12AP.   not affected - worked around in hardware
++ *    13AP.   not affected - worked around in hardware
++ *    14AP.   we always deassert INIT during bootup
++ *    15AP.   not affected - worked around in hardware
++ *    16AP.   not affected - worked around in hardware
++ *    17AP.   not affected - worked around in hardware
++ *    18AP.   not affected - worked around in hardware
++ *    19AP.   not affected - worked around in BIOS
++ *
++ *    If this sounds worrying believe me these bugs are either ___RARE___,
++ *    or are signal timing bugs worked around in hardware and there's
++ *    about nothing of note with C stepping upwards.
++ */
++
++/*
++ * Send a 'reschedule' IPI to @cpu.  Nothing is serialized on the way, so
++ * the worst case is a lost reschedule.  If @cpu is offline a warning is
++ * raised and no IPI is sent.
++ */
++void xen_smp_send_reschedule(int cpu)
++{
++      if (likely(!cpu_is_offline(cpu)))
++              xen_send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
++      else
++              WARN_ON(1);
++}
++
++void xen_send_call_func_single_ipi(int cpu)
++{     /* raise CALL_FUNC_SINGLE_VECTOR on @cpu only */
++      xen_send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR);
++}
++
++void xen_send_call_func_ipi(const struct cpumask *mask)
++{     /* raise CALL_FUNCTION_VECTOR for @mask through the "allbutself" sender */
++      xen_send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR);
++}
++
++/*
++ * Stops the CPU this runs on by calling stop_this_cpu().  Presumably the
++ * receiving end of the REBOOT_VECTOR IPI sent by xen_stop_other_cpus().
++ */
++
++void smp_reboot_interrupt(struct pt_regs *regs)
++{
++      stop_this_cpu(NULL);
++}
++
++void xen_stop_other_cpus(int wait)
++{     /* wait != 0: poll without a time limit until the other CPUs are offline */
++      unsigned long flags;
++      unsigned long timeout;  /* remaining 1us polls when !wait */
++
++      /*
++       * Use a dedicated vector here because smp_call_function
++       * does lots of things not suitable in a panic situation.
++       * On most systems we could also use an NMI here,
++       * but there are a few systems around where NMI
++       * is problematic, so stay with a non-NMI for now
++       * (this implies we currently cannot stop CPUs that are
++       * spinning with irqs off).
++       */
++      if (num_online_cpus() > 1) {
++              xen_send_IPI_allbutself(REBOOT_VECTOR);
++
++              /*
++               * Don't wait longer than a second if the caller
++               * didn't ask us to wait.
++               */
++              timeout = USEC_PER_SEC;
++              while (num_online_cpus() > 1 && (wait || timeout--))
++                      udelay(1);
++      }
++
++      local_irq_save(flags);
++      disable_all_local_evtchn();     /* called with local interrupts disabled */
++      local_irq_restore(flags);
++}
++
++/*
++ * Reschedule call back. Only the irq_resched_count statistic is
++ * bumped here; all the work is done automatically when
++ * we return from the interrupt.
++ */
++void smp_reschedule_interrupt(struct pt_regs *regs)
++{
++      inc_irq_stat(irq_resched_count);
++}
++
++void smp_call_function_interrupt(struct pt_regs *regs)
++{     /* @regs is not used here */
++      generic_smp_call_function_interrupt();  /* hand off to the generic handler */
++      inc_irq_stat(irq_call_count);   /* counted after the handler has run */
++}
++
++void smp_call_function_single_interrupt(struct pt_regs *regs)
++{     /* @regs is not used here */
++      generic_smp_call_function_single_interrupt();   /* hand off to the generic handler */
++      inc_irq_stat(irq_call_count);   /* shares its counter with smp_call_function_interrupt() */
++}
diff --cc arch/x86/kernel/time-xen.c

index 0000000,0000000..9860a7e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/time-xen.c
@@@ -1,0 -1,0 +1,624 @@@
++/*
++ *  Copyright (c) 1991,1992,1995  Linus Torvalds
++ *  Copyright (c) 1994  Alan Modra
++ *  Copyright (c) 1995  Markus Kuhn
++ *  Copyright (c) 1996  Ingo Molnar
++ *  Copyright (c) 1998  Andrea Arcangeli
++ *  Copyright (c) 2002,2006  Vojtech Pavlik
++ *  Copyright (c) 2003  Andi Kleen
++ *
++ */
++
++#include <linux/init.h>
++#include <linux/interrupt.h>
++#include <linux/time.h>
++#include <linux/sysctl.h>
++#include <linux/percpu.h>
++#include <linux/kernel_stat.h>
++#include <linux/posix-timers.h>
++#include <linux/cpufreq.h>
++#include <linux/clocksource.h>
++#include <linux/sysdev.h>
++
++#include <asm/vsyscall.h>
++#include <asm/delay.h>
++#include <asm/time.h>
++#include <asm/timer.h>
++
++#include <xen/clock.h>
++#include <xen/sysctl.h>
++#include <xen/interface/vcpu.h>
++
++#include <asm/i8253.h>
++DEFINE_RAW_SPINLOCK(i8253_lock);
++EXPORT_SYMBOL(i8253_lock);
++
++#ifdef CONFIG_X86_64
++volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
++#endif
++
++#define XEN_SHIFT 22
++
++unsigned int cpu_khz; /* Detected as we calibrate the TSC */
++EXPORT_SYMBOL(cpu_khz);
++
++/* These are periodically updated in shared_info, and then copied here. */
++struct shadow_time_info {
++      u64 tsc_timestamp;     /* TSC at last update of time vals.  */
++      u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
++      u32 tsc_to_nsec_mul;   /* Copy of Xen's tsc_to_system_mul.  */
++      u32 tsc_to_usec_mul;   /* tsc_to_nsec_mul / 1000.           */
++      int tsc_shift;         /* Shift applied before multiplying. */
++      u32 version;           /* Xen's version when values copied. */
++};
++static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
++static struct timespec shadow_tv;       /* wc_sec/wc_nsec as last read from shared_info */
++static u32 shadow_tv_version;   /* wc_version that shadow_tv was read under */
++
++static u64 jiffies_bias, system_time_bias;      /* offsets used by processed_system_time() */
++
++/* Current runstate of each CPU (updated automatically by the hypervisor). */
++DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
++
++/* Must be signed, as it's compared with s64 quantities which can be negative. */
++#define NS_PER_TICK (1000000000LL/HZ)
++
++/*
++ * Does this guest OS track Xen time, or set its wall clock independently?
++ * Off (0) by default; the "independent_wallclock" boot option turns it on.
++ */
++static int independent_wallclock = 0;
++static int __init __independent_wallclock(char *str)
++{
++      independent_wallclock = 1;      /* @str is not examined */
++      return 1;
++}
++__setup("independent_wallclock", __independent_wallclock);
++
++int xen_independent_wallclock(void)
++{
++      return independent_wallclock;   /* non-zero once the boot option has been seen */
++}
++
++/*
++ * Permitted clock jitter, in nsecs, beyond which a warning will be printed.
++ * Overridable with the "permitted_clock_jitter=" boot option; the value is
++ * parsed by simple_strtoul() with base 0 and is not range-checked.
++ */
++static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
++static int __init __permitted_clock_jitter(char *str)
++{
++      permitted_clock_jitter = simple_strtoul(str, NULL, 0);
++      return 1;
++}
++__setup("permitted_clock_jitter=", __permitted_clock_jitter);
++
++/*
++ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
++ * yielding a 64-bit result.
++ *
++ * @delta is first shifted left by @shift (or right by -@shift when @shift
++ * is negative) and then multiplied by @mul_frac, taken as a 32-bit binary
++ * fraction: the value returned is the shifted delta times @mul_frac,
++ * divided by 2^32 and truncated to 64 bits.
++ */
++static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
++{
++      u64 product;
++#ifdef __i386__
++      u32 tmp1, tmp2;
++#else
++      u64 hi; /* dummy output: MUL leaves the high half of the product in RDX */
++#endif
++
++      if (shift < 0)
++              delta >>= -shift;
++      else
++              delta <<= shift;
++
++#ifdef __i386__
++      /* Two 32x32 multiplies; the middle 64 bits of the product are kept. */
++      __asm__ (
++              "mul  %5       ; "
++              "mov  %4,%%eax ; "
++              "mov  %%edx,%4 ; "
++              "mul  %5       ; "
++              "xor  %5,%5    ; "
++              "add  %4,%%eax ; "
++              "adc  %5,%%edx ; "
++              : "=A" (product), "=r" (tmp1), "=r" (tmp2)
++              : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
++#else
++      /*
++       * MUL overwrites RDX, so RDX has to be declared as an output (tied
++       * to the mul_frac input).  With an input-only "d" operand the
++       * compiler may assume RDX still holds mul_frac after the asm.
++       */
++      __asm__ (
++              "mul %%rdx ; shrd $32,%%rdx,%%rax"
++              : "=a" (product), "=d" (hi)
++              : "0" (delta), "1" ((u64)mul_frac) );
++#endif
++
++      return product;
++}
++
++static inline u64 get64(volatile u64 *ptr)
++{     /* read *ptr as one 64-bit unit, also on 32-bit builds */
++#ifndef CONFIG_64BIT
++      u64 res;
++      __asm__("movl %%ebx,%%eax\n"    /* make the comparand (edx:eax) equal  */
++              "movl %%ecx,%%edx\n"    /* to the replacement value (ecx:ebx)  */
++              LOCK_PREFIX "cmpxchg8b %1"      /* edx:eax ends up holding *ptr's value */
++              : "=&A" (res) : "m" (*ptr));
++      return res;
++#else
++      return *ptr;
++#endif
++}
++
++static inline u64 get64_local(volatile u64 *ptr)
++{     /* same as get64(), but the cmpxchg8b is issued without LOCK_PREFIX */
++#ifndef CONFIG_64BIT
++      u64 res;
++      __asm__("movl %%ebx,%%eax\n"    /* make the comparand (edx:eax) equal  */
++              "movl %%ecx,%%edx\n"    /* to the replacement value (ecx:ebx)  */
++              "cmpxchg8b %1"  /* edx:eax ends up holding *ptr's value */
++              : "=&A" (res) : "m" (*ptr));
++      return res;
++#else
++      return *ptr;
++#endif
++}
++
++static void init_cpu_khz(void)
++{     /* derive cpu_khz from VCPU 0's TSC scaling parameters */
++      u64 __cpu_khz = 1000000ULL << 32;       /* 10^6 scaled by 2^32 */
++      struct vcpu_time_info *info = &vcpu_info(0)->time;
++      do_div(__cpu_khz, info->tsc_to_system_mul);     /* divides __cpu_khz in place */
++      if (info->tsc_shift < 0)
++              cpu_khz = __cpu_khz << -info->tsc_shift;        /* opposite direction to scale_delta() */
++      else
++              cpu_khz = __cpu_khz >> info->tsc_shift; /* opposite direction to scale_delta() */
++}
++
++/*
++ * Nanoseconds elapsed since @shadow's TSC timestamp was taken: the
++ * current TSC minus that timestamp, scaled with @shadow's multiplier
++ * and shift.
++ */
++static u64 get_nsec_offset(struct shadow_time_info *shadow)
++{
++      u64 tsc;
++
++      rdtscll(tsc);
++
++      return scale_delta(tsc - shadow->tsc_timestamp,
++                         shadow->tsc_to_nsec_mul,
++                         shadow->tsc_shift);
++}
++
++/*
++ * Convert a jiffies count into system time in nanoseconds: the ticks
++ * elapsed since jiffies_bias, scaled by NS_PER_TICK, on top of
++ * system_time_bias.
++ */
++static inline u64 processed_system_time(u64 jiffies_64)
++{
++      u64 ticks = jiffies_64 - jiffies_bias;
++
++      return system_time_bias + ticks * NS_PER_TICK;
++}
++
++static void update_wallclock(void)
++{     /* refresh shadow_tv from shared_info; unless independent, also set the system time */
++      static DEFINE_MUTEX(uwc_mutex); /* serializes callers of this function */
++      shared_info_t *s = HYPERVISOR_shared_info;
++
++      mutex_lock(&uwc_mutex);
++
++      do {
++              shadow_tv_version = s->wc_version;
++              rmb();
++              shadow_tv.tv_sec  = s->wc_sec;
++              shadow_tv.tv_nsec = s->wc_nsec;
++              rmb();
++      } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));    /* retry if the version is odd or has changed */
++
++      if (!independent_wallclock) {
++              u64 tmp = processed_system_time(get_jiffies_64());
++              long nsec = do_div(tmp, NSEC_PER_SEC);  /* tmp becomes seconds, nsec the remainder */
++              struct timespec tv;
++
++              set_normalized_timespec(&tv, shadow_tv.tv_sec + tmp,
++                                      shadow_tv.tv_nsec + nsec);
++              do_settimeofday(&tv);   /* return value is not checked */
++      }
++
++      mutex_unlock(&uwc_mutex);
++}
++
++static void _update_wallclock(struct work_struct *unused)
++{     /* work-item wrapper so update_wallclock() can be run via schedule_work() */
++      update_wallclock();
++}
++static DECLARE_WORK(update_wallclock_work, _update_wallclock);
++
++/*
++ * Queue update_wallclock_work when the wall-clock version published in
++ * shared_info differs from the one shadow_tv was read under.  Nothing
++ * is queued while keventd is not up yet.
++ */
++void xen_check_wallclock_update(void)
++{
++      if (shadow_tv_version == HYPERVISOR_shared_info->wc_version)
++              return;
++
++      if (keventd_up())
++              schedule_work(&update_wallclock_work);
++}
++
++/*
++ * Reads a consistent set of time-base values from Xen, into a shadow data
++ * area.  The copy is repeated until src->version is even and identical
++ * before and after it; local interrupts are disabled throughout.
++ */
++static void get_time_values_from_xen(unsigned int cpu)
++{
++      struct vcpu_time_info   *src;   /* Xen's time info for @cpu */
++      struct shadow_time_info *dst;   /* @cpu's shadow copy */
++      unsigned long flags;
++      u32 pre_version, post_version;
++
++      src = &vcpu_info(cpu)->time;
++      dst = &per_cpu(shadow_time, cpu);
++
++      local_irq_save(flags);
++
++      do {
++              pre_version = dst->version = src->version;
++              rmb();
++              dst->tsc_timestamp     = src->tsc_timestamp;
++              dst->system_timestamp  = src->system_time;
++              dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
++              dst->tsc_shift         = src->tsc_shift;
++              rmb();
++              post_version = src->version;
++      } while ((pre_version & 1) | (pre_version ^ post_version));
++
++      dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;     /* derived locally, not read from Xen */
++
++      local_irq_restore(flags);
++}
++
++static inline int time_values_up_to_date(void)
++{     /* non-zero if this CPU's shadow version equals Xen's current time version */
++      rmb();
++      return percpu_read(shadow_time.version) == vcpu_info_read(time.version);
++}
++
++static void sync_xen_wallclock(unsigned long dummy);
++static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
++static void sync_xen_wallclock(unsigned long dummy)
++{     /* timer callback: hand the current time to Xen with XENPF_settime */
++      struct timespec now, ignore;
++      struct xen_platform_op op;
++
++      BUG_ON(!is_initial_xendomain());
++      if (!ntp_synced() || independent_wallclock)
++              return; /* note: the timer is not re-armed on this path */
++
++      get_xtime_and_monotonic_and_sleep_offset(&now, &ignore, &ignore);
++      set_normalized_timespec(&now, now.tv_sec, now.tv_nsec);
++
++      op.cmd = XENPF_settime;
++      op.u.settime.secs        = now.tv_sec;
++      op.u.settime.nsecs       = now.tv_nsec;
++      op.u.settime.system_time = processed_system_time(get_jiffies_64());
++      WARN_ON(HYPERVISOR_platform_op(&op));   /* a non-zero result only triggers a warning */
++
++      update_wallclock();
++
++      /* Once per minute. */
++      mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
++}
++
++unsigned long long xen_local_clock(void)
++{     /* shadow system_timestamp plus the TSC-derived nanoseconds since it was taken */
++      unsigned int cpu = get_cpu();   /* paired with put_cpu() below */
++      struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
++      u64 time;
++      u32 local_time_version;
++
++      do {
++              local_time_version = shadow->version;
++              rdtsc_barrier();
++              time = shadow->system_timestamp + get_nsec_offset(shadow);
++              if (!time_values_up_to_date())
++                      get_time_values_from_xen(cpu);  /* refresh the stale shadow values */
++              barrier();
++      } while (local_time_version != shadow->version);        /* recompute if the shadow version changed */
++
++      put_cpu();
++
++      return time;
++}
++
++/*
++ * Runstate accounting
++ *
++ * Copy this CPU's runstate into @res.  The copy is redone until
++ * state_entry_time reads the same before and after it.  Must be called
++ * with preemption disabled; warns once if the copied state is not
++ * RUNSTATE_running.
++ */
++void get_runstate_snapshot(struct vcpu_runstate_info *res)
++{
++      u64 state_time;
++      struct vcpu_runstate_info *state;
++
++      BUG_ON(preemptible());
++
++      state = &__get_cpu_var(runstate);
++
++      do {
++              state_time = get64_local(&state->state_entry_time);
++              *res = *state;
++      } while (get64_local(&state->state_entry_time) != state_time);
++
++      WARN_ON_ONCE(res->state != RUNSTATE_running);
++}
++
++/*
++ * Xen sched_clock implementation.  Returns the number of unstolen
++ * nanoseconds, i.e. the time the VCPU has spent in the RUNNING and
++ * BLOCKED states, plus the time elapsed since the current state was
++ * entered (clamped at zero).
++ */
++unsigned long long sched_clock(void)
++{
++      struct vcpu_runstate_info snap;
++      cycle_t now;
++      s64 since_entry;
++      u64 unstolen;
++
++      /*
++       * sched_clock ought to be called per-cpu with preemption already
++       * off, but callers do not guarantee that today, so do it here.
++       */
++      preempt_disable();
++
++      now = xen_local_clock();
++      get_runstate_snapshot(&snap);
++
++      since_entry = now - snap.state_entry_time;
++
++      unstolen = snap.time[RUNSTATE_running] + snap.time[RUNSTATE_blocked];
++      /* A negative offset is treated as zero. */
++      if (since_entry > 0)
++              unstolen += since_entry;
++
++      preempt_enable();
++
++      return unstolen;
++}
++
++/*
++ * Program counter to attribute a profiling sample to.  For kernel-mode
++ * samples that land inside a lock function, report the caller's return
++ * address instead of the lock function itself.
++ */
++unsigned long profile_pc(struct pt_regs *regs)
++{
++      unsigned long ip = instruction_pointer(regs);
++
++      if (user_mode_vm(regs) || !in_lock_functions(ip))
++              return ip;
++
++#ifdef CONFIG_FRAME_POINTER
++      return *(unsigned long *)(regs->bp + sizeof(long));
++#else
++      {
++              unsigned long *stack =
++                      (unsigned long *)kernel_stack_pointer(regs);
++
++              /*
++               * The return address sits either right at the stack
++               * pointer or just above a saved flags word.  Eflags has
++               * bits 22-31 zero, kernel addresses don't.
++               */
++              if (stack[0] >> 22)
++                      return stack[0];
++              if (stack[1] >> 22)
++                      return stack[1];
++      }
++
++      return ip;
++#endif
++}
++EXPORT_SYMBOL(profile_pc);
++
++/*
++ * Flag the TSC as unstable.  Under CONFIG_XEN this is a no-op and
++ * 'reason' is unused.
++ */
++void mark_tsc_unstable(char *reason)
++{
++#if !defined(CONFIG_XEN)
++      tsc_unstable = 1;
++#endif
++      /* XXX Under Xen the hypervisor should be told about this fact. */
++}
++EXPORT_SYMBOL_GPL(mark_tsc_unstable);
++
++/*
++ * Most recent value handed out by xen_clocksource_read() on SMP;
++ * reset in xen_clocksource_resume().
++ */
++static cycle_t cs_last;
++
++/*
++ * Clocksource read hook.  On SMP the result is kept from going below
++ * cs_last: if this CPU's clock is behind the recorded value, the
++ * recorded value is returned instead.  On UP this is xen_local_clock().
++ */
++static cycle_t xen_clocksource_read(struct clocksource *cs)
++{
++#ifdef CONFIG_SMP
++      cycle_t last = get64(&cs_last);
++      cycle_t ret = xen_local_clock();
++
++      if (unlikely((s64)(ret - last) < 0)) {
++              /* Only complain when the gap exceeds the configured jitter. */
++              if (last - ret > permitted_clock_jitter
++                  && printk_ratelimit()) {
++                      unsigned int cpu = get_cpu();
++                      struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
++
++                      printk(KERN_WARNING "clocksource/%u: "
++                             "Time went backwards: "
++                             "ret=%Lx delta=%Ld shadow=%Lx offset=%Lx\n",
++                             cpu, ret, ret - last, shadow->system_timestamp,
++                             get_nsec_offset(shadow));
++                      put_cpu();
++              }
++              return last;
++      }
++
++      /*
++       * Publish ret as the new cs_last.  Stop once our value has been
++       * installed, or once the value found there is already ahead of ret.
++       */
++      for (;;) {
++              cycle_t cur = cmpxchg64(&cs_last, last, ret);
++
++              if (cur == last || (s64)(ret - cur) < 0)
++                      return ret;
++              last = cur;
++      }
++#else
++      return xen_local_clock();
++#endif
++}
++
++/*
++ * Clocksource resume hook: re-read the CPU frequency, refresh every
++ * online CPU's shadow time values, then re-base the jiffies/system-time
++ * biases and cs_last.
++ *
++ * No locking required. Interrupts are disabled on all CPUs.
++ */
++static void xen_clocksource_resume(struct clocksource *cs)
++{
++      unsigned int i;
++
++      init_cpu_khz();
++
++      for_each_online_cpu(i) {
++              get_time_values_from_xen(i);
++      }
++
++      jiffies_bias = get_jiffies_64();
++      system_time_bias = per_cpu(shadow_time, 0).system_timestamp;
++
++      cs_last = xen_local_clock();
++}
++
++/* Clocksource backed by xen_clocksource_read(); registered from time_init(). */
++static struct clocksource clocksource_xen = {
++      .name                   = "xen",
++      .rating                 = 400,
++      .flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
++      .mask                   = CLOCKSOURCE_MASK(64),
++      /* mult == 1 << shift: time directly in nanoseconds */
++      .mult                   = 1 << XEN_SHIFT,
++      .shift                  = XEN_SHIFT,
++      .read                   = xen_clocksource_read,
++      .resume                 = xen_clocksource_resume,
++};
++
++/*
++ * Register the per-cpu runstate area of 'cpu' with the hypervisor and
++ * return a pointer to it.  If registration fails the area is zeroed and
++ * any error other than -ENOSYS triggers a warning.
++ */
++struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu)
++{
++      struct vcpu_runstate_info *info = &per_cpu(runstate, cpu);
++      struct vcpu_register_runstate_memory_area area;
++      int err;
++
++      set_xen_guest_handle(area.addr.h, info);
++      err = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
++      if (!err)
++              return info;
++
++      /* The zeroed fallback relies on RUNSTATE_running being 0. */
++      BUILD_BUG_ON(RUNSTATE_running);
++      memset(info, 0, sizeof(*info));
++      WARN_ON(err != -ENOSYS);
++
++      return info;
++}
++
++/*
++ * Fill *ts with the wall-clock time derived from the shared-info
++ * wc_sec/wc_nsec fields plus xen_local_clock().  The result is
++ * truncated to whole seconds; tv_nsec is always set to 0.
++ */
++void xen_read_persistent_clock(struct timespec *ts)
++{
++      const shared_info_t *s = HYPERVISOR_shared_info;
++      u32 version, sec, nsec;
++      u64 delta;
++
++      /* Re-read while wc_version is odd or changed across the read. */
++      do {
++              version = s->wc_version;
++              rmb();
++              sec     = s->wc_sec;
++              nsec    = s->wc_nsec;
++              rmb();
++      } while ((s->wc_version & 1) | (version ^ s->wc_version));
++
++      delta = xen_local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
++      /* do_div() leaves the quotient (seconds) in delta. */
++      do_div(delta, NSEC_PER_SEC);
++
++      ts->tv_sec = delta;
++      ts->tv_nsec = 0;
++}
++
++/*
++ * Schedule sync_xen_wallclock() to run on the next jiffy.  Only the
++ * initial Xen domain may do this; other domains get -1.
++ */
++int xen_update_persistent_clock(void)
++{
++      if (is_initial_xendomain()) {
++              mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
++              return 0;
++      }
++
++      return -1;
++}
++
++/*
++ * Deferred part of time initialisation; time_init() installs this as
++ * the late_time_init hook.
++ */
++static void __init _late_time_init(void)
++{
++      update_wallclock();
++      xen_clockevents_init();
++}
++
++/*
++ * Boot-time clock setup: report the CPU frequency, set up CPU 0's
++ * runstate area and shadow time values, record the jiffies/system-time
++ * biases, register the Xen clocksource and defer the rest to
++ * _late_time_init().
++ */
++void __init time_init(void)
++{
++      init_cpu_khz();
++      printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
++             cpu_khz / 1000, cpu_khz % 1000);
++
++      setup_runstate_area(0);
++      get_time_values_from_xen(0);
++
++      /* Biases are taken from the values current at this point. */
++      jiffies_bias     = jiffies_64;
++      system_time_bias = per_cpu(shadow_time, 0).system_timestamp;
++
++      clocksource_register(&clocksource_xen);
++
++      use_tsc_delay();
++
++      /*
++       * Cannot request_irq() until kmem is initialised, and cannot
++       * do_settimeofday() (i.e. clock_was_set()) until interrupts are on.
++       */
++      late_time_init = _late_time_init;
++}
++
++/*
++ * Convert an absolute jiffies value to system time.
++ *
++ * Returns 0 for a very distant timeout (meaning "no pending timer"),
++ * and one tick past the current system time when 'j' is not in the
++ * future.
++ */
++u64 jiffies_to_st(unsigned long j)
++{
++      u64 now64 = get_jiffies_64();
++      long ticks = j - (unsigned long)now64;
++
++      if (ticks >= 1) {
++              /* Very long timeout means there is no pending timer.
++               * We indicate this to Xen by passing zero timeout. */
++              if ((unsigned long)ticks >> (BITS_PER_LONG-3))
++                      return 0;
++
++              return processed_system_time(now64) + ticks * (u64)NS_PER_TICK;
++      }
++
++      /* Also reached in some wrap-around cases, but that's okay:
++       * we just end up with a shorter timeout. */
++      return processed_system_time(now64) + NS_PER_TICK;
++}
++EXPORT_SYMBOL(jiffies_to_st);
++
++#ifdef CONFIG_CPU_FREQ
++/*
++ * cpufreq transition notifier: report the new frequency of the affected
++ * CPU to the hypervisor via XENPF_change_freq.  Nothing is sent for CPUs
++ * with a constant TSC, nor for CPUFREQ_PRECHANGE events.  Always
++ * returns 0.
++ */
++static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
++                              void *data)
++{
++      struct cpufreq_freqs *freqs = data;
++      struct xen_platform_op op;
++
++      if (cpu_has(&cpu_data(freqs->cpu), X86_FEATURE_CONSTANT_TSC) ||
++          val == CPUFREQ_PRECHANGE)
++              return 0;
++
++      op.cmd = XENPF_change_freq;
++      op.u.change_freq.flags = 0;
++      op.u.change_freq.cpu = freqs->cpu;
++      /* freqs->new is scaled by 1000 before being handed to Xen. */
++      op.u.change_freq.freq = (u64)freqs->new * 1000;
++      WARN_ON(HYPERVISOR_platform_op(&op));
++
++      return 0;
++}
++
++/* Notifier block registered by cpufreq_time_setup(). */
++static struct notifier_block time_cpufreq_notifier_block = {
++      .notifier_call  = time_cpufreq_notifier,
++};
++
++/*
++ * Hook time_cpufreq_notifier() into cpufreq transition notifications.
++ *
++ * cpufreq_register_notifier() returns 0 on success and a negative errno
++ * on failure, so the error path is taken on a non-zero result.
++ *
++ * Returns 0 on success, -ENODEV if the notifier could not be registered.
++ */
++static int __init cpufreq_time_setup(void)
++{
++      int rc = cpufreq_register_notifier(&time_cpufreq_notifier_block,
++                                         CPUFREQ_TRANSITION_NOTIFIER);
++
++      if (rc) {
++              printk(KERN_ERR "failed to set up cpufreq notifier\n");
++              return -ENODEV;
++      }
++      return 0;
++}
++
++core_initcall(cpufreq_time_setup);
++#endif
++
++/*
++ * /proc/sys/xen: This really belongs in another file. It can stay here for
++ * now however.
++ *
++ * Entries under the "xen" directory: both tunables are world-readable
++ * and owner-writable (mode 0644).
++ */
++static ctl_table xen_subtable[] = {
++      {
++              .procname       = "independent_wallclock",
++              .mode           = 0644,
++              .data           = &independent_wallclock,
++              .maxlen         = sizeof(independent_wallclock),
++              .proc_handler   = proc_dointvec,
++      },
++      {
++              .procname       = "permitted_clock_jitter",
++              .mode           = 0644,
++              .data           = &permitted_clock_jitter,
++              .maxlen         = sizeof(permitted_clock_jitter),
++              .proc_handler   = proc_doulongvec_minmax,
++      },
++      { }     /* terminator */
++};
++/* Top-level sysctl table: a read-only "xen" directory holding xen_subtable. */
++static ctl_table xen_table[] = {
++      {
++              .procname       = "xen",
++              .child          = xen_subtable,
++              .mode           = 0555,
++      },
++      { }     /* terminator */
++};
++/*
++ * Register xen_table with the sysctl core at initcall time.  The
++ * registration result is explicitly discarded, so this always returns 0.
++ */
++static int __init xen_sysctl_init(void)
++{
++      (void)register_sysctl_table(xen_table);
++      return 0;
++}
++__initcall(xen_sysctl_init);
diff --cc arch/x86/kernel/traps-xen.c

index 0000000,0000000..885104d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/traps-xen.c
@@@ -1,0 -1,0 +1,888 @@@
++/*
++ *  Copyright (C) 1991, 1992  Linus Torvalds
++ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
++ *
++ *  Pentium III FXSR, SSE support
++ *    Gareth Hughes <gareth@valinux.com>, May 2000
++ */
++
++/*
++ * Handle hardware traps and faults.
++ */
++#include <linux/interrupt.h>
++#include <linux/kallsyms.h>
++#include <linux/spinlock.h>
++#include <linux/kprobes.h>
++#include <linux/uaccess.h>
++#include <linux/kdebug.h>
++#include <linux/kgdb.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/ptrace.h>
++#include <linux/string.h>
++#include <linux/delay.h>
++#include <linux/errno.h>
++#include <linux/kexec.h>
++#include <linux/sched.h>
++#include <linux/timer.h>
++#include <linux/init.h>
++#include <linux/bug.h>
++#include <linux/nmi.h>
++#include <linux/mm.h>
++#include <linux/smp.h>
++#include <linux/io.h>
++
++#ifdef CONFIG_EISA
++#include <linux/ioport.h>
++#include <linux/eisa.h>
++#endif
++
++#ifdef CONFIG_MCA
++#include <linux/mca.h>
++#endif
++
++#if defined(CONFIG_EDAC)
++#include <linux/edac.h>
++#endif
++
++#include <asm/kmemcheck.h>
++#include <asm/stacktrace.h>
++#include <asm/processor.h>
++#include <asm/debugreg.h>
++#include <asm/atomic.h>
++#include <asm/system.h>
++#include <asm/traps.h>
++#include <asm/desc.h>
++#include <asm/i387.h>
++#include <asm/mce.h>
++
++#include <asm/mach_traps.h>
++
++#ifdef CONFIG_X86_64
++#include <asm/x86_init.h>
++#include <asm/pgalloc.h>
++#include <asm/proto.h>
++#else
++#include <asm/processor-flags.h>
++#include <asm/setup.h>
++
++/* NOTE(review): presumably the system-call entry point defined elsewhere — confirm. */
++asmlinkage int system_call(void);
++
++/*
++ * Do we ignore FPU interrupts ?
++ * Set to 1 by do_coprocessor_error() on CONFIG_X86_32.
++ */
++char ignore_fpu_irq;
++
++#ifndef CONFIG_X86_NO_IDT
++/*
++ * The IDT has to be page-aligned to simplify the Pentium
++ * F0 0F bug workaround.
++ */
++gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
++#endif
++#endif
++
++#ifndef CONFIG_XEN
++/* Bitmap of NR_VECTORS bits; only built when CONFIG_XEN is not set. */
++DECLARE_BITMAP(used_vectors, NR_VECTORS);
++EXPORT_SYMBOL_GPL(used_vectors);
++#endif
++
++/*
++ * While non-zero, do_nmi() skips default_do_nmi().  Incremented by
++ * stop_nmi() and decremented by restart_nmi().
++ */
++static int ignore_nmis;
++
++/*
++ * Set by the "unknown_nmi_panic" boot parameter; makes
++ * unknown_nmi_error() panic.
++ */
++int unknown_nmi_panic;
++/*
++ * Prevent NMI reason port (0x61) being accessed simultaneously, can
++ * only be used in NMI handler.
++ */
++static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
++
++/* Enable local interrupts if IF is set in the saved flags. */
++static inline void conditional_sti(struct pt_regs *regs)
++{
++      if (!(regs->flags & X86_EFLAGS_IF))
++              return;
++
++      local_irq_enable();
++}
++
++/*
++ * Raise the preempt count, then enable local interrupts if IF is set
++ * in the saved flags.  Paired with preempt_conditional_cli().
++ */
++static inline void preempt_conditional_sti(struct pt_regs *regs)
++{
++      inc_preempt_count();
++      conditional_sti(regs);
++}
++
++/* Disable local interrupts if IF is set in the saved flags. */
++static inline void conditional_cli(struct pt_regs *regs)
++{
++      if (!(regs->flags & X86_EFLAGS_IF))
++              return;
++
++      local_irq_disable();
++}
++
++/*
++ * Undo preempt_conditional_sti(): disable local interrupts if IF is
++ * set in the saved flags, then drop the preempt count.
++ */
++static inline void preempt_conditional_cli(struct pt_regs *regs)
++{
++      conditional_cli(regs);
++      dec_preempt_count();
++}
++
++/*
++ * Common back end for the simple trap handlers.
++ *
++ * @trapnr:     trap number, stored in thread.trap_no
++ * @signr:      signal to raise for user-mode traps
++ * @str:        description used in log messages and die()
++ * @regs:       register state at the time of the trap
++ * @error_code: stored in thread.error_code
++ * @info:       optional siginfo; if NULL a plain force_sig() is used
++ *
++ * User-mode traps get a signal.  Kernel-mode traps go through
++ * fixup_exception() and die() if no fixup exists.  On 32-bit, traps in
++ * vm86 mode with trapnr < 6 are first offered to handle_vm86_trap().
++ */
++static void __kprobes
++do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
++      long error_code, siginfo_t *info)
++{
++      struct task_struct *tsk = current;
++
++#ifdef CONFIG_X86_32
++      if (regs->flags & X86_VM_MASK) {
++              /*
++               * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
++               * On nmi (interrupt 2), do_trap should not be called.
++               */
++              if (trapnr < 6)
++                      goto vm86_trap;
++              goto trap_signal;
++      }
++#endif
++
++      if (!user_mode(regs))
++              goto kernel_trap;
++
++#ifdef CONFIG_X86_32
++trap_signal:
++#endif
++      /*
++       * We want error_code and trap_no set for userspace faults and
++       * kernelspace faults which result in die(), but not
++       * kernelspace faults which are fixed up.  die() gives the
++       * process no chance to handle the signal and notice the
++       * kernel fault information, so that won't result in polluting
++       * the information about previously queued, but not yet
++       * delivered, faults.  See also do_general_protection below.
++       */
++      tsk->thread.error_code = error_code;
++      tsk->thread.trap_no = trapnr;
++
++#ifdef CONFIG_X86_64
++      if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
++          printk_ratelimit()) {
++              printk(KERN_INFO
++                     "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
++                     tsk->comm, tsk->pid, str,
++                     regs->ip, regs->sp, error_code);
++              print_vma_addr(" in ", regs->ip);
++              printk("\n");
++      }
++#endif
++
++      if (info)
++              force_sig_info(signr, info, tsk);
++      else
++              force_sig(signr, tsk);
++      return;
++
++kernel_trap:
++      /* Only record the fault details when we are about to die(). */
++      if (!fixup_exception(regs)) {
++              tsk->thread.error_code = error_code;
++              tsk->thread.trap_no = trapnr;
++              die(str, regs, error_code);
++      }
++      return;
++
++#ifdef CONFIG_X86_32
++vm86_trap:
++      /* A non-zero result from the vm86 handler falls back to a signal. */
++      if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
++                                              error_code, trapnr))
++              goto trap_signal;
++      return;
++#endif
++}
++
++/*
++ * Generate a trap handler do_<name>() that runs the DIE_TRAP notifier
++ * chain, conditionally re-enables interrupts and calls do_trap()
++ * without siginfo.  The handler returns early if a notifier answers
++ * NOTIFY_STOP.
++ */
++#define DO_ERROR(trapnr, signr, str, name)                            \
++dotraplinkage void do_##name(struct pt_regs *regs, long error_code)   \
++{                                                                     \
++      if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)  \
++                                                      == NOTIFY_STOP) \
++              return;                                                 \
++      conditional_sti(regs);                                          \
++      do_trap(trapnr, signr, str, regs, error_code, NULL);            \
++}
++
++/*
++ * Like DO_ERROR, but the generated handler also fills in a siginfo_t
++ * (si_signo = signr, si_errno = 0, si_code = sicode, si_addr = siaddr)
++ * and passes it to do_trap().  'siaddr' is evaluated inside the
++ * handler, so it may refer to 'regs'.
++ */
++#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)               \
++dotraplinkage void do_##name(struct pt_regs *regs, long error_code)   \
++{                                                                     \
++      siginfo_t info;                                                 \
++      info.si_signo = signr;                                          \
++      info.si_errno = 0;                                              \
++      info.si_code = sicode;                                          \
++      info.si_addr = (void __user *)siaddr;                           \
++      if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)  \
++                                                      == NOTIFY_STOP) \
++              return;                                                 \
++      conditional_sti(regs);                                          \
++      do_trap(trapnr, signr, str, regs, error_code, &info);           \
++}
++
++/*
++ * Handlers generated from the macros above, as (trap number, signal,
++ * description, handler suffix[, si_code, si_addr]).  The stack segment
++ * handler is generated here only on 32-bit.
++ */
++DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
++DO_ERROR(4, SIGSEGV, "overflow", overflow)
++DO_ERROR(5, SIGSEGV, "bounds", bounds)
++DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
++DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
++DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
++DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
++#ifdef CONFIG_X86_32
++DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
++#endif
++DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
++
++#ifdef CONFIG_X86_64
++/*
++ * 64-bit stack segment fault handler (trap 12, SIGBUS).
++ * Runs on IST stack, hence the preempt-count bracketing around do_trap().
++ */
++dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
++{
++      int verdict = notify_die(DIE_TRAP, "stack segment", regs, error_code,
++                               12, SIGBUS);
++
++      if (verdict != NOTIFY_STOP) {
++              preempt_conditional_sti(regs);
++              do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
++              preempt_conditional_cli(regs);
++      }
++}
++
++/*
++ * Double fault handler (trap 8).  Records the fault in the current
++ * task and dies; it never returns.
++ */
++dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
++{
++      static const char str[] = "double fault";
++      struct task_struct *me = current;
++
++      /* The notifier verdict is ignored: a double fault cannot be skipped. */
++      notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
++
++      me->thread.error_code = error_code;
++      me->thread.trap_no = 8;
++
++      /*
++       * Always a kernel trap and never fixable, so keep dying rather
++       * than ever returning to the faulting context.
++       */
++      while (1)
++              die(str, regs, error_code);
++}
++#endif
++
++/*
++ * General protection fault handler (trap 13).
++ *
++ * User-mode faults raise SIGSEGV.  Kernel-mode faults try
++ * fixup_exception() first, then the DIE_GPF notifier chain, and
++ * finally die().  On 32-bit, faults in vm86 mode are handed to
++ * handle_vm86_fault().
++ */
++dotraplinkage void __kprobes
++do_general_protection(struct pt_regs *regs, long error_code)
++{
++      struct task_struct *tsk;
++
++      conditional_sti(regs);
++
++#ifdef CONFIG_X86_32
++      if (regs->flags & X86_VM_MASK)
++              goto gp_in_vm86;
++#endif
++
++      tsk = current;
++      if (!user_mode(regs))
++              goto gp_in_kernel;
++
++      tsk->thread.error_code = error_code;
++      tsk->thread.trap_no = 13;
++
++      if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
++                      printk_ratelimit()) {
++              printk(KERN_INFO
++                      "%s[%d] general protection ip:%lx sp:%lx error:%lx",
++                      tsk->comm, task_pid_nr(tsk),
++                      regs->ip, regs->sp, error_code);
++              print_vma_addr(" in ", regs->ip);
++              printk("\n");
++      }
++
++      force_sig(SIGSEGV, tsk);
++      return;
++
++#ifdef CONFIG_X86_32
++gp_in_vm86:
++      local_irq_enable();
++      handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
++      return;
++#endif
++
++gp_in_kernel:
++      if (fixup_exception(regs))
++              return;
++
++      /* No fixup: record the fault details before notifying and dying. */
++      tsk->thread.error_code = error_code;
++      tsk->thread.trap_no = 13;
++      if (notify_die(DIE_GPF, "general protection fault", regs,
++                              error_code, 13, SIGSEGV) == NOTIFY_STOP)
++              return;
++      die("general protection fault", regs, error_code);
++}
++
++/*
++ * Handler for the "unknown_nmi_panic" boot parameter: sets
++ * unknown_nmi_panic.  The parameter value 'str' is not looked at.
++ */
++static int __init setup_unknown_nmi_panic(char *str)
++{
++      unknown_nmi_panic = 1;
++      return 1;
++}
++__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
++
++/*
++ * Handle an NMI whose reason byte reports a PCI system error (SERR).
++ * With an EDAC handler installed the error is handed to EDAC.
++ * Otherwise the kernel panics if panic_on_unrecovered_nmi is set, or
++ * clears the SERR line and carries on.
++ */
++static notrace __kprobes void
++pci_serr_error(unsigned char reason, struct pt_regs *regs)
++{
++      pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
++               reason, smp_processor_id());
++
++#ifdef CONFIG_EDAC
++      /*
++       * Some machines report memory errors through the PCI SERR
++       * line, and EDAC makes use of that.
++       */
++      if (edac_handler_set()) {
++              edac_atomic_assert_error();
++              return;
++      }
++#endif
++
++      if (panic_on_unrecovered_nmi)
++              panic("NMI: Not continuing");
++
++      pr_emerg("Dazed and confused, but trying to continue\n");
++
++      /* Clear and disable the PCI SERR error line. */
++      clear_serr_error(reason);
++}
++
++/*
++ * Handle an NMI whose reason byte reports an I/O check (IOCK) error:
++ * log it, dump registers, panic if panic_on_io_nmi is set, otherwise
++ * re-arm the IOCK line.
++ */
++static notrace __kprobes void
++io_check_error(unsigned char reason, struct pt_regs *regs)
++{
++      pr_emerg("NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
++               reason, smp_processor_id());
++      show_registers(regs);
++
++      if (panic_on_io_nmi)
++              panic("NMI IOCK error: Not continuing");
++
++      /* Re-enable the IOCK line, wait for a few seconds */
++      clear_io_check_error(reason);
++}
++
++/*
++ * Handle an NMI that no known source claims.  The DIE_NMIUNKNOWN
++ * notifiers, and on MCA machines the MCA handler, get the first chance.
++ * Otherwise the event is logged and the kernel panics if either
++ * unknown_nmi_panic or panic_on_unrecovered_nmi is set.
++ */
++static notrace __kprobes void
++unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
++{
++      int verdict = notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT);
++
++      if (verdict == NOTIFY_STOP)
++              return;
++
++#ifdef CONFIG_MCA
++      /* MCA may be able to work out which party is guilty. */
++      if (MCA_bus) {
++              mca_handle_nmi();
++              return;
++      }
++#endif
++
++      pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
++               reason, smp_processor_id());
++      pr_emerg("Do you have a strange power saving mode enabled?\n");
++
++      if (unknown_nmi_panic || panic_on_unrecovered_nmi)
++              panic("NMI: Not continuing");
++
++      pr_emerg("Dazed and confused, but trying to continue\n");
++}
++
++static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
++{
++      unsigned char reason = 0;
++
++      /*
++       * CPU-specific NMI must be processed before non-CPU-specific
++       * NMI, otherwise we may lose it, because the CPU-specific
++       * NMI can not be detected/processed on other CPUs.
++       */
++      if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
++              return;
++
++      /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
++      raw_spin_lock(&nmi_reason_lock);
++      reason = get_nmi_reason();
++
++      if (reason & NMI_REASON_MASK) {
++              if (reason & NMI_REASON_SERR)
++                      pci_serr_error(reason, regs);
++              else if (reason & NMI_REASON_IOCHK)
++                      io_check_error(reason, regs);
++#ifdef CONFIG_X86_32
++              /*
++               * Reassert NMI in case it became active
++               * meanwhile as it's edge-triggered:
++               */
++              reassert_nmi();
++#endif
++              raw_spin_unlock(&nmi_reason_lock);
++              return;
++      }
++      raw_spin_unlock(&nmi_reason_lock);
++
++      unknown_nmi_error(reason, regs);
++}
++
++/*
++ * NMI entry point: count the NMI and, unless NMIs are currently being
++ * ignored (see stop_nmi()), dispatch it through default_do_nmi().
++ */
++dotraplinkage notrace __kprobes void
++do_nmi(struct pt_regs *regs, long error_code)
++{
++      nmi_enter();
++      inc_irq_stat(__nmi_count);
++
++      if (ignore_nmis == 0)
++              default_do_nmi(regs);
++
++      nmi_exit();
++}
++
++/* Make do_nmi() ignore NMIs until a matching restart_nmi(). */
++void stop_nmi(void)
++{
++      ignore_nmis += 1;
++}
++
++/* Undo one stop_nmi() call. */
++void restart_nmi(void)
++{
++      ignore_nmis -= 1;
++}
++
++/*
++ * Breakpoint (int3, trap 3) handler.  The low-level kgdb hook and the
++ * die notifier chain get a chance to consume the trap before SIGTRAP
++ * handling runs through do_trap().
++ *
++ * May run on IST stack.
++ */
++dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
++{
++      int verdict;
++
++#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
++      verdict = kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP);
++      if (verdict == NOTIFY_STOP)
++              return;
++#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
++
++      /* With kprobes the event is DIE_INT3, otherwise plain DIE_TRAP. */
++#ifdef CONFIG_KPROBES
++      verdict = notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP);
++#else
++      verdict = notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP);
++#endif
++      if (verdict == NOTIFY_STOP)
++              return;
++
++      preempt_conditional_sti(regs);
++      do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
++      preempt_conditional_cli(regs);
++}
++
++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
++/*
++ * Help handler running on IST stack to switch back to user stack
++ * for scheduling or signal handling. The actual stack switch is done in
++ * entry.S
++ *
++ * Returns the pt_regs location to continue on; when that differs from
++ * 'eregs' the register frame is copied there first.
++ */
++asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
++{
++      struct pt_regs *regs = eregs;
++      /* Did already sync */
++      if (eregs == (struct pt_regs *)eregs->sp)
++              ;
++      /* Exception from user space */
++      else if (user_mode(eregs))
++              regs = task_pt_regs(current);
++      /*
++       * Exception from kernel and interrupts are enabled. Move to
++       * kernel process stack.
++       */
++      else if (eregs->flags & X86_EFLAGS_IF)
++              /* Note: this also lowers eregs->sp by sizeof(struct pt_regs). */
++              regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
++      if (eregs != regs)
++              *regs = *eregs;
++      return regs;
++}
++#endif
++
++/*
++ * Our handling of the processor debug registers is non-trivial.
++ * We do not clear them on entry and exit from the kernel. Therefore
++ * it is possible to get a watchpoint trap here from inside the kernel.
++ * However, the code in ./ptrace.c has ensured that the user can
++ * only set watchpoints on userspace addresses. Therefore the in-kernel
++ * watchpoint trap can only occur in code which is reading/writing
++ * from user space. Such code must not hold kernel locks (since it
++ * can equally take a page fault), therefore it is safe to call
++ * force_sig_info even though that claims and releases locks.
++ *
++ * Code in ./signal.c ensures that the debug control register
++ * is restored before we deliver any signal, and therefore that
++ * user code runs with the correct debug control register even though
++ * we clear it here.
++ *
++ * Being careful here means that we don't have to be as careful in a
++ * lot of more complicated places (task switching can be a bit lazy
++ * about restoring all the debug state, and ptrace doesn't have to
++ * find every occurrence of the TF bit that could be saved away even
++ * by user code)
++ *
++ * In outline: read and clear DR6, save the filtered value in
++ * thread.debugreg6, run the DIE_DEBUG notifiers, then either forward
++ * the trap to vm86 or send SIGTRAP when a step/trap bit is set or the
++ * trap looks like a user-mode icebp.
++ *
++ * May run on IST stack.
++ */
++dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
++{
++      struct task_struct *tsk = current;
++      int user_icebp = 0;
++      unsigned long dr6;
++      int si_code;
++
++      get_debugreg(dr6, 6);
++
++      /* Filter out all the reserved bits which are preset to 1 */
++      dr6 &= ~DR6_RESERVED;
++
++      /*
++       * If dr6 has no reason to give us about the origin of this trap,
++       * then it's very likely the result of an icebp/int01 trap.
++       * User wants a sigtrap for that.
++       */
++      if (!dr6 && user_mode(regs))
++              user_icebp = 1;
++
++      /* Catch kmemcheck conditions first of all! */
++      if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
++              return;
++
++      /* DR6 may or may not be cleared by the CPU */
++      set_debugreg(0, 6);
++
++      /*
++       * The processor cleared BTF, so don't mark that we need it set.
++       */
++      clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
++
++      /* Store the virtualized DR6 value */
++      tsk->thread.debugreg6 = dr6;
++
++      if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
++                                                      SIGTRAP) == NOTIFY_STOP)
++              return;
++
++      /* It's safe to allow irq's after DR6 has been saved */
++      preempt_conditional_sti(regs);
++
++      if (regs->flags & X86_VM_MASK) {
++              handle_vm86_trap((struct kernel_vm86_regs *) regs,
++                              error_code, 1);
++              preempt_conditional_cli(regs);
++              return;
++      }
++
++      /*
++       * Single-stepping through system calls: ignore any exceptions in
++       * kernel space, but re-enable TF when returning to user mode.
++       *
++       * We already checked v86 mode above, so we can check for kernel mode
++       * by just checking the CPL of CS.
++       */
++      if ((dr6 & DR_STEP) && !user_mode(regs)) {
++              tsk->thread.debugreg6 &= ~DR_STEP;
++              set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
++              regs->flags &= ~X86_EFLAGS_TF;
++      }
++      si_code = get_si_code(tsk->thread.debugreg6);
++      if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
++              send_sigtrap(tsk, regs, error_code, si_code);
++      preempt_conditional_cli(regs);
++
++      return;
++}
++
++/*
++ * Note that we play around with the 'TS' bit in an attempt to get
++ * the correct behaviour even in the presence of the asynchronous
++ * IRQ13 behaviour
++ *
++ * Common handler for x87 (trapnr 16) and SIMD (any other trapnr)
++ * floating point exceptions.  Kernel-mode faults are fixed up or die.
++ * User-mode faults are decoded from the saved FPU status into an FPE_*
++ * si_code and delivered as SIGFPE.  No signal is sent when no unmasked
++ * exception bit is found.
++ */
++void math_error(struct pt_regs *regs, int error_code, int trapnr)
++{
++      struct task_struct *task = current;
++      siginfo_t info;
++      unsigned short err;
++      char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
++
++      if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
++              return;
++      conditional_sti(regs);
++
++      if (!user_mode_vm(regs))
++      {
++              if (!fixup_exception(regs)) {
++                      task->thread.error_code = error_code;
++                      task->thread.trap_no = trapnr;
++                      die(str, regs, error_code);
++              }
++              return;
++      }
++
++      /*
++       * Save the info for the exception handler and clear the error.
++       */
++      save_init_fpu(task);
++      task->thread.trap_no = trapnr;
++      task->thread.error_code = error_code;
++      info.si_signo = SIGFPE;
++      info.si_errno = 0;
++      info.si_addr = (void __user *)regs->ip;
++      if (trapnr == 16) {
++              unsigned short cwd, swd;
++              /*
++               * (~cwd & swd) will mask out exceptions that are not set to unmasked
++               * status.  0x3f is the exception bits in these regs, 0x200 is the
++               * C1 reg you need in case of a stack fault, 0x040 is the stack
++               * fault bit.  We should only be taking one exception at a time,
++               * so if this combination doesn't produce any single exception,
++               * then we have a bad program that isn't synchronizing its FPU usage
++               * and it will suffer the consequences since we won't be able to
++               * fully reproduce the context of the exception
++               */
++              cwd = get_fpu_cwd(task);
++              swd = get_fpu_swd(task);
++
++              err = swd & ~cwd;
++      } else {
++              /*
++               * The SIMD FPU exceptions are handled a little differently, as there
++               * is only a single status/control register.  Thus, to determine which
++               * unmasked exception was caught we must mask the exception mask bits
++               * at 0x1f80, and then use these to mask the exception bits at 0x3f.
++               */
++              unsigned short mxcsr = get_fpu_mxcsr(task);
++              err = ~(mxcsr >> 7) & mxcsr;
++      }
++
++      /* Map the first matching exception bit to its si_code. */
++      if (err & 0x001) {      /* Invalid op */
++              /*
++               * swd & 0x240 == 0x040: Stack Underflow
++               * swd & 0x240 == 0x240: Stack Overflow
++               * User must clear the SF bit (0x40) if set
++               */
++              info.si_code = FPE_FLTINV;
++      } else if (err & 0x004) { /* Divide by Zero */
++              info.si_code = FPE_FLTDIV;
++      } else if (err & 0x008) { /* Overflow */
++              info.si_code = FPE_FLTOVF;
++      } else if (err & 0x012) { /* Denormal, Underflow */
++              info.si_code = FPE_FLTUND;
++      } else if (err & 0x020) { /* Precision */
++              info.si_code = FPE_FLTRES;
++      } else {
++              /*
++               * If we're using IRQ 13, or supposedly even some trap 16
++               * implementations, it's possible we get a spurious trap...
++               */
++              return;         /* Spurious trap, no error */
++      }
++      force_sig_info(SIGFPE, &info, task);
++}
++
++/*
++ * x87 floating point error (trap 16): delegates to math_error().  On
++ * 32-bit it also sets ignore_fpu_irq.
++ */
++dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
++{
++#ifdef CONFIG_X86_32
++      ignore_fpu_irq = 1;
++#endif
++
++      math_error(regs, error_code, 16);
++}
++/*
++ * SIMD coprocessor error handler: forwards regs and error_code to
++ * math_error() with trap number 19.
++ */
++dotraplinkage void
++do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
++{
++      math_error(regs, error_code, 19);
++}
++/*
++ * Everything down to the matching #endif is built only when CONFIG_XEN is
++ * not set.  do_spurious_interrupt_bug() just calls conditional_sti(); its
++ * warning printk is compiled out with #if 0.  smp_thermal_interrupt() and
++ * smp_threshold_interrupt() are empty weak definitions.
++ */
++#ifndef CONFIG_XEN
++dotraplinkage void
++do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
++{
++      conditional_sti(regs);
++#if 0
++      /* No need to warn about this any longer. */
++      printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
++#endif
++}
++
++asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
++{
++}
++
++asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
++{
++}
++#endif /* CONFIG_XEN */
++
++/*
++ * __math_state_restore assumes that cr0.TS is already clear and the
++ * fpu state is all ready for use.  Used during context switch.
++ *
++ * Restores the current task's FPU state with restore_fpu_checking().  On
++ * failure it calls stts(), sends SIGSEGV to the task and returns early;
++ * on success it sets TS_USEDFPU in thread->status and increments
++ * tsk->fpu_counter.
++ */
++void __math_state_restore(void)
++{
++      struct thread_info *thread = current_thread_info();
++      struct task_struct *tsk = thread->task;
++
++      /*
++       * Paranoid restore. send a SIGSEGV if we fail to restore the state.
++       */
++      if (unlikely(restore_fpu_checking(tsk))) {
++              stts();
++              force_sig(SIGSEGV, tsk);
++              return;
++      }
++
++      thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
++      tsk->fpu_counter++;
++}
++
++/*
++ * 'math_state_restore()' saves the current math information in the
++ * old math state array, and gets the new ones from the current task
++ *
++ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
++ * Don't touch unless you *really* know how it works.
++ *
++ * Must be called with kernel preemption disabled (in this case,
++ * local interrupts are disabled at the call-site in entry.S).
++ *
++ * If the current task has not used math yet (tsk_used_math() is false),
++ * interrupts are enabled around init_fpu().  If init_fpu() fails,
++ * do_group_exit(SIGKILL) is called and the function returns without
++ * reaching the local_irq_disable() below.  In all other cases the state
++ * is loaded through __math_state_restore().
++ */
++asmlinkage void math_state_restore(void)
++{
++      struct thread_info *thread = current_thread_info();
++      struct task_struct *tsk = thread->task;
++
++      if (!tsk_used_math(tsk)) {
++              local_irq_enable();
++              /*
++               * does a slab alloc which can sleep
++               */
++              if (init_fpu(tsk)) {
++                      /*
++                       * ran out of memory!
++                       */
++                      do_group_exit(SIGKILL);
++                      return;
++              }
++              local_irq_disable();
++      }
++
++      /* NB. 'clts' is done for us by Xen during virtual trap. */
++      __math_state_restore();
++}
++EXPORT_SYMBOL_GPL(math_state_restore);
++/*
++ * Device-not-available handler.  With CONFIG_MATH_EMULATION, if CR0.EM is
++ * set, regs are handed to math_emulate() through a math_emu_info (after
++ * conditional_sti()) and the handler returns.  Otherwise the FPU state is
++ * restored with math_state_restore(); 32-bit builds then call
++ * conditional_sti().
++ */
++dotraplinkage void __kprobes
++do_device_not_available(struct pt_regs *regs, long error_code)
++{
++#ifdef CONFIG_MATH_EMULATION
++      if (read_cr0() & X86_CR0_EM) {
++              struct math_emu_info info = { };
++
++              conditional_sti(regs);
++
++              info.regs = regs;
++              math_emulate(&info);
++              return;
++      }
++#endif
++      math_state_restore(); /* interrupts still off */
++#ifdef CONFIG_X86_32
++      conditional_sti(regs);
++#endif
++}
++/*
++ * 32-bit only.  Enables interrupts, fills in a SIGILL/ILL_BADSTK siginfo
++ * and, unless notify_die() returns NOTIFY_STOP, reports the
++ * "iret exception" through do_trap() as trap number 32.
++ */
++#ifdef CONFIG_X86_32
++dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
++{
++      siginfo_t info;
++      local_irq_enable();
++
++      info.si_signo = SIGILL;
++      info.si_errno = 0;
++      info.si_code = ILL_BADSTK;
++      info.si_addr = NULL;
++      if (notify_die(DIE_TRAP, "iret exception",
++                      regs, error_code, 32, SIGILL) == NOTIFY_STOP)
++              return;
++      do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
++}
++#endif
++
++/*
++ * NB. All these are "trap gates" (i.e. events_mask isn't set) except
++ * for those that specify <dpl>|4 in the second field.
++ *
++ * X is the flag value OR-ed into most trap_table entries: 0 on 32-bit
++ * builds, 4 on 64-bit builds.  early_trap_table holds only the debug,
++ * int3 and page_fault entries and is installed by early_trap_init();
++ * trap_table is the full set used by trap_init() and smp_trap_init().
++ * Both tables end with an all-zero entry.
++ */
++#ifdef CONFIG_X86_32
++#define X 0
++#else
++#define X 4
++#endif
++static const trap_info_t __initconst early_trap_table[] = {
++      {  1, 0|4, __KERNEL_CS, (unsigned long)debug                    },
++      {  3, 3|4, __KERNEL_CS, (unsigned long)int3                     },
++      { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault               },
++      {  0, 0,           0, 0                                         }
++};
++static const trap_info_t __cpuinitconst trap_table[] = {
++      {  0, 0|X, __KERNEL_CS, (unsigned long)divide_error             },
++      {  1, 0|4, __KERNEL_CS, (unsigned long)debug                    },
++      {  3, 3|4, __KERNEL_CS, (unsigned long)int3                     },
++      {  4, 3|X, __KERNEL_CS, (unsigned long)overflow                 },
++      {  5, 0|X, __KERNEL_CS, (unsigned long)bounds                   },
++      {  6, 0|X, __KERNEL_CS, (unsigned long)invalid_op               },
++      {  7, 0|4, __KERNEL_CS, (unsigned long)device_not_available     },
++      {  9, 0|X, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
++      { 10, 0|X, __KERNEL_CS, (unsigned long)invalid_TSS              },
++      { 11, 0|X, __KERNEL_CS, (unsigned long)segment_not_present      },
++      { 12, 0|X, __KERNEL_CS, (unsigned long)stack_segment            },
++      { 13, 0|X, __KERNEL_CS, (unsigned long)general_protection       },
++      { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault               },
++      { 16, 0|X, __KERNEL_CS, (unsigned long)coprocessor_error        },
++      { 17, 0|X, __KERNEL_CS, (unsigned long)alignment_check          },
++#ifdef CONFIG_X86_MCE
++      { 18, 0|X, __KERNEL_CS, (unsigned long)machine_check            },
++#endif
++      { 19, 0|X, __KERNEL_CS, (unsigned long)simd_coprocessor_error   },
++#ifdef CONFIG_X86_32
++      { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment          },
++      { SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)system_call   },
++#elif defined(CONFIG_IA32_EMULATION)
++      { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall },
++#endif
++      {  0, 0,           0, 0                                         } /* end marker: smp_trap_init() stops at address == 0 */
++};
++
++/*
++ * Set of traps needed for early debugging.
++ *
++ * Registers early_trap_table through HYPERVISOR_set_trap_table(); a
++ * non-zero return value is only logged.
++ */
++void __init early_trap_init(void)
++{
++      int ret;
++
++      ret = HYPERVISOR_set_trap_table(early_trap_table);
++      if (ret)
++              printk("early set_trap_table failed (%d)\n", ret);
++}
++/*
++ * Registers trap_table through HYPERVISOR_set_trap_table() (a non-zero
++ * return value is only logged), then runs cpu_init() and the
++ * x86_init.irqs.trap_init hook.
++ */
++void __init trap_init(void)
++{
++      int ret;
++
++      ret = HYPERVISOR_set_trap_table(trap_table);
++      if (ret)
++              printk("HYPERVISOR_set_trap_table failed (%d)\n", ret);
++
++      /*
++       * Should be a barrier for any external CPU state:
++       */
++      cpu_init();
++
++      x86_init.irqs.trap_init();
++}
++/*
++ * Copies flags, cs and address of every trap_table entry into
++ * trap_ctxt[], indexed by the entry's vector, stopping at the first entry
++ * whose address is 0.  The NMI_VECTOR slot is then filled in separately:
++ * TI_SET_IF(..., 1), cs = __KERNEL_CS and address = nmi.
++ *
++ * NOTE(review): trap_ctxt must have room for every vector listed in
++ * trap_table and for NMI_VECTOR; nothing is bounds-checked here.
++ */
++void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
++{
++      const trap_info_t *t = trap_table;
++
++      for (t = trap_table; t->address; t++) {
++              trap_ctxt[t->vector].flags = t->flags;
++              trap_ctxt[t->vector].cs = t->cs;
++              trap_ctxt[t->vector].address = t->address;
++      }
++      TI_SET_IF(trap_ctxt + NMI_VECTOR, 1);
++      trap_ctxt[NMI_VECTOR].cs = __KERNEL_CS;
++      trap_ctxt[NMI_VECTOR].address = (unsigned long)nmi;
++}
diff --cc arch/x86/kernel/vm86_32.c

index 863f875,863f875..14a1897
--- 1/arch/x86/kernel/vm86_32.c
--- 2/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@@ -125,7 -125,7 +125,9 @@@ static int copy_vm86_regs_from_user(str
   
   struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
   {
++#ifndef CONFIG_X86_NO_TSS
         struct tss_struct *tss;
++#endif
         struct pt_regs *ret;
         unsigned long tmp;
   
@@@ -148,12 -148,12 +150,16 @@@
                 do_exit(SIGSEGV);
         }
   
++#ifndef CONFIG_X86_NO_TSS
         tss = &per_cpu(init_tss, get_cpu());
++#endif
         current->thread.sp0 = current->thread.saved_sp0;
         current->thread.sysenter_cs = __KERNEL_CS;
         load_sp0(tss, &current->thread);
         current->thread.saved_sp0 = 0;
++#ifndef CONFIG_X86_NO_TSS
         put_cpu();
++#endif
   
         ret = KVM86->regs32;
   
@@@ -280,7 -280,7 +286,9 @@@ out
   
   static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
   {
++#ifndef CONFIG_X86_NO_TSS
         struct tss_struct *tss;
++#endif
   /*
    * make sure the vm86() system call doesn't try to do anything silly
    */
@@@ -324,12 -324,12 +332,16 @@@
         tsk->thread.saved_fs = info->regs32->fs;
         tsk->thread.saved_gs = get_user_gs(info->regs32);
   
++#ifndef CONFIG_X86_NO_TSS
         tss = &per_cpu(init_tss, get_cpu());
++#endif
         tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
         if (cpu_has_sep)
                 tsk->thread.sysenter_cs = 0;
         load_sp0(tss, &tsk->thread);
++#ifndef CONFIG_X86_NO_TSS
         put_cpu();
++#endif
   
         tsk->thread.screen_bitmap = info->screen_bitmap;
         if (info->flags & VM86_SCREEN_BITMAP)
diff --cc arch/x86/kernel/vmlinux.lds.S

index 3a3bff8,624a201..c9b1887
--- 1/arch/x86/kernel/vmlinux.lds.S
--- 2/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@@ -16,8 -16,8 +16,10 @@@
   
   #ifdef CONFIG_X86_32
   #define LOAD_OFFSET __PAGE_OFFSET
--#else
++#elif !defined(CONFIG_XEN) || CONFIG_XEN_COMPAT > 0x030002
   #define LOAD_OFFSET __START_KERNEL_map
++#else
++#define LOAD_OFFSET 0
   #endif
   
   #include <asm-generic/vmlinux.lds.h>
@@@ -41,7 -41,7 +43,7 @@@ ENTRY(phys_startup_64
   jiffies_64 = jiffies;
   #endif
   
--#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
++#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && !defined(CONFIG_XEN)
   /*
    * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
    * we retain large page mappings for boundaries spanning kernel text, rodata
@@@ -84,6 -84,6 +86,10 @@@ SECTION
   {
   #ifdef CONFIG_X86_32
           . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
++#if defined(CONFIG_XEN) && CONFIG_XEN_COMPAT <= 0x030002
++#undef LOAD_OFFSET
++#define LOAD_OFFSET 0
++#endif
           phys_startup_32 = startup_32 - LOAD_OFFSET;
   #else
           . = __START_KERNEL;
diff --cc arch/x86/kernel/vsyscall_64-xen.c

index 0000000,0000000..ad5a2c8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_64-xen.c
@@@ -1,0 -1,0 +1,321 @@@
++/*
++ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
++ *  Copyright 2003 Andi Kleen, SuSE Labs.
++ *
++ *  Thanks to hpa@transmeta.com for some useful hint.
++ *  Special thanks to Ingo Molnar for his early experience with
++ *  a different vsyscall implementation for Linux/IA32 and for the name.
++ *
++ *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
++ *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
++ *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
++ *  jumping out of line if necessary. We cannot add more with this
++ *  mechanism because older kernels won't return -ENOSYS.
++ *  If we want more than four we need a vDSO.
++ *
++ *  Note: the concept clashes with user mode linux. If you use UML and
++ *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
++ */
++
++/* Disable profiling for userspace code: */
++#define DISABLE_BRANCH_PROFILING
++
++#include <linux/time.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/timer.h>
++#include <linux/seqlock.h>
++#include <linux/jiffies.h>
++#include <linux/sysctl.h>
++#include <linux/clocksource.h>
++#include <linux/getcpu.h>
++#include <linux/cpu.h>
++#include <linux/smp.h>
++#include <linux/notifier.h>
++
++#include <asm/vsyscall.h>
++#include <asm/pgtable.h>
++#include <asm/page.h>
++#include <asm/unistd.h>
++#include <asm/fixmap.h>
++#include <asm/errno.h>
++#include <asm/io.h>
++#include <asm/segment.h>
++#include <asm/desc.h>
++#include <asm/topology.h>
++#include <asm/vgtod.h>
++/*
++ * __vsyscall(nr) marks a function unused/notrace and places it in the
++ * ".vsyscall_<nr>" section.  __syscall_clobber is the clobber list used
++ * by the inline "syscall" instructions in this file.
++ */
++#define __vsyscall(nr) \
++              __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
++#define __syscall_clobber "r11","cx","memory"
++
++/*
++ * vsyscall_gtod_data contains data that is :
++ * - readonly from vsyscalls
++ * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
++ * Try to keep this structure as small as possible to avoid cache line ping pongs
++ *
++ * __vgetcpu_mode is what vgetcpu() compares against VGETCPU_RDTSCP to
++ * pick between RDTSCP and the segment-limit lookup.  The gtod data starts
++ * with its seqlock unlocked and sysctl_enabled set to 1.
++ */
++int __vgetcpu_mode __section_vgetcpu_mode;
++
++struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
++{
++      .lock = SEQLOCK_UNLOCKED,
++      .sysctl_enabled = 1,
++};
++/*
++ * Copies sys_tz into vsyscall_gtod_data while holding the write side of
++ * its seqlock (taken with write_seqlock_irqsave()).
++ */
++void update_vsyscall_tz(void)
++{
++      unsigned long flags;
++
++      write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
++      /* sys_tz has changed */
++      vsyscall_gtod_data.sys_tz = sys_tz;
++      write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
++}
++/*
++ * Publishes new timekeeping values to vsyscall_gtod_data under the write
++ * side of its seqlock: the clocksource's vread/cycle_last/mask/shift, the
++ * caller-supplied mult, the wall time, *wtm as wall_to_monotonic, and
++ * __current_kernel_time() as the coarse wall time.
++ */
++void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
++                      struct clocksource *clock, u32 mult)
++{
++      unsigned long flags;
++
++      write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
++      /* copy vsyscall data */
++      vsyscall_gtod_data.clock.vread = clock->vread;
++      vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
++      vsyscall_gtod_data.clock.mask = clock->mask;
++      vsyscall_gtod_data.clock.mult = mult;
++      vsyscall_gtod_data.clock.shift = clock->shift;
++      vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
++      vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
++      vsyscall_gtod_data.wall_to_monotonic = *wtm;
++      vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
++      write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
++}
++
++/* RED-PEN may want to readd seq locking, but then the variable should be
++ * write-once.
++ *
++ * Copies sys_tz out of __vsyscall_gtod_data into *tz without taking the
++ * seqlock.
++ */
++static __always_inline void do_get_tz(struct timezone * tz)
++{
++      *tz = __vsyscall_gtod_data.sys_tz;
++}
++/*
++ * Issues the gettimeofday system call with the "syscall" instruction:
++ * __NR_gettimeofday goes in the "a" register, tv in "D" and tz in "S".
++ * Returns the value left in the "a" register.
++ */
++static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
++{
++      int ret;
++      asm volatile("syscall"
++              : "=a" (ret)
++              : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
++              : __syscall_clobber );
++      return ret;
++}
++/*
++ * Issues the time system call with the "syscall" instruction: __NR_time
++ * goes in the "a" register and t in "D".  Returns the value left in the
++ * "a" register.
++ */
++static __always_inline long time_syscall(long *t)
++{
++      long secs;
++      asm volatile("syscall"
++              : "=a" (secs)
++              : "0" (__NR_time),"D" (t) : __syscall_clobber);
++      return secs;
++}
++/*
++ * Fills *tv from __vsyscall_gtod_data.  Inside the seqlock read loop it
++ * samples the clock through the vread hook together with cycle_last,
++ * mask, mult, shift and the wall time, retrying when read_seqretry()
++ * reports a concurrent update.  If sysctl_enabled is 0 or no vread hook
++ * is set, it issues the real gettimeofday system call and returns.
++ * After the loop the masked cycle delta is scaled by mult >> shift, added
++ * to the wall-time nanoseconds, carried into tv_sec, and the remainder is
++ * stored in tv_usec as microseconds.
++ */
++static __always_inline void do_vgettimeofday(struct timeval * tv)
++{
++      cycle_t now, base, mask, cycle_delta;
++      unsigned seq;
++      unsigned long mult, shift, nsec;
++      cycle_t (*vread)(void);
++      do {
++              seq = read_seqbegin(&__vsyscall_gtod_data.lock);
++
++              vread = __vsyscall_gtod_data.clock.vread;
++              if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
++                      gettimeofday(tv,NULL);
++                      return;
++              }
++
++              now = vread();
++              base = __vsyscall_gtod_data.clock.cycle_last;
++              mask = __vsyscall_gtod_data.clock.mask;
++              mult = __vsyscall_gtod_data.clock.mult;
++              shift = __vsyscall_gtod_data.clock.shift;
++
++              tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
++              nsec = __vsyscall_gtod_data.wall_time_nsec;
++      } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
++
++      /* calculate interval: */
++      cycle_delta = (now - base) & mask;
++      /* convert to nsecs: */
++      nsec += (cycle_delta * mult) >> shift;
++
++      while (nsec >= NSEC_PER_SEC) {
++              tv->tv_sec += 1;
++              nsec -= NSEC_PER_SEC;
++      }
++      tv->tv_usec = nsec / NSEC_PER_USEC;
++}
++/*
++ * vsyscall slot 0.  Fills *tv and/or *tz when the respective pointer is
++ * non-NULL; always returns 0.
++ */
++int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
++{
++      if (tv)
++              do_vgettimeofday(tv);
++      if (tz)
++              do_get_tz(tz);
++      return 0;
++}
++
++/* This will break when the xtime seconds get inaccurate, but that is
++ * unlikely
++ *
++ * vsyscall slot 1.  Returns wall_time_sec read under the gtod seqlock and
++ * also stores it in *t when t is non-NULL.  When sysctl_enabled is 0 the
++ * real time system call is used instead.
++ */
++time_t __vsyscall(1) vtime(time_t *t)
++{
++      unsigned seq;
++      time_t result;
++      if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
++              return time_syscall(t);
++
++      do {
++              seq = read_seqbegin(&__vsyscall_gtod_data.lock);
++
++              result = __vsyscall_gtod_data.wall_time_sec;
++
++      } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
++
++      if (t)
++              *t = result;
++      return result;
++}
++
++/* Fast way to get current CPU and node.
++   This helps to do per node and per CPU caches in user space.
++   The result is not guaranteed without CPU affinity, but usually
++   works out because the scheduler tries to keep a thread on the same
++   CPU.
++
++   tcache must point to a two element sized long array.
++   All arguments can be NULL.
++
++   vsyscall slot 2.  The CPU number is the low 12 bits of the per-CPU
++   word and the node is the remaining upper bits.  When tcache is given,
++   blob[0] holds the jiffies value of the last lookup and blob[1] the
++   cached word.  Always returns 0. */
++long __vsyscall(2)
++vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
++{
++      unsigned int p;
++      unsigned long j = 0;
++
++      /* Fast cache - only recompute value once per jiffies and avoid
++         relatively costly rdtscp/cpuid otherwise.
++         This works because the scheduler usually keeps the process
++         on the same CPU and this syscall doesn't guarantee its
++         results anyways.
++         We do this here because otherwise user space would do it on
++         its own in a likely inferior way (no access to jiffies).
++         If you don't like it pass NULL. */
++      if (tcache && tcache->blob[0] == (j = __jiffies)) {
++              p = tcache->blob[1];
++      } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
++              /* Load per CPU data from RDTSCP */
++              native_read_tscp(&p);
++      } else {
++              /* Load per CPU data from GDT */
++              asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
++      }
++      if (tcache) {
++              tcache->blob[0] = j;
++              tcache->blob[1] = p;
++      }
++      if (cpu)
++              *cpu = p & 0xfff;
++      if (node)
++              *node = p >> 12;
++      return 0;
++}
++/*
++ * vsyscall slot 3: placeholder that always returns -ENOSYS.
++ */
++static long __vsyscall(3) venosys_1(void)
++{
++      return -ENOSYS;
++}
++/*
++ * sysctl tables exposing vsyscall_gtod_data.sysctl_enabled as the integer
++ * entry "vsyscall64" under the "kernel" directory (mode 0644, handled by
++ * proc_dointvec).  Registered from vsyscall_init().
++ */
++#ifdef CONFIG_SYSCTL
++static ctl_table kernel_table2[] = {
++      { .procname = "vsyscall64",
++        .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
++        .mode = 0644,
++        .proc_handler = proc_dointvec },
++      {}
++};
++
++static ctl_table kernel_root_table2[] = {
++      { .procname = "kernel", .mode = 0555,
++        .child = kernel_table2 },
++      {}
++};
++#endif
++
++/* Assume __initcall executes before all user space. Hopefully kmod
++   doesn't violate that. We'll find out if it does.
++
++   vsyscall_set_cpu() publishes the cpu/node pair that vgetcpu() reads:
++   as (node << 12) | cpu through write_rdtscp_aux() when the CPU has
++   RDTSCP, and always in the GDT_ENTRY_PER_CPU descriptor written into
++   that CPU's GDT.  node stays 0 unless CONFIG_NUMA is set. */
++static void __cpuinit vsyscall_set_cpu(int cpu)
++{
++      unsigned long d;
++      unsigned long node = 0;
++#ifdef CONFIG_NUMA
++      node = cpu_to_node(cpu);
++#endif
++      if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
++              write_rdtscp_aux((node << 12) | cpu);
++
++      /* Store cpu number in limit so that it can be loaded quickly
++         in user space in vgetcpu.
++         12 bits for the CPU and 8 bits for the node. */
++      d = 0x0f40000000000ULL;
++      d |= cpu;
++      d |= (node & 0xf) << 12;
++      d |= (node >> 4) << 48;
++      write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
++}
++/*
++ * Cross-CPU callback (used with on_each_cpu() and
++ * smp_call_function_single()): runs vsyscall_set_cpu() for the CPU it
++ * executes on.  arg is unused.
++ */
++static void __cpuinit cpu_vsyscall_init(void *arg)
++{
++      /* preemption should be already off */
++      vsyscall_set_cpu(raw_smp_processor_id());
++}
++/*
++ * CPU notifier callback: for CPU_ONLINE and CPU_ONLINE_FROZEN it runs
++ * cpu_vsyscall_init() on the CPU number passed in arg.  Other actions are
++ * ignored.  Always returns NOTIFY_DONE.
++ */
++static int __cpuinit
++cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
++{
++      long cpu = (long)arg;
++      if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
++              smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
++      return NOTIFY_DONE;
++}
++/*
++ * Maps the physical page of __vsyscall_0 at fixmap slot
++ * VSYSCALL_FIRST_PAGE with PAGE_KERNEL_VSYSCALL protection.
++ */
++void __init map_vsyscall(void)
++{
++      extern char __vsyscall_0;
++      unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
++
++      /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
++      __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
++}
++/*
++ * Initcall.  Checks with BUG_ON() that vgettimeofday, vtime and vgetcpu
++ * sit at their VSYSCALL_ADDR() slots and that slot 0 matches the fixmap
++ * address.  Under CONFIG_XEN it clears sysctl_enabled and picks
++ * vgetcpu_mode from RDTSCP availability.  It then registers the sysctl
++ * table (CONFIG_SYSCTL), runs cpu_vsyscall_init() via on_each_cpu() and
++ * installs the CPU notifier.  Always returns 0.
++ */
++static int __init vsyscall_init(void)
++{
++      BUG_ON(((unsigned long) &vgettimeofday !=
++                      VSYSCALL_ADDR(__NR_vgettimeofday)));
++      BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
++      BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
++      BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
++#ifdef CONFIG_XEN
++      vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofday() */
++      if (boot_cpu_has(X86_FEATURE_RDTSCP))
++              vgetcpu_mode = VGETCPU_RDTSCP;
++      else
++              vgetcpu_mode = VGETCPU_LSL;
++#endif
++#ifdef CONFIG_SYSCTL
++      register_sysctl_table(kernel_root_table2);
++#endif
++      on_each_cpu(cpu_vsyscall_init, NULL, 1);
++      /* notifier priority > KVM */
++      hotcpu_notifier(cpu_vsyscall_notifier, 30);
++      return 0;
++}
++
++__initcall(vsyscall_init);
diff --cc arch/x86/kernel/x8664_ksyms_64.c

index 9796c2f,9796c2f..4d10890
--- 1/arch/x86/kernel/x8664_ksyms_64.c
--- 2/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@@ -55,6 -55,6 +55,6 @@@ EXPORT_SYMBOL(__memcpy)
   EXPORT_SYMBOL(memmove);
   
   EXPORT_SYMBOL(empty_zero_page);
--#ifndef CONFIG_PARAVIRT
++#if !defined(CONFIG_PARAVIRT) && !defined(CONFIG_XEN)
   EXPORT_SYMBOL(native_load_gs_index);
   #endif
diff --cc arch/x86/kernel/x86_init-xen.c

index 0000000,0000000..369d3f5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/kernel/x86_init-xen.c
@@@ -1,0 -1,0 +1,99 @@@
++/*
++ * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de>
++ *
++ *  For licencing details see kernel-base/COPYING
++ */
++#include <linux/bitmap.h>
++#include <linux/init.h>
++#include <linux/ioport.h>
++#include <linux/list.h>
++#include <linux/module.h>
++#include <linux/spinlock_types.h>
++#include <linux/threads.h>
++
++#include <asm/pci_x86.h>
++#include <asm/mpspec.h>
++#include <asm/setup.h>
++#include <asm/apic.h>
++#include <asm/e820.h>
++#include <asm/time.h>
++#include <asm/irq.h>
++#include <asm/pat.h>
++#include <asm/iommu.h>
++/*
++ * Do-nothing callbacks used as defaults in the x86_init table below;
++ * iommu_init_noop() returns 0.
++ */
++void __cpuinit x86_init_noop(void) { }
++void __init x86_init_uint_noop(unsigned int unused) { }
++void __init x86_init_pgd_noop(pgd_t *unused) { }
++int __init iommu_init_noop(void) { return 0; }
++
++/*
++ * The platform setup functions are preset with the default functions
++ * for standard PC hardware.
++ *
++ * In this Xen variant many hooks are no-ops or NULL, and oem.arch_setup
++ * and mapping.pagetable_reserve point at xen_arch_setup and
++ * xen_pagetable_reserve.
++ */
++struct x86_init_ops x86_init __initdata = {
++
++      .resources = {
++              .probe_roms             = x86_init_noop,
++              .reserve_resources      = reserve_standard_io_resources,
++              .memory_setup           = default_machine_specific_memory_setup,
++      },
++
++      .mpparse = {
++              .mpc_record             = x86_init_uint_noop,
++              .setup_ioapic_ids       = x86_init_noop,
++              .mpc_apic_id            = NULL,
++              .smp_read_mpc_oem       = default_smp_read_mpc_oem,
++              .mpc_oem_bus_info       = default_mpc_oem_bus_info,
++              .find_smp_config        = default_find_smp_config,
++              .get_smp_config         = default_get_smp_config,
++      },
++
++      .irqs = {
++              .pre_vector_init        = NULL,
++              .intr_init              = NULL,
++              .trap_init              = x86_init_noop,
++      },
++
++      .oem = {
++              .arch_setup             = xen_arch_setup,
++              .banner                 = x86_init_noop,
++      },
++
++      .mapping = {
++              .pagetable_reserve              = xen_pagetable_reserve,
++      },
++
++      .paging = {
++              .pagetable_setup_start  = x86_init_pgd_noop,
++              .pagetable_setup_done   = x86_init_pgd_noop,
++      },
++
++      .timers = {
++              .setup_percpu_clockev   = NULL,
++              .tsc_pre_init           = x86_init_noop,
++              .timer_init             = x86_init_noop,
++              .wallclock_init         = x86_init_noop,
++      },
++
++      .iommu = {
++              .iommu_init             = iommu_init_noop,
++      },
++
++      .pci = {
++              .init                   = x86_default_pci_init,
++              .init_irq               = x86_default_pci_init_irq,
++              .fixup_irqs             = x86_default_pci_fixup_irqs,
++      },
++};
++/*
++ * Default i8042_detect hook for x86_platform: always returns 1.
++ * NOTE(review): the ';' after the closing brace is a stray empty
++ * declaration; it is harmless but could be dropped.
++ */
++static int default_i8042_detect(void) { return 1; };
++/*
++ * Platform operations table, exported to GPL modules.  calibrate_tsc is
++ * left NULL here; the wallclock hooks use mach_get_cmos_time and
++ * mach_set_rtc_mmss.
++ */
++struct x86_platform_ops x86_platform = {
++      .calibrate_tsc                  = NULL,
++      .get_wallclock                  = mach_get_cmos_time,
++      .set_wallclock                  = mach_set_rtc_mmss,
++      .is_untracked_pat_range         = is_ISA_range,
++      .i8042_detect                   = default_i8042_detect
++};
++
++EXPORT_SYMBOL_GPL(x86_platform);
diff --cc arch/x86/kvm/Kconfig

index 50f6364,50f6364..975051d
--- 1/arch/x86/kvm/Kconfig
--- 2/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@@ -7,6 -7,6 +7,7 @@@ source "virt/kvm/Kconfig
   menuconfig VIRTUALIZATION
         bool "Virtualization"
         depends on HAVE_KVM || X86
++      depends on !XEN
         default y
         ---help---
           Say Y here to get to see options for using your Linux host to run other
diff --cc arch/x86/kvm/svm.c
Simple merge
diff --cc arch/x86/kvm/x86.c
Simple merge
diff --cc arch/x86/lib/Makefile

index f2479f1,f2479f1..64a8d7d
--- 1/arch/x86/lib/Makefile
--- 2/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@@ -15,6 -15,6 +15,7 @@@ $(obj)/inat.o: $(obj)/inat-tables.
   clean-files := inat-tables.c
   
   obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
++obj-$(CONFIG_XEN) += cache-smp.o
   
   lib-y := delay.o
   lib-y += thunk_$(BITS).o
@@@ -44,3 -44,3 +45,5 @@@ els
         lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
         lib-y += cmpxchg16b_emu.o
   endif
++
++lib-$(CONFIG_XEN_SCRUB_PAGES) += scrub.o
diff --cc arch/x86/lib/cache-smp-xen.c

index 0000000,0000000..48bfd37

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/lib/cache-smp-xen.c
@@@ -1,0 -1,0 +1,27 @@@
++#include <linux/smp.h>
++#include <linux/module.h>
++#include <asm/hypervisor.h>
++/*
++ * Callback-shaped wrapper around wbinvd() so it can be handed to the
++ * cross-CPU call helpers.  dummy is unused.
++ */
++static void __wbinvd(void *dummy)
++{
++      wbinvd();
++}
++/*
++ * Built only when CONFIG_XEN is not set: runs wbinvd() on the given CPU
++ * through smp_call_function_single().
++ */
++#ifndef CONFIG_XEN
++void wbinvd_on_cpu(int cpu)
++{
++      smp_call_function_single(cpu, __wbinvd, NULL, 1);
++}
++EXPORT_SYMBOL(wbinvd_on_cpu);
++#endif
++/*
++ * Requests a global cache flush from the hypervisor with the
++ * MMUEXT_FLUSH_CACHE_GLOBAL mmuext op.  Returns 0 when the hypercall
++ * returns 0; otherwise runs wbinvd() through on_each_cpu() and returns
++ * that call's result.
++ */
++int wbinvd_on_all_cpus(void)
++{
++      struct mmuext_op op = { .cmd = MMUEXT_FLUSH_CACHE_GLOBAL };
++
++      if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) == 0)
++              return 0;
++      /* Best effort as fallback. */
++      return on_each_cpu(__wbinvd, NULL, 1);
++}
++EXPORT_SYMBOL(wbinvd_on_all_cpus);
diff --cc arch/x86/lib/scrub.c

index 0000000,0000000..f333ae8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/lib/scrub.c
@@@ -1,0 -1,0 +1,21 @@@
++#include <asm/cpufeature.h>
++#include <asm/page.h>
++#include <asm/processor.h>
++/*
++ * Zeroes count pages starting at v.  When cpu_has_xmm2 is true, each loop
++ * iteration writes four longs of 0 with movnti, and an sfence follows the
++ * loop; otherwise clear_page() is called once per page.  The pointer
++ * stepping relies on GNU C arithmetic on void *.
++ */
++void scrub_pages(void *v, unsigned int count)
++{
++      if (likely(cpu_has_xmm2)) {
++              unsigned long n = count * (PAGE_SIZE / sizeof(long) / 4);
++
++              for (; n--; v += sizeof(long) * 4)
++                      asm("movnti %1,(%0)\n\t"
++                          "movnti %1,%c2(%0)\n\t"
++                          "movnti %1,2*%c2(%0)\n\t"
++                          "movnti %1,3*%c2(%0)\n\t"
++                          : : "r" (v), "r" (0L), "i" (sizeof(long))
++                          : "memory");
++              asm volatile("sfence" : : : "memory");
++      } else
++              for (; count--; v += PAGE_SIZE)
++                      clear_page(v);
++}
diff --cc arch/x86/mm/Makefile

index 3d11327,3e608ed..ec16d84
--- 1/arch/x86/mm/Makefile
--- 2/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@@ -23,10 -23,10 +23,13 @@@ mmiotrace-y                        := kmmio.o pf_in.o mmio-m
   obj-$(CONFIG_MMIOTRACE_TEST)  += testmmiotrace.o
   
   obj-$(CONFIG_NUMA)            += numa.o numa_$(BITS).o
- obj-$(CONFIG_AMD_NUMA)                += amdtopology.o
- obj-$(CONFIG_ACPI_NUMA)               += srat.o
+ obj-$(CONFIG_AMD_NUMA)                += amdtopology_64.o
+ obj-$(CONFIG_ACPI_NUMA)               += srat_$(BITS).o
   obj-$(CONFIG_NUMA_EMU)                += numa_emulation.o
   
++obj-$(CONFIG_XEN)             += hypervisor.o
++disabled-obj-$(CONFIG_XEN)    := gup.o tlb.o
++
   obj-$(CONFIG_HAVE_MEMBLOCK)           += memblock.o
   
   obj-$(CONFIG_MEMTEST)         += memtest.o
diff --cc arch/x86/mm/dump_pagetables-xen.c

index 0000000,0000000..d352692

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/dump_pagetables-xen.c
@@@ -1,0 -1,0 +1,392 @@@
++/*
++ * Debug helper to dump the current kernel pagetables of the system
++ * so that we can see what the various memory ranges are set to.
++ *
++ * (C) Copyright 2008 Intel Corporation
++ *
++ * Author: Arjan van de Ven <arjan@linux.intel.com>
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; version 2
++ * of the License.
++ */
++
++#include <linux/debugfs.h>
++#include <linux/init.h>
++#include <linux/mm.h>
++#include <linux/module.h>
++#include <linux/seq_file.h>
++
++#include <xen/interface/xen.h>
++
++#include <asm/pgtable.h>
++
++/*
++ * The dumper groups pagetable entries of the same type into one, and for
++ * that it needs to keep some state when walking, and flush this state
++ * when a "break" in the continuity is found.
++ */
++struct pg_state {
++      int level; /* level of the run being collected; 0 until note_page() sees its first entry */
++      pgprot_t current_prot; /* protection value of the run being collected */
++      unsigned long start_address; /* address at which the run being collected begins */
++      unsigned long current_address; /* address the walker is visiting right now */
++      const struct addr_marker *marker; /* current position in address_markers[] */
++};
++
++struct addr_marker { /* one labelled boundary in the dumped address space */
++      unsigned long start_address; /* note_page() switches to this marker once the walk reaches it */
++      const char *name; /* printed as "---[ name ]---" */
++};
++
++/* Indices into address_markers[]; the order must match the initializer below */
++enum address_markers_idx {
++      USER_SPACE_NR = 0,
++#ifdef CONFIG_X86_64
++      XEN_SPACE_NR,
++      LOW_KERNEL_NR,
++      VMALLOC_START_NR,
++      VMEMMAP_START_NR,
++      HIGH_KERNEL_NR,
++      MODULES_VADDR_NR,
++      MODULES_END_NR,
++#else
++      KERNEL_SPACE_NR,
++      VMALLOC_START_NR,
++      VMALLOC_END_NR,
++# ifdef CONFIG_HIGHMEM
++      PKMAP_BASE_NR,
++# endif
++      FIXADDR_START_NR,
++      XEN_SPACE_NR,
++#endif
++};
++
++/* Region labels and start addresses; the 0 placeholders are set by pt_dump_init() */
++static struct addr_marker address_markers[] = {
++      { 0, "User Space" },
++#ifdef CONFIG_X86_64
++      { HYPERVISOR_VIRT_START,      "Hypervisor Space" },
++      { PAGE_OFFSET,                "Low Kernel Mapping" },
++      { VMALLOC_START,              "vmalloc() Area" },
++      { VMEMMAP_START,              "Vmemmap" },
++      { __START_KERNEL_map,         "High Kernel Mapping" },
++      { MODULES_VADDR,              "Modules" },
++      { MODULES_END,                "End Modules" },
++#else
++      { PAGE_OFFSET,                "Kernel Mapping" },
++      { 0/* VMALLOC_START */,       "vmalloc() Area" },
++      { 0/*VMALLOC_END*/,           "vmalloc() End" },
++# ifdef CONFIG_HIGHMEM
++      { 0/*PKMAP_BASE*/,            "Persisent kmap() Area" }, /* NOTE(review): "Persisent" is misspelled in this user-visible label */
++# endif
++      { 0/*FIXADDR_START*/,         "Fixmap Area" },
++      { 0/*HYPERVISOR_VIRT_START*/, "Hypervisor Space" },
++#endif
++      { -1, NULL }                  /* End of list */
++};
++
++static inline bool hypervisor_space(unsigned long addr) { /* true if addr lies in the hypervisor's range */
++#ifdef CONFIG_X86_64
++      return addr >= HYPERVISOR_VIRT_START && addr < HYPERVISOR_VIRT_END;
++#else
++      return addr >= hypervisor_virt_start; /* 32-bit: everything from hypervisor_virt_start upwards */
++#endif
++}
++
++/* Bytes of virtual address space covered by one entry at each page-table level */
++#define PTE_LEVEL_MULT (PAGE_SIZE)
++#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
++#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
++#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
++
++/*
++ * Print the flags in prot in readable form, then the name of the level
++ */
++static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
++{
++      pgprotval_t pr = pgprot_val(prot);
++      static const char * const level_name[] =
++              { "cr3", "pgd", "pud", "pmd", "pte" }; /* indexed by level, 0..4 */
++
++      if (!pgprot_val(prot)) {
++              /* Not present */
++              seq_printf(m, "                          ");
++      } else {
++              if (pr & _PAGE_USER)
++                      seq_printf(m, "USR ");
++              else
++                      seq_printf(m, "    ");
++              if (pr & _PAGE_RW)
++                      seq_printf(m, "RW ");
++              else
++                      seq_printf(m, "ro ");
++              if (pr & _PAGE_PWT)
++                      seq_printf(m, "PWT ");
++              else
++                      seq_printf(m, "    ");
++              if (pr & _PAGE_PCD)
++                      seq_printf(m, "PCD ");
++              else
++                      seq_printf(m, "    ");
++
++              /* Bit 7 has a different meaning on level 3 vs 4 */
++              if (level <= 3) {
++                      if (pr & _PAGE_PSE)
++                              seq_printf(m, "PSE ");
++                      else
++                              seq_printf(m, "    ");
++              } else {
++                      if (pr & _PAGE_PAT)
++                              seq_printf(m, "pat ");
++                      else
++                              seq_printf(m, "    ");
++              }
++              if (pr & _PAGE_GLOBAL)
++                      seq_printf(m, "GLB ");
++              else
++                      seq_printf(m, "    ");
++              if (pr & _PAGE_NX)
++                      seq_printf(m, "NX ");
++              else
++                      seq_printf(m, "x  ");
++      }
++      seq_printf(m, "%s\n", level_name[level]);
++}
++
++/*
++ * On 64-bit, copy bit 47 of u into bits 48-63; on 32-bit, return u as is
++ */
++static unsigned long normalize_addr(unsigned long u)
++{
++#ifdef CONFIG_X86_64
++      return (signed long)(u << 16) >> 16; /* shift up, then signed shift back down */
++#else
++      return u;
++#endif
++}
++
++/*
++ * This function gets called for every entry the walkers visit; when
++ * the entry breaks a continuous series (see below) we print what we
++ * collected so far and start a new series at the current address.
++ */
++static void note_page(struct seq_file *m, struct pg_state *st,
++                    pgprot_t new_prot, int level)
++{
++      pgprotval_t prot, cur;
++      static const char units[] = "KMGTPE";
++
++      /*
++       * If we have a "break" in the series, we need to flush the state that
++       * we have now. "break" is either changing perms, levels or
++       * address space marker.
++       */
++      prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
++      cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
++
++      if (!st->level) {
++              /* First entry */
++              st->current_prot = new_prot;
++              st->level = level;
++              st->marker = address_markers;
++              seq_printf(m, "---[ %s ]---\n", st->marker->name);
++      } else if (prot != cur || level != st->level ||
++                 st->current_address >= st->marker[1].start_address) {
++              const char *unit = units;
++              unsigned long delta;
++              int width = sizeof(unsigned long) * 2; /* hex digits in an unsigned long */
++
++              /*
++               * Now print the actual finished series
++               */
++              seq_printf(m, "0x%0*lx-0x%0*lx   ",
++                         width, st->start_address,
++                         width, st->current_address);
++
++              delta = (st->current_address - st->start_address) >> 10; /* size in units of 1024 bytes */
++              while (!(delta & 1023) && unit[1]) { /* step to the next unit while the size divides evenly */
++                      delta >>= 10;
++                      unit++;
++              }
++              seq_printf(m, "%9lu%c ", delta, *unit);
++              printk_prot(m, st->current_prot, st->level);
++
++              /*
++               * We print markers for special areas of address space,
++               * such as the start of vmalloc space etc.
++               * This helps in the interpretation.
++               */
++              if (st->current_address >= st->marker[1].start_address) {
++                      st->marker++;
++                      seq_printf(m, "---[ %s ]---\n", st->marker->name);
++              }
++
++              st->start_address = st->current_address;
++              st->current_prot = new_prot;
++              st->level = level;
++      }
++}
++
++static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
++                                                      unsigned long P) /* P: address mapped by the table's first entry */
++{
++      int i;
++      pte_t *start;
++
++      start = (pte_t *) pmd_page_vaddr(addr); /* PTE table referenced by the pmd entry */
++      for (i = 0; i < PTRS_PER_PTE; i++) {
++              pgprot_t prot = pte_pgprot(*start);
++
++              st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
++              note_page(m, st, prot, 4); /* 4 selects "pte" in printk_prot()'s level_name[] */
++              start++;
++      }
++}
++
++#if PTRS_PER_PMD > 1
++
++static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
++                                                      unsigned long P) /* P: address mapped by the table's first entry */
++{
++      int i;
++      pmd_t *start;
++
++      start = (pmd_t *) pud_page_vaddr(addr);
++      for (i = 0; i < PTRS_PER_PMD; i++) {
++              st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
++              if (!hypervisor_space(st->current_address)
++                  && !pmd_none(*start)) {
++                      pgprotval_t prot = __pmd_val(*start) & PTE_FLAGS_MASK;
++
++                      if (pmd_large(*start) || !pmd_present(*start))
++                              note_page(m, st, __pgprot(prot), 3); /* large or not-present entry: report at pmd level */
++                      else
++                              walk_pte_level(m, st, *start,
++                                             P + i * PMD_LEVEL_MULT);
++              } else
++                      note_page(m, st, __pgprot(0), 3); /* empty entry or hypervisor range: report with no flags */
++              start++;
++      }
++}
++
++#else /* single-entry PMD level: hand the pud entry to the pte walker as a pmd */
++#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
++#define pud_large(a) pmd_large(__pmd(pud_val(a)))
++#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
++#endif
++
++#if PTRS_PER_PUD > 1
++
++static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
++                                                      unsigned long P) /* P: address mapped by the table's first entry */
++{
++      int i;
++      pud_t *start;
++
++      start = (pud_t *) pgd_page_vaddr(addr);
++
++      for (i = 0; i < PTRS_PER_PUD; i++) {
++              st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
++              if (!hypervisor_space(st->current_address)
++                  && !pud_none(*start)) {
++                      pgprotval_t prot = __pud_val(*start) & PTE_FLAGS_MASK;
++
++                      if (pud_large(*start) || !pud_present(*start))
++                              note_page(m, st, __pgprot(prot), 2); /* large or not-present entry: report at pud level */
++                      else
++                              walk_pmd_level(m, st, *start,
++                                             P + i * PUD_LEVEL_MULT);
++              } else
++                      note_page(m, st, __pgprot(0), 2); /* empty entry or hypervisor range: report with no flags */
++
++              start++;
++      }
++}
++
++#else /* single-entry PUD level: hand the pgd entry to the pmd walker as a pud */
++#define __pud_ma(x) ((pud_t){ __pgd_ma(x) })
++#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud_ma(__pgd_val(a)),p)
++#define pgd_large(a) pud_large(__pud_ma(__pgd_val(a)))
++#define pgd_none(a)  pud_none(__pud_ma(__pgd_val(a)))
++#endif
++
++static void walk_pgd_level(struct seq_file *m) /* walk the kernel's top-level table and dump every range to m */
++{
++#ifdef CONFIG_X86_64
++      pgd_t *start = (pgd_t *) &init_level4_pgt;
++#else
++      pgd_t *start = swapper_pg_dir;
++#endif
++      int i;
++      struct pg_state st;
++
++      memset(&st, 0, sizeof(st)); /* level == 0 marks "no entry seen yet" for note_page() */
++
++      for (i = 0; i < PTRS_PER_PGD; i++) {
++              st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
++              if (!pgd_none(*start)) {
++                      pgprotval_t prot = __pgd_val(*start) & PTE_FLAGS_MASK;
++
++                      if (pgd_large(*start) || !pgd_present(*start))
++                              note_page(m, &st, __pgprot(prot), 1);
++                      else
++                              walk_pud_level(m, &st, *start,
++                                             i * PGD_LEVEL_MULT);
++              } else
++                      note_page(m, &st, __pgprot(0), 1);
++
++              start++;
++      }
++
++      /* Flush out the last run: level 0 never matches a level used by the walk */
++      st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
++      note_page(m, &st, __pgprot(0), 0);
++}
++
++static int ptdump_show(struct seq_file *m, void *v) /* show routine passed to single_open(); v is unused */
++{
++      walk_pgd_level(m);
++      return 0;
++}
++
++static int ptdump_open(struct inode *inode, struct file *filp) /* .open handler of ptdump_fops */
++{
++      return single_open(filp, ptdump_show, NULL); /* ptdump_show produces the content; no private data */
++}
++
++static const struct file_operations ptdump_fops = { /* handed to debugfs_create_file() in pt_dump_init() */
++      .open           = ptdump_open,
++      .read           = seq_read,
++      .llseek         = seq_lseek,
++      .release        = single_release,
++};
++
++static int __init pt_dump_init(void) /* fills in 32-bit marker addresses and creates the debugfs file */
++{
++      struct dentry *pe;
++
++#ifdef CONFIG_X86_32
++      /* Not a compile-time constant on x86-32 */
++      address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
++      address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
++# ifdef CONFIG_HIGHMEM
++      address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
++# endif
++      address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
++      address_markers[XEN_SPACE_NR].start_address = hypervisor_virt_start;
++#endif
++
++      pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
++                               &ptdump_fops); /* mode 0600, NULL parent dentry, no private data */
++      if (!pe)
++              return -ENOMEM; /* a NULL dentry is reported as -ENOMEM */
++
++      return 0;
++}
++
++__initcall(pt_dump_init);
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
++MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --cc arch/x86/mm/fault-xen.c

index 0000000,0000000..0088e43

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/fault-xen.c
@@@ -1,0 -1,0 +1,1195 @@@
++/*
++ *  Copyright (C) 1995  Linus Torvalds
++ *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
++ *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
++ */
++#include <linux/magic.h>              /* STACK_END_MAGIC              */
++#include <linux/sched.h>              /* test_thread_flag(), ...      */
++#include <linux/kdebug.h>             /* oops_begin/end, ...          */
++#include <linux/module.h>             /* search_exception_table       */
++#include <linux/bootmem.h>            /* max_low_pfn                  */
++#include <linux/kprobes.h>            /* __kprobes, ...               */
++#include <linux/mmiotrace.h>          /* kmmio_handler, ...           */
++#include <linux/perf_event.h>         /* perf_sw_event                */
++#include <linux/hugetlb.h>            /* hstate_index_to_shift        */
++
++#include <asm/traps.h>                        /* dotraplinkage, ...           */
++#include <asm/pgalloc.h>              /* pgd_*(), ...                 */
++#include <asm/kmemcheck.h>            /* kmemcheck_*(), ...           */
++
++/*
++ * Page fault error code bits (tested against error_code in this file):
++ *
++ *   bit 0 ==  0: no page found       1: protection fault
++ *   bit 1 ==  0: read access         1: write access
++ *   bit 2 ==  0: kernel-mode access  1: user-mode access
++ *   bit 3 ==                         1: use of reserved bit detected
++ *   bit 4 ==                         1: fault was an instruction fetch
++ */
++enum x86_pf_error_code {
++
++      PF_PROT         =               1 << 0,
++      PF_WRITE        =               1 << 1,
++      PF_USER         =               1 << 2,
++      PF_RSVD         =               1 << 3,
++      PF_INSTR        =               1 << 4,
++};
++
++/*
++ * Returns 0 if mmiotrace is disabled or kmmio_handler() does not return 1
++ * for this fault, and -1 when kmmio_handler() does return 1:
++ */
++static inline int __kprobes
++kmmio_fault(struct pt_regs *regs, unsigned long addr)
++{
++      if (unlikely(is_kmmio_active()))
++              if (kmmio_handler(regs, addr) == 1)
++                      return -1;
++      return 0;
++}
++
++static inline int __kprobes notify_page_fault(struct pt_regs *regs) /* 1 if a running kprobe's fault handler returned nonzero, else 0 */
++{
++      int ret = 0;
++
++      /* kprobe_running() needs smp_processor_id() */
++      if (kprobes_built_in() && !user_mode_vm(regs)) {
++              preempt_disable();
++              if (kprobe_running() && kprobe_fault_handler(regs, 14)) /* 14 is the trap number passed to the handler */
++                      ret = 1;
++              preempt_enable();
++      }
++
++      return ret;
++}
++
++/*
++ * Prefetch quirks:
++ *
++ * 32-bit mode:
++ *
++ *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
++ *   Check that here and ignore it.
++ *
++ * 64-bit mode:
++ *
++ *   Sometimes the CPU reports invalid exceptions on prefetch.
++ *   Check that here and ignore it.
++ *
++ * Opcode checker based on code by Richard Brunner.
++ */
++static inline int
++check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
++                    unsigned char opcode, int *prefetch)
++{
++      unsigned char instr_hi = opcode & 0xf0;
++      unsigned char instr_lo = opcode & 0x0f;
++
++      switch (instr_hi) { /* nonzero return: opcode was a prefix, is_prefetch() keeps scanning */
++      case 0x20:
++      case 0x30:
++              /*
++               * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
++               * In X86_64 long mode, the CPU will signal invalid
++               * opcode if some of these prefixes are present so
++               * X86_64 will never get here anyway
++               */
++              return ((instr_lo & 7) == 0x6);
++#ifdef CONFIG_X86_64
++      case 0x40:
++              /*
++               * In AMD64 long mode 0x40..0x4F are valid REX prefixes
++               * Need to figure out under what instruction mode the
++               * instruction was issued. Could check the LDT for lm,
++               * but for now it's good enough to assume that long
++               * mode only uses well known segments or kernel.
++               */
++              return (!user_mode(regs)) || (regs->cs == __USER_CS);
++#endif
++      case 0x60:
++              /* 0x64 thru 0x67 are valid prefixes in all modes. */
++              return (instr_lo & 0xC) == 0x4;
++      case 0xF0:
++              /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
++              return !instr_lo || (instr_lo>>1) == 1;
++      case 0x00:
++              /* Prefetch instruction is 0x0F0D or 0x0F18; read the byte at instr */
++              if (probe_kernel_address(instr, opcode))
++                      return 0;
++
++              *prefetch = (instr_lo == 0xF) &&
++                      (opcode == 0x0D || opcode == 0x18);
++              return 0; /* scan ends here whether or not a prefetch was found */
++      default:
++              return 0;
++      }
++}
++
++static int
++is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
++{
++      unsigned char *max_instr;
++      unsigned char *instr;
++      int prefetch = 0; /* result; set by check_prefetch_opcode() */
++
++      /*
++       * If it was an exec (instruction fetch) fault on NX page, then
++       * do not ignore the fault:
++       */
++      if (error_code & PF_INSTR)
++              return 0;
++
++      instr = (void *)convert_ip_to_linear(current, regs);
++      max_instr = instr + 15; /* scan at most 15 bytes */
++
++      if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
++              return 0;
++
++      while (instr < max_instr) {
++              unsigned char opcode;
++
++              if (probe_kernel_address(instr, opcode))
++                      break;
++
++              instr++; /* instr now points at the byte after opcode */
++
++              if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
++                      break;
++      }
++      return prefetch;
++}
++
++static void
++force_sig_info_fault(int si_signo, int si_code, unsigned long address,
++                   struct task_struct *tsk, int fault)
++{
++      unsigned lsb = 0; /* stays 0 unless a HWPOISON bit is set in fault */
++      siginfo_t info;
++
++      info.si_signo   = si_signo;
++      info.si_errno   = 0;
++      info.si_code    = si_code;
++      info.si_addr    = (void __user *)address;
++      if (fault & VM_FAULT_HWPOISON_LARGE)
++              lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
++      if (fault & VM_FAULT_HWPOISON)
++              lsb = PAGE_SHIFT; /* tested second, so it wins if both bits are set */
++      info.si_addr_lsb = lsb;
++
++      force_sig_info(si_signo, &info, tsk);
++}
++
++DEFINE_SPINLOCK(pgd_lock);
++LIST_HEAD(pgd_list);
++
++#ifdef CONFIG_X86_32
++static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) /* returns init_mm's pmd for address, or NULL if it is not present */
++{
++      unsigned index = pgd_index(address);
++      pgd_t *pgd_k;
++      pud_t *pud, *pud_k;
++      pmd_t *pmd, *pmd_k;
++
++      pgd += index;
++      pgd_k = init_mm.pgd + index; /* the *_k pointers walk init_mm's tables */
++
++      if (!pgd_present(*pgd_k))
++              return NULL;
++
++      /*
++       * set_pgd(pgd, *pgd_k); here would be useless on PAE
++       * and redundant with the set_pmd() on non-PAE. As would
++       * set_pud.
++       */
++      pud = pud_offset(pgd, address);
++      pud_k = pud_offset(pgd_k, address);
++      if (!pud_present(*pud_k))
++              return NULL;
++
++      pmd = pmd_offset(pud, address);
++      pmd_k = pmd_offset(pud_k, address);
++      if (!pmd_present(*pmd_k))
++              return NULL;
++
++      if (!pmd_present(*pmd))
++#if CONFIG_XEN_COMPAT > 0x030002
++              set_pmd(pmd, *pmd_k);
++#else
++              /*
++               * When running on older Xen we must launder *pmd_k through
++               * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
++               */
++              set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
++#endif
++      else
++              BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); /* an existing entry must reference the same page */
++
++      return pmd_k;
++}
++
++void vmalloc_sync_all(void) /* 32-bit: run vmalloc_sync_one() on every pgd in pgd_list, one PMD_SIZE step at a time */
++{
++      unsigned long address;
++
++      if (SHARED_KERNEL_PMD)
++              return;
++
++      for (address = VMALLOC_START & PMD_MASK;
++           address >= TASK_SIZE && address < FIXADDR_TOP;
++           address += PMD_SIZE) {
++              struct page *page;
++
++              spin_lock(&pgd_lock);
++              list_for_each_entry(page, &pgd_list, lru) {
++                      spinlock_t *pgt_lock;
++                      pmd_t *ret;
++
++                      /* the pgt_lock only for Xen */
++                      pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
++
++                      spin_lock(pgt_lock);
++                      ret = vmalloc_sync_one(page_address(page), address);
++                      spin_unlock(pgt_lock);
++
++                      if (!ret)
++                              break; /* init_mm has no entry for this address: skip the remaining pgds */
++              }
++              spin_unlock(&pgd_lock);
++      }
++}
++
++/*
++ * 32-bit:
++ *
++ *   Handle a fault on the vmalloc or module mapping area
++ */
++static noinline __kprobes int vmalloc_fault(unsigned long address) /* 0 if synced and the pte is present, else -1 */
++{
++      unsigned long pgd_paddr;
++      pmd_t *pmd_k;
++      pte_t *pte_k;
++
++      /* Make sure we are in vmalloc area: */
++      if (!(address >= VMALLOC_START && address < VMALLOC_END))
++              return -1;
++
++      WARN_ON_ONCE(in_nmi());
++
++      /*
++       * Synchronize this task's top level page-table
++       * with the 'reference' page table.
++       *
++       * Do _not_ use "current" here. We might be inside
++       * an interrupt in the middle of a task switch..
++       */
++      pgd_paddr = read_cr3();
++      pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
++      if (!pmd_k)
++              return -1;
++
++      pte_k = pte_offset_kernel(pmd_k, address);
++      if (!pte_present(*pte_k))
++              return -1;
++
++      return 0;
++}
++
++/*
++ * Did it hit the DOS screen memory VA from vm86 mode? If so, flag the page.
++ */
++static inline void
++check_v8086_mode(struct pt_regs *regs, unsigned long address,
++               struct task_struct *tsk)
++{
++      unsigned long bit; /* page index relative to 0xA0000; wraps to a huge value below it */
++
++      if (!v8086_mode(regs))
++              return;
++
++      bit = (address - 0xA0000) >> PAGE_SHIFT;
++      if (bit < 32)
++              tsk->thread.screen_bitmap |= 1UL << bit; /* 1UL: shifting int 1 by 31 overflows */
++}
++
++static bool low_pfn(unsigned long pfn) /* true if pfn is below max_low_pfn */
++{
++      return pfn < max_low_pfn;
++}
++
++static void dump_pagetable(unsigned long address) /* 32-bit: print the table entries found for address */
++{
++      pgd_t *base = __va(read_cr3());
++      pgd_t *pgd = &base[pgd_index(address)];
++      pmd_t *pmd;
++      pte_t *pte;
++
++#ifdef CONFIG_X86_PAE
++      printk("*pdpt = %016Lx ", __pgd_val(*pgd));
++      if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
++              goto out;
++#endif
++      pmd = pmd_offset(pud_offset(pgd, address), address);
++      printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)__pmd_val(*pmd)); /* width: two hex digits per byte of the entry */
++
++      /*
++       * We must not directly access the pte in the highpte
++       * case if the page table is located in highmem.
++       * And let's rather not kmap-atomic the pte, just in case
++       * it's allocated already:
++       */
++      if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
++              goto out;
++
++      pte = pte_offset_kernel(pmd, address);
++      printk(KERN_CONT "*pte = %0*Lx ", sizeof(*pte) * 2, (u64)__pte_val(*pte));
++out:
++      printk(KERN_CONT "\n");
++}
++#define dump_pagetable(addr, krnl) dump_pagetable(addr) /* accept and drop the 64-bit variant's second argument */
++
++#else /* CONFIG_X86_64: */
++
++void vmalloc_sync_all(void) /* 64-bit: delegate to sync_global_pgds() for the vmalloc range */
++{
++      sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
++}
++
++/*
++ * 64-bit:
++ *
++ *   Handle a fault on the vmalloc area
++ *
++ * This assumes no large pages in there.
++ */
++static noinline __kprobes int vmalloc_fault(unsigned long address) /* 0 if the reference mapping is present, else -1 */
++{
++      pgd_t *pgd, *pgd_ref;
++      pud_t *pud, *pud_ref;
++      pmd_t *pmd, *pmd_ref;
++      pte_t *pte, *pte_ref;
++
++      /* Make sure we are in vmalloc area: */
++      if (!(address >= VMALLOC_START && address < VMALLOC_END))
++              return -1;
++
++      WARN_ON_ONCE(in_nmi());
++
++      /*
++       * Copy kernel mappings over when needed. This can also
++       * happen within a race in page table update. In the latter
++       * case just flush:
++       */
++      pgd = pgd_offset(current->active_mm, address);
++      pgd_ref = pgd_offset_k(address);
++      if (pgd_none(*pgd_ref))
++              return -1;
++
++      if (pgd_none(*pgd))
++              set_pgd(pgd, *pgd_ref);
++      else
++              BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
++
++      /*
++       * Below here mismatches are bugs because these lower tables
++       * are shared:
++       */
++
++      pud = pud_offset(pgd, address);
++      pud_ref = pud_offset(pgd_ref, address);
++      if (pud_none(*pud_ref))
++              return -1;
++
++      if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
++              BUG();
++
++      pmd = pmd_offset(pud, address);
++      pmd_ref = pmd_offset(pud_ref, address);
++      if (pmd_none(*pmd_ref))
++              return -1;
++
++      if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
++              BUG();
++
++      pte_ref = pte_offset_kernel(pmd_ref, address);
++      if (!pte_present(*pte_ref))
++              return -1;
++
++      pte = pte_offset_kernel(pmd, address);
++
++      /*
++       * Don't use pte_page here, because the mappings can point
++       * outside mem_map, and the NUMA hash lookup cannot handle
++       * that:
++       */
++      if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
++              BUG();
++
++      return 0;
++}
++
++static const char errata93_warning[] = /* printed via printk_once() in is_errata93() */
++KERN_ERR
++"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
++"******* Working around it, but it may cause SEGVs or burn power.\n"
++"******* Please consider a BIOS update.\n"
++"******* Disabling USB legacy in the BIOS may also help.\n";
++
++/*
++ * No vm86 mode in 64-bit mode, so this variant is an empty stub:
++ */
++static inline void
++check_v8086_mode(struct pt_regs *regs, unsigned long address,
++               struct task_struct *tsk)
++{
++}
++
++static int bad_address(void *p) /* dump_pagetable() treats a nonzero result as "cannot read p" */
++{
++      unsigned long dummy; /* receives the probed value; never used */
++
++      return probe_kernel_address((unsigned long *)p, dummy);
++}
++
++static void dump_pagetable(unsigned long address, bool kernel) /* 64-bit: print PGD/PUD/PMD/PTE values for address */
++{
++      pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
++      pgd_t *pgd = base + pgd_index(address);
++      pud_t *pud;
++      pmd_t *pmd;
++      pte_t *pte;
++
++      if (!kernel)
++              pgd = __user_pgd(base) + pgd_index(address); /* index the table returned by __user_pgd() instead */
++
++      if (bad_address(pgd))
++              goto bad;
++
++      printk("PGD %lx ", pgd_val(*pgd));
++
++      if (!pgd_present(*pgd))
++              goto out;
++
++      pud = pud_offset(pgd, address);
++      if (bad_address(pud))
++              goto bad;
++
++      printk(KERN_CONT "PUD %lx ", pud_val(*pud));
++      if (!pud_present(*pud) || pud_large(*pud))
++              goto out;
++
++      pmd = pmd_offset(pud, address);
++      if (bad_address(pmd))
++              goto bad;
++
++      printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
++      if (!pmd_present(*pmd) || pmd_large(*pmd))
++              goto out;
++
++      pte = pte_offset_kernel(pmd, address);
++      if (bad_address(pte))
++              goto bad;
++
++      printk(KERN_CONT "PTE %lx", pte_val(*pte));
++out:
++      printk(KERN_CONT "\n");
++      return;
++bad:
++      printk("BAD\n"); /* reached when bad_address() rejects an entry pointer */
++}
++
++#endif /* CONFIG_X86_64 */
++
++/*
++ * Workaround for K8 erratum #93 & buggy BIOS.
++ *
++ * BIOS SMM functions are required to use a specific workaround
++ * to avoid corruption of the 64bit RIP register on C stepping K8.
++ *
++ * A lot of BIOS that didn't get tested properly miss this.
++ *
++ * The OS sees this as a page fault with the upper 32bits of RIP cleared.
++ * Try to work around it here.
++ *
++ * Note we only handle faults in kernel here.
++ * Does nothing on 32-bit.
++ */
++static int is_errata93(struct pt_regs *regs, unsigned long address)
++{
++#ifdef CONFIG_X86_64
++      if (address != regs->ip)
++              return 0;
++
++      if ((address >> 32) != 0)
++              return 0;
++
++      address |= 0xffffffffUL << 32; /* set the upper 32 bits to all-ones */
++      if ((address >= (u64)_stext && address <= (u64)_etext) ||
++          (address >= MODULES_VADDR && address <= MODULES_END)) {
++              printk_once(errata93_warning);
++              regs->ip = address; /* continue at the repaired address */
++              return 1;
++      }
++#endif
++      return 0;
++}
++
++/*
++ * Work around K8 erratum #100 K8 in compat mode occasionally jumps
++ * to illegal addresses >4GB.
++ *
++ * We catch this in the page fault handler because these addresses
++ * are not reachable. Just detect this case and return.  Any code
++ * segment in LDT is compatibility mode.
++ */
++static int is_errata100(struct pt_regs *regs, unsigned long address)
++{
++#ifdef CONFIG_X86_64
++      if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) /* cs bit 2 is the LDT test described above */
++              return 1;
++#endif
++      return 0;
++}
++
++static int is_f00f_bug(struct pt_regs *regs, unsigned long address) /* 1 if handled as an invalid-op, else 0 */
++{
++#ifdef CONFIG_X86_F00F_BUG
++      unsigned long nr;
++
++      /*
++       * Pentium F0 0F C7 C8 bug workaround:
++       */
++      if (boot_cpu_data.f00f_bug) {
++              nr = (address - idt_descr.address) >> 3; /* index of the 8-byte slot holding address */
++
++              if (nr == 6) {
++                      do_invalid_op(regs, 0);
++                      return 1;
++              }
++      }
++#endif
++      return 0;
++}
++
++static const char nx_warning[] = KERN_CRIT /* format string; show_fault_oops() supplies the uid */
++"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
++
++static void
++show_fault_oops(struct pt_regs *regs, unsigned long error_code,
++              unsigned long address)
++{
++      if (!oops_may_print())
++              return;
++
++      if (error_code & PF_INSTR) {
++              unsigned int level;
++
++              pte_t *pte = lookup_address(address, &level);
++
++              if (pte && pte_present(*pte) && !pte_exec(*pte)) /* instruction fetch from a present, non-executable page */
++                      printk(nx_warning, current_uid());
++      }
++
++      printk(KERN_ALERT "BUG: unable to handle kernel ");
++      if (address < PAGE_SIZE) /* addresses within the first page are reported as NULL dereferences */
++              printk(KERN_CONT "NULL pointer dereference");
++      else
++              printk(KERN_CONT "paging request");
++
++      printk(KERN_CONT " at %p\n", (void *) address);
++      printk(KERN_ALERT "IP:");
++      printk_address(regs->ip, 1);
++
++      dump_pagetable(address, !(error_code & PF_USER)); /* second argument is true for kernel-mode faults */
++}
++
++static noinline void
++pgtable_bad(struct pt_regs *regs, unsigned long error_code,
++          unsigned long address)
++{
++      struct task_struct *tsk;
++      unsigned long flags;
++      int sig;
++
++      flags = oops_begin();
++      tsk = current;
++      sig = SIGKILL;
++
++      printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
++             tsk->comm, address);
++      dump_pagetable(address, !(error_code & PF_USER));
++
++      tsk->thread.cr2         = address;
++      tsk->thread.trap_no     = 14; /* same trap number as passed to kprobe_fault_handler() */
++      tsk->thread.error_code  = error_code;
++
++      if (__die("Bad pagetable", regs, error_code))
++              sig = 0; /* nonzero __die() result: hand 0 instead of SIGKILL to oops_end() */
++
++      oops_end(flags, regs, sig);
++}
++
++/*
++ * Handle a fault for which no signal can be delivered to user space.
++ *
++ * Tries, in order, an exception-table fixup, the prefetch check and the
++ * errata 93 check; if any of them claims the fault, the function just
++ * returns.  Otherwise the fault is reported as an oops: the fault
++ * description is printed, the stack-end magic is checked, the fault is
++ * recorded in the thread struct (trap 14) and __die() is run.
++ * oops_end() is given SIGKILL unless __die() returned non-zero, in
++ * which case it is given signal 0.
++ */
++static noinline void
++no_context(struct pt_regs *regs, unsigned long error_code,
++         unsigned long address)
++{
++      struct task_struct *tsk = current;
++      unsigned long *stackend;
++      unsigned long flags;
++      int sig;
++
++      /* Are we prepared to handle this kernel fault? */
++      if (fixup_exception(regs))
++              return;
++
++      /*
++       * 32-bit:
++       *
++       *   Valid to do another page fault here, because if this fault
++       *   had been triggered by is_prefetch fixup_exception would have
++       *   handled it.
++       *
++       * 64-bit:
++       *
++       *   Hall of shame of CPU/BIOS bugs.
++       */
++      if (is_prefetch(regs, error_code, address))
++              return;
++
++      if (is_errata93(regs, address))
++              return;
++
++      /*
++       * Oops. The kernel tried to access some bad page. We'll have to
++       * terminate things with extreme prejudice:
++       */
++      flags = oops_begin();
++
++      show_fault_oops(regs, error_code, address);
++
++      /*
++       * The word at the end of the task's stack should still hold
++       * STACK_END_MAGIC; init_task is exempt from the check.
++       */
++      stackend = end_of_stack(tsk);
++      if (tsk != &init_task && *stackend != STACK_END_MAGIC)
++              printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
++
++      tsk->thread.cr2         = address;
++      tsk->thread.trap_no     = 14;
++      tsk->thread.error_code  = error_code;
++
++      sig = SIGKILL;
++      if (__die("Oops", regs, error_code))
++              sig = 0;
++
++      /* Executive summary in case the body of the oops scrolled away */
++      printk(KERN_EMERG "CR2: %016lx\n", address);
++
++      oops_end(flags, regs, sig);
++}
++
++/*
++ * Log one line describing a fatal segfault of @tsk.
++ *
++ * Nothing is printed when SIGSEGV is handled by the task or when the
++ * printk rate limit is exceeded.  The line is logged at KERN_INFO for
++ * pids above 1 and at KERN_EMERG otherwise, and is followed by the
++ * mapping that contains the faulting instruction pointer.
++ */
++static inline void
++show_signal_msg(struct pt_regs *regs, unsigned long error_code,
++              unsigned long address, struct task_struct *tsk)
++{
++      const char *loglevel;
++
++      if (!unhandled_signal(tsk, SIGSEGV) || !printk_ratelimit())
++              return;
++
++      loglevel = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
++
++      printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
++              loglevel, tsk->comm, task_pid_nr(tsk), address,
++              (void *)regs->ip, (void *)regs->sp, error_code);
++
++      print_vma_addr(KERN_CONT " in ", regs->ip);
++
++      printk(KERN_CONT "\n");
++}
++
++/*
++ * Deal with a fault on an address that cannot be resolved; mmap_sem
++ * must not be held by the caller.
++ *
++ * Kernel-mode faults go to no_context() unless the f00f workaround
++ * claims them.  User-mode faults (PF_USER) get SIGSEGV with @si_code
++ * unless the prefetch or errata 100 checks claim them.
++ */
++static void
++__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
++                     unsigned long address, int si_code)
++{
++      struct task_struct *task = current;
++
++      if (!(error_code & PF_USER)) {
++              if (!is_f00f_bug(regs, address))
++                      no_context(regs, error_code, address);
++              return;
++      }
++
++      /* User mode accesses just cause a SIGSEGV. */
++
++      /* Interrupts may still be off at this point: */
++      local_irq_enable();
++
++      /*
++       * Taking another page fault in these checks is fine because
++       * the original fault came from user space:
++       */
++      if (is_prefetch(regs, error_code, address) ||
++          is_errata100(regs, address))
++              return;
++
++      if (unlikely(show_unhandled_signals))
++              show_signal_msg(regs, error_code, address, task);
++
++      /* Kernel addresses are always reported as protection faults: */
++      task->thread.cr2 = address;
++      task->thread.error_code = error_code | (address >= TASK_SIZE);
++      task->thread.trap_no = 14;
++
++      force_sig_info_fault(SIGSEGV, si_code, address, task, 0);
++}
++
++/*
++ * Unresolvable fault without mmap_sem held: forwards to
++ * __bad_area_nosemaphore() with si_code SEGV_MAPERR.
++ */
++static noinline void
++bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
++                   unsigned long address)
++{
++      __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
++}
++
++/*
++ * Unresolvable fault with current->mm->mmap_sem held for reading:
++ * drops the semaphore, then continues as __bad_area_nosemaphore()
++ * with the given @si_code.
++ */
++static void
++__bad_area(struct pt_regs *regs, unsigned long error_code,
++         unsigned long address, int si_code)
++{
++      /*
++       * The access hit memory that is not in our memory map.  Release
++       * the lock first; the kernel/user distinction is made by the
++       * callee.
++       */
++      up_read(&current->mm->mmap_sem);
++      __bad_area_nosemaphore(regs, error_code, address, si_code);
++}
++
++/*
++ * Unresolvable fault with mmap_sem held: forwards to __bad_area()
++ * (which releases the semaphore) with si_code SEGV_MAPERR.
++ */
++static noinline void
++bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
++{
++      __bad_area(regs, error_code, address, SEGV_MAPERR);
++}
++
++/*
++ * Access-permission fault with mmap_sem held: forwards to __bad_area()
++ * (which releases the semaphore) with si_code SEGV_ACCERR.
++ */
++static noinline void
++bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
++                    unsigned long address)
++{
++      __bad_area(regs, error_code, address, SEGV_ACCERR);
++}
++
++/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
++/*
++ * Out-of-memory handling for a fault; called with current->mm->mmap_sem
++ * held for reading, which is released here.  @regs, @error_code and
++ * @address are not used by the body.
++ */
++static void
++out_of_memory(struct pt_regs *regs, unsigned long error_code,
++            unsigned long address)
++{
++      /*
++       * We ran out of memory, call the OOM killer, and return to userspace
++       * (which will retry the fault, or kill us if we got oom-killed):
++       */
++      up_read(&current->mm->mmap_sem);
++
++      pagefault_out_of_memory();
++}
++
++/*
++ * Deliver SIGBUS for a fault that handle_mm_fault() could not satisfy.
++ *
++ * Called with the current mm's mmap_sem held for reading; it is
++ * released first.  Kernel-mode faults are passed to no_context().  For
++ * user-mode faults not claimed by the prefetch check, the fault is
++ * recorded in the thread struct and SIGBUS is sent with BUS_ADRERR, or
++ * with BUS_MCEERR_AR when CONFIG_MEMORY_FAILURE is set and @fault
++ * carries one of the HWPOISON bits.
++ */
++static void
++do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
++        unsigned int fault)
++{
++      struct task_struct *task = current;
++      int si_code = BUS_ADRERR;
++
++      up_read(&task->mm->mmap_sem);
++
++      /* Kernel mode? Handle exceptions or die: */
++      if (!(error_code & PF_USER)) {
++              no_context(regs, error_code, address);
++              return;
++      }
++
++      /* User-space => ok to do another page fault: */
++      if (is_prefetch(regs, error_code, address))
++              return;
++
++      task->thread.cr2 = address;
++      task->thread.error_code = error_code;
++      task->thread.trap_no = 14;
++
++#ifdef CONFIG_MEMORY_FAILURE
++      if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
++              printk(KERN_ERR
++      "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
++                      task->comm, task->pid, address);
++              si_code = BUS_MCEERR_AR;
++      }
++#endif
++      force_sig_info_fault(SIGBUS, si_code, address, task, fault);
++}
++
++/*
++ * Dispatch on the error bits returned by handle_mm_fault().
++ *
++ * VM_FAULT_OOM: kernel-mode faults release mmap_sem and go to
++ * no_context(); user-mode faults go to out_of_memory().
++ * VM_FAULT_SIGBUS / VM_FAULT_HWPOISON / VM_FAULT_HWPOISON_LARGE: handled
++ * by do_sigbus().  Any other value is a BUG().
++ */
++static noinline void
++mm_fault_error(struct pt_regs *regs, unsigned long error_code,
++             unsigned long address, unsigned int fault)
++{
++      if (fault & VM_FAULT_OOM) {
++              if (error_code & PF_USER) {
++                      out_of_memory(regs, error_code, address);
++                      return;
++              }
++
++              /* Kernel mode? Handle exceptions or die: */
++              up_read(&current->mm->mmap_sem);
++              no_context(regs, error_code, address);
++              return;
++      }
++
++      if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
++                   VM_FAULT_HWPOISON_LARGE))
++              do_sigbus(regs, error_code, address, fault);
++      else
++              BUG();
++}
++
++/*
++ * Check whether the entry at @pte permits the access described by
++ * @error_code.  Returns 0 if the fault was a write and the entry is not
++ * writable, or an instruction fetch and the entry is not executable;
++ * returns 1 otherwise.
++ */
++static int spurious_fault_check(unsigned long error_code, pte_t *pte)
++{
++      if ((error_code & PF_WRITE) && !pte_write(*pte))
++              return 0;
++
++      if (!(error_code & PF_INSTR))
++              return 1;
++
++      return pte_exec(*pte) ? 1 : 0;
++}
++
++/*
++ * Handle a spurious fault caused by a stale TLB entry.
++ *
++ * This allows us to lazily refresh the TLB when increasing the
++ * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
++ * eagerly is very expensive since that implies doing a full
++ * cross-processor TLB flush, even if no stale TLB entries exist
++ * on other processors.
++ *
++ * There are no security implications to leaving a stale TLB when
++ * increasing the permissions on a page.
++ *
++ * The walk starts at init_mm.pgd.  Returns non-zero when the entry
++ * mapping @address is present and already permits the access described
++ * by @error_code (i.e. the fault can simply be retried), 0 otherwise.
++ */
++static noinline __kprobes int
++spurious_fault(unsigned long error_code, unsigned long address)
++{
++      pgd_t *pgd;
++      pud_t *pud;
++      pmd_t *pmd;
++      pte_t *pte;
++      int ret;
++
++      /* Reserved-bit violation or user access to kernel space? */
++      if (error_code & (PF_USER | PF_RSVD))
++              return 0;
++
++      pgd = init_mm.pgd + pgd_index(address);
++      if (!pgd_present(*pgd))
++              return 0;
++
++      pud = pud_offset(pgd, address);
++      if (!pud_present(*pud))
++              return 0;
++
++      /* A large mapping at PUD level is checked like a PTE: */
++      if (pud_large(*pud))
++              return spurious_fault_check(error_code, (pte_t *) pud);
++
++      pmd = pmd_offset(pud, address);
++      if (!pmd_present(*pmd))
++              return 0;
++
++      /* Likewise for a large mapping at PMD level: */
++      if (pmd_large(*pmd))
++              return spurious_fault_check(error_code, (pte_t *) pmd);
++
++      /*
++       * Note: don't use pte_present() here, since it returns true
++       * if the _PAGE_PROTNONE bit is set.  However, this aliases the
++       * _PAGE_GLOBAL bit, which for kernel pages give false positives
++       * when CONFIG_DEBUG_PAGEALLOC is used.
++       */
++      pte = pte_offset_kernel(pmd, address);
++      if (!(pte_flags(*pte) & _PAGE_PRESENT))
++              return 0;
++
++      ret = spurious_fault_check(error_code, pte);
++      if (!ret)
++              return 0;
++
++      /*
++       * Make sure we have permissions in PMD.
++       * If not, then there's a bug in the page tables:
++       */
++      ret = spurious_fault_check(error_code, (pte_t *) pmd);
++      WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
++
++      return ret;
++}
++
++/*
++ * When non-zero, __bad_area_nosemaphore() logs user-mode segfaults via
++ * show_signal_msg().  Initialised to 1.
++ */
++int show_unhandled_signals = 1;
++
++/*
++ * Decide whether the access described by @error_code is forbidden by
++ * the flags of @vma.  Returns 1 for a forbidden access, 0 otherwise.
++ *
++ *  - write (present or not): allowed only if the vma has VM_WRITE
++ *  - read of a present page (PF_PROT): always an error
++ *  - read of a non-present page: allowed if the vma has any of
++ *    VM_READ, VM_EXEC or VM_WRITE
++ */
++static inline int
++access_error(unsigned long error_code, struct vm_area_struct *vma)
++{
++      if (error_code & PF_WRITE)
++              return unlikely(!(vma->vm_flags & VM_WRITE)) ? 1 : 0;
++
++      if (unlikely(error_code & PF_PROT))
++              return 1;
++
++      if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
++              return 1;
++
++      return 0;
++}
++
++/*
++ * Returns non-zero when @address is at or above TASK_SIZE_MAX; such
++ * faults take the kernel-space path in do_page_fault().
++ */
++static int fault_in_kernel_space(unsigned long address)
++{
++      return address >= TASK_SIZE_MAX;
++}
++
++/*
++ * This routine handles page faults.  It determines the address,
++ * and the problem, and then passes it off to one of the appropriate
++ * routines.
++ *
++ * @regs:       register state at the time of the fault
++ * @error_code: page-fault error code; its PF_USER bit is overwritten
++ *              below from user_mode_vm(regs)
++ *
++ * Faults at or above TASK_SIZE_MAX are handled first, without taking
++ * mmap_sem.  All other faults are resolved against current->mm via
++ * find_vma() and handle_mm_fault(), with at most one retry when
++ * handle_mm_fault() returns VM_FAULT_RETRY.
++ */
++dotraplinkage void __kprobes
++do_page_fault(struct pt_regs *regs, unsigned long error_code)
++{
++      struct vm_area_struct *vma;
++      struct task_struct *tsk;
++      unsigned long address;
++      struct mm_struct *mm;
++      int fault;
++      int write = error_code & PF_WRITE;
++      unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
++                                      (write ? FAULT_FLAG_WRITE : 0);
++
++      /* Set the "privileged fault" bit to something sane. */
++      if (user_mode_vm(regs))
++              error_code |= PF_USER;
++      else
++              error_code &= ~PF_USER;
++
++      tsk = current;
++      mm = tsk->mm;
++
++      /* Get the faulting address: */
++      address = read_cr2();
++
++      /*
++       * Detect and handle instructions that would cause a page fault for
++       * both a tracked kernel page and a userspace page.
++       */
++      if (kmemcheck_active(regs))
++              kmemcheck_hide(regs);
++      prefetchw(&mm->mmap_sem);
++
++      if (unlikely(kmmio_fault(regs, address)))
++              return;
++
++      /*
++       * We fault-in kernel-space virtual memory on-demand. The
++       * 'reference' page table is init_mm.pgd.
++       *
++       * NOTE! We MUST NOT take any locks for this case. We may
++       * be in an interrupt or a critical region, and should
++       * only copy the information from the master page table,
++       * nothing more.
++       *
++       * This verifies that the fault happens in kernel space
++       * (error_code & 4) == 0, and that the fault was not a
++       * protection error (error_code & 9) == 0.
++       */
++      if (unlikely(fault_in_kernel_space(address))) {
++              /* Faults in hypervisor area can never be patched up. */
++              /*
++               * NOTE(review): the closing brace after the return below is
++               * only balanced when one of CONFIG_X86_XEN or
++               * CONFIG_X86_64_XEN is defined - presumably always true for
++               * this file; confirm against the build configuration.
++               */
++#if defined(CONFIG_X86_XEN)
++              if (address >= hypervisor_virt_start) {
++#elif defined(CONFIG_X86_64_XEN)
++              if (address >= HYPERVISOR_VIRT_START
++                  && address < HYPERVISOR_VIRT_END) {
++#endif
++                      bad_area_nosemaphore(regs, error_code, address);
++                      return;
++              }
++
++              if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
++                      if (vmalloc_fault(address) >= 0)
++                              return;
++
++                      if (kmemcheck_fault(regs, address, error_code))
++                              return;
++              }
++
++              /* Can handle a stale RO->RW TLB: */
++              if (spurious_fault(error_code, address))
++                      return;
++
++              /* kprobes don't want to hook the spurious faults: */
++              if (notify_page_fault(regs))
++                      return;
++              /*
++               * Don't take the mm semaphore here. If we fixup a prefetch
++               * fault we could otherwise deadlock:
++               */
++              bad_area_nosemaphore(regs, error_code, address);
++
++              return;
++      }
++
++      /* kprobes don't want to hook the spurious faults: */
++      if (unlikely(notify_page_fault(regs)))
++              return;
++      /*
++       * It's safe to allow irq's after cr2 has been saved and the
++       * vmalloc fault has been handled.
++       *
++       * User-mode registers count as a user access even for any
++       * potential system fault or CPU buglet:
++       */
++      if (user_mode_vm(regs)) {
++              local_irq_enable();
++              error_code |= PF_USER;
++      } else {
++              if (regs->flags & X86_EFLAGS_IF)
++                      local_irq_enable();
++      }
++
++      /*
++       * NOTE(review): there is no return after pgtable_bad(); if it ever
++       * returns, execution continues below - confirm this is intended.
++       */
++      if (unlikely(error_code & PF_RSVD))
++              pgtable_bad(regs, error_code, address);
++
++      perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
++
++      /*
++       * If we're in an interrupt, have no user context or are running
++       * in an atomic region then we must not take the fault:
++       */
++      if (unlikely(in_atomic() || !mm)) {
++              bad_area_nosemaphore(regs, error_code, address);
++              return;
++      }
++
++      /*
++       * When running in the kernel we expect faults to occur only to
++       * addresses in user space.  All other faults represent errors in
++       * the kernel and should generate an OOPS.  Unfortunately, in the
++       * case of an erroneous fault occurring in a code path which already
++       * holds mmap_sem we will deadlock attempting to validate the fault
++       * against the address space.  Luckily the kernel only validly
++       * references user space from well defined areas of code, which are
++       * listed in the exceptions table.
++       *
++       * As the vast majority of faults will be valid we will only perform
++       * the source reference check when there is a possibility of a
++       * deadlock. Attempt to lock the address space, if we cannot we then
++       * validate the source. If this is invalid we can skip the address
++       * space check, thus avoiding the deadlock:
++       */
++      if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
++              if ((error_code & PF_USER) == 0 &&
++                  !search_exception_tables(regs->ip)) {
++                      bad_area_nosemaphore(regs, error_code, address);
++                      return;
++              }
++              /* Also the re-entry point after VM_FAULT_RETRY (see below). */
++retry:
++              down_read(&mm->mmap_sem);
++      } else {
++              /*
++               * The above down_read_trylock() might have succeeded in
++               * which case we'll have missed the might_sleep() from
++               * down_read():
++               */
++              might_sleep();
++      }
++
++      vma = find_vma(mm, address);
++      if (unlikely(!vma)) {
++              bad_area(regs, error_code, address);
++              return;
++      }
++      if (likely(vma->vm_start <= address))
++              goto good_area;
++      if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
++              bad_area(regs, error_code, address);
++              return;
++      }
++      if (error_code & PF_USER) {
++              /*
++               * Accessing the stack below %sp is always a bug.
++               * The large cushion allows instructions like enter
++               * and pusha to work. ("enter $65535, $31" pushes
++               * 32 pointers and then decrements %sp by 65535.)
++               */
++              if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
++                      bad_area(regs, error_code, address);
++                      return;
++              }
++      }
++      if (unlikely(expand_stack(vma, address))) {
++              bad_area(regs, error_code, address);
++              return;
++      }
++
++      /*
++       * Ok, we have a good vm_area for this memory access, so
++       * we can handle it..
++       */
++good_area:
++      if (unlikely(access_error(error_code, vma))) {
++              bad_area_access_error(regs, error_code, address);
++              return;
++      }
++
++      /*
++       * If for any reason at all we couldn't handle the fault,
++       * make sure we exit gracefully rather than endlessly redo
++       * the fault:
++       */
++      fault = handle_mm_fault(mm, vma, address, flags);
++
++      if (unlikely(fault & VM_FAULT_ERROR)) {
++              mm_fault_error(regs, error_code, address, fault);
++              return;
++      }
++
++      /*
++       * Major/minor page fault accounting is only done on the
++       * initial attempt. If we go through a retry, it is extremely
++       * likely that the page will be found in page cache at that point.
++       */
++      if (flags & FAULT_FLAG_ALLOW_RETRY) {
++              if (fault & VM_FAULT_MAJOR) {
++                      tsk->maj_flt++;
++                      perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
++                                    regs, address);
++              } else {
++                      tsk->min_flt++;
++                      perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
++                                    regs, address);
++              }
++              if (fault & VM_FAULT_RETRY) {
++                      /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
++                       * of starvation. */
++                      flags &= ~FAULT_FLAG_ALLOW_RETRY;
++                      goto retry;
++              }
++      }
++
++      check_v8086_mode(regs, address, tsk);
++
++      up_read(&mm->mmap_sem);
++}
diff --cc arch/x86/mm/highmem_32-xen.c

index 0000000,0000000..ee75cf8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/highmem_32-xen.c
@@@ -1,0 -1,0 +1,194 @@@
++#include <linux/highmem.h>
++#include <linux/module.h>
++#include <linux/swap.h> /* for totalram_pages */
++
++/*
++ * Map @page into kernel virtual address space and return its address.
++ * May sleep.  Pages that are not highmem are returned via
++ * page_address(); highmem pages go through kmap_high().
++ */
++void *kmap(struct page *page)
++{
++      might_sleep();
++      return PageHighMem(page) ? kmap_high(page) : page_address(page);
++}
++EXPORT_SYMBOL(kmap);
++
++/*
++ * Release a mapping for @page.  Must not be called from interrupt
++ * context (BUG otherwise).  Only highmem pages need any work, which is
++ * done by kunmap_high().
++ */
++void kunmap(struct page *page)
++{
++      BUG_ON(in_interrupt());
++
++      if (PageHighMem(page))
++              kunmap_high(page);
++}
++EXPORT_SYMBOL(kunmap);
++
++/*
++ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
++ * no global lock is needed and because the kmap code must perform a global TLB
++ * invalidation when the kmap pool wraps.
++ *
++ * However when holding an atomic kmap it is not legal to sleep, so atomic
++ * kmaps are appropriate for short, tight code paths only.
++ *
++ * Maps @page with protection @prot and returns its virtual address.
++ * Page faults are disabled in every case, including for pages that are
++ * not highmem (which are returned via page_address()).  Highmem pages
++ * get a fixmap slot chosen from kmap_atomic_idx_push() and the current
++ * CPU number; the slot's PTE must be empty (BUG otherwise).
++ */
++void *kmap_atomic_prot(struct page *page, pgprot_t prot)
++{
++      unsigned long vaddr;
++      int idx, type;
++
++      /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
++      pagefault_disable();
++
++      if (!PageHighMem(page))
++              return page_address(page);
++
++      /* Slot index: per-CPU window of KM_TYPE_NR entries plus the pushed index */
++      type = kmap_atomic_idx_push();
++      idx = type + KM_TYPE_NR*smp_processor_id();
++      vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
++      BUG_ON(!pte_none(*(kmap_pte-idx)));
++      set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
++
++      return (void *)vaddr;
++}
++EXPORT_SYMBOL(kmap_atomic_prot);
++
++/* Atomic kmap of @page with the default kmap_prot protection. */
++void *__kmap_atomic(struct page *page)
++{
++      return kmap_atomic_prot(page, kmap_prot);
++}
++EXPORT_SYMBOL(__kmap_atomic);
++
++/*
++ * This is the same as kmap_atomic() but can map memory that doesn't
++ * have a struct page associated with it.
++ *
++ * Forwards to kmap_atomic_prot_pfn() (not defined in this file) with
++ * the default kmap_prot protection.
++ */
++void *kmap_atomic_pfn(unsigned long pfn)
++{
++      return kmap_atomic_prot_pfn(pfn, kmap_prot);
++}
++EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
++
++/*
++ * Undo an atomic kmap of @kvaddr and re-enable page faults.
++ *
++ * For addresses inside the fixmap kmap window, the slot to clear is
++ * derived from kmap_atomic_idx() and the current CPU number, not from
++ * @kvaddr itself; with CONFIG_DEBUG_HIGHMEM a mismatch between the two
++ * triggers a one-time warning.  Addresses outside the window need no
++ * unmapping; with CONFIG_DEBUG_HIGHMEM they are checked to lie between
++ * PAGE_OFFSET and high_memory.
++ */
++void __kunmap_atomic(void *kvaddr)
++{
++      unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
++
++      if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
++          vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
++              int idx, type;
++
++              type = kmap_atomic_idx();
++              idx = type + KM_TYPE_NR * smp_processor_id();
++
++#ifdef CONFIG_DEBUG_HIGHMEM
++              WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
++#endif
++              /*
++               * Force other mappings to Oops if they try to access this
++               * pte without first remapping it.  Keeping stale mappings
++               * around is a bad idea also, in case the page changes
++               * cacheability attributes or becomes a protected page in a
++               * hypervisor.
++               */
++              kpte_clear_flush(kmap_pte-idx, vaddr);
++              kmap_atomic_idx_pop();
++      }
++#ifdef CONFIG_DEBUG_HIGHMEM
++      else {
++              BUG_ON(vaddr < PAGE_OFFSET);
++              BUG_ON(vaddr >= (unsigned long)high_memory);
++      }
++#endif
++
++      pagefault_enable();
++}
++EXPORT_SYMBOL(__kunmap_atomic);
++
++/*
++ * Translate a kernel virtual address to its struct page.  Addresses
++ * below FIXADDR_START are resolved with virt_to_page(); anything at or
++ * above it is treated as an atomic-kmap fixmap address and looked up
++ * through the corresponding kmap PTE.
++ */
++struct page *kmap_atomic_to_page(void *ptr)
++{
++      unsigned long addr = (unsigned long)ptr;
++      unsigned long fix_idx;
++      pte_t *slot;
++
++      if (addr < FIXADDR_START)
++              return virt_to_page(ptr);
++
++      fix_idx = virt_to_fix(addr);
++      slot = kmap_pte - (fix_idx - FIX_KMAP_BEGIN);
++
++      return pte_page(*slot);
++}
++EXPORT_SYMBOL(kmap_atomic_to_page);
++
++/*
++ * Zero the contents of @page.
++ *
++ * For highmem pages, when the XENFEAT_highmem_assist feature is
++ * available, the hypervisor is asked to clear the page
++ * (MMUEXT_CLEAR_PAGE).  If that is not applicable or the hypercall
++ * does not return 0, the page is cleared through a temporary atomic
++ * kmap.
++ */
++void clear_highpage(struct page *page)
++{
++      void *vaddr;
++
++      if (likely(xen_feature(XENFEAT_highmem_assist))
++          && PageHighMem(page)) {
++              struct mmuext_op op;
++
++              op.cmd = MMUEXT_CLEAR_PAGE;
++              op.arg1.mfn = pfn_to_mfn(page_to_pfn(page));
++              if (!HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
++                      return;
++      }
++
++      vaddr = kmap_atomic(page, KM_USER0);
++      clear_page(vaddr);
++      kunmap_atomic(vaddr, KM_USER0);
++}
++EXPORT_SYMBOL(clear_highpage);
++
++/*
++ * Copy the contents of page @from into page @to.
++ *
++ * When either page is highmem and the XENFEAT_highmem_assist feature
++ * is available, the hypervisor is asked to perform the copy
++ * (MMUEXT_COPY_PAGE).  The hypercall is only attempted if both machine
++ * frame numbers translate back to the original pfns.  If it is not
++ * applicable or does not return 0, the copy is done through two
++ * temporary atomic kmaps.
++ */
++void copy_highpage(struct page *to, struct page *from)
++{
++      void *vfrom, *vto;
++
++      if (likely(xen_feature(XENFEAT_highmem_assist))
++          && (PageHighMem(from) || PageHighMem(to))) {
++              unsigned long from_pfn = page_to_pfn(from);
++              unsigned long to_pfn = page_to_pfn(to);
++              struct mmuext_op meo;
++
++              meo.cmd = MMUEXT_COPY_PAGE;
++              meo.arg1.mfn = pfn_to_mfn(to_pfn);
++              meo.arg2.src_mfn = pfn_to_mfn(from_pfn);
++              if (mfn_to_pfn(meo.arg2.src_mfn) == from_pfn
++                  && mfn_to_pfn(meo.arg1.mfn) == to_pfn
++                  && HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
++                      return;
++      }
++
++      vfrom = kmap_atomic(from, KM_USER0);
++      vto = kmap_atomic(to, KM_USER1);
++      copy_page(vto, vfrom);
++      /*
++       * Release in reverse order of mapping: __kunmap_atomic() picks the
++       * slot to clear from kmap_atomic_idx(), not from the address, so
++       * out-of-order unmapping clears the other mapping's PTE and trips
++       * the CONFIG_DEBUG_HIGHMEM address check.
++       */
++      kunmap_atomic(vto, KM_USER1);
++      kunmap_atomic(vfrom, KM_USER0);
++}
++EXPORT_SYMBOL(copy_highpage);
++
++/*
++ * Boot-time initialisation of highmem pages.
++ *
++ * For every highmem zone: log its pfn range, register the range with
++ * add_highpages_with_active_regions(), and then - Xen specific - clear
++ * the reserved flag and initialise the page count of every page in the
++ * zone whose pfn is at or beyond xen_start_info->nr_pages.  Finally
++ * totalhigh_pages is added to totalram_pages.
++ */
++void __init set_highmem_pages_init(void)
++{
++      struct zone *zone;
++
++      for_each_zone(zone) {
++              unsigned long start_pfn, end_pfn, pfn;
++              int node;
++
++              if (!is_highmem(zone))
++                      continue;
++
++              start_pfn = zone->zone_start_pfn;
++              end_pfn = start_pfn + zone->spanned_pages;
++              node = zone_to_nid(zone);
++
++              printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
++                              zone->name, node, start_pfn, end_pfn);
++
++              add_highpages_with_active_regions(node, start_pfn, end_pfn);
++
++              /* XEN: init high-mem pages outside initial allocation. */
++              pfn = start_pfn;
++              if (pfn < xen_start_info->nr_pages)
++                      pfn = xen_start_info->nr_pages;
++              for (; pfn < end_pfn; pfn++) {
++                      struct page *pg = pfn_to_page(pfn);
++
++                      ClearPageReserved(pg);
++                      init_page_count(pg);
++              }
++      }
++      totalram_pages += totalhigh_pages;
++}
diff --cc arch/x86/mm/hypervisor.c

index 0000000,0000000..53f12b7

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/hypervisor.c
@@@ -1,0 -1,0 +1,1240 @@@
++/******************************************************************************
++ * mm/hypervisor.c
++ * 
++ * Update page tables via the hypervisor.
++ * 
++ * Copyright (c) 2002-2004, K A Fraser
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/sched.h>
++#include <linux/hardirq.h>
++#include <linux/mm.h>
++#include <linux/vmalloc.h>
++#include <asm/page.h>
++#include <asm/pgtable.h>
++#include <asm/setup.h>
++#include <asm/hypervisor.h>
++#include <xen/balloon.h>
++#include <xen/features.h>
++#include <xen/interface/memory.h>
++#include <xen/interface/vcpu.h>
++#include <linux/module.h>
++#include <linux/percpu.h>
++#include <asm/tlbflush.h>
++#include <linux/highmem.h>
++#ifdef CONFIG_X86_32
++#include <linux/bootmem.h> /* for max_pfn */
++#endif
++
++EXPORT_SYMBOL(hypercall_page);
++
++/*
++ * Pointer to the shared info page.  It initially points at
++ * empty_zero_page.  It is exported only when
++ * CONFIG_XEN_VCPU_INFO_PLACEMENT is not set; otherwise a per-CPU
++ * vcpu_info, aligned to its own size, is defined and exported instead.
++ */
++shared_info_t *__read_mostly HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
++#ifndef CONFIG_XEN_VCPU_INFO_PLACEMENT
++EXPORT_SYMBOL(HYPERVISOR_shared_info);
++#else
++DEFINE_PER_CPU(struct vcpu_info, vcpu_info) __aligned(sizeof(struct vcpu_info));
++EXPORT_PER_CPU_SYMBOL(vcpu_info);
++
++/*
++ * Register the per-CPU vcpu_info of @cpu with the hypervisor through
++ * VCPUOP_register_vcpu_info, passing the machine frame number and the
++ * in-page offset of the structure.
++ *
++ * On x86-64 the very first call translates the address with
++ * early_arbitrary_virt_to_mfn(); every other call (and every call on
++ * other configurations) uses arbitrary_virt_to_mfn().
++ *
++ * BUG()s if the hypercall returns non-zero.
++ */
++void __ref setup_vcpu_info(unsigned int cpu)
++{
++      struct vcpu_info *v = &per_cpu(vcpu_info, cpu);
++      struct vcpu_register_vcpu_info info;
++#ifdef CONFIG_X86_64
++      static bool first = true;
++
++      if (first) {
++              first = false;
++              info.mfn = early_arbitrary_virt_to_mfn(v);
++      } else
++#endif
++              /* Body of the "else" above on x86-64, unconditional otherwise. */
++              info.mfn = arbitrary_virt_to_mfn(v);
++      info.offset = offset_in_page(v);
++
++      if (HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info))
++              BUG();
++}
++
++/*
++ * Boot-time fixup for the boot CPU's vcpu_info.
++ *
++ * Looks up the PTEs of two pages: the one holding vcpu_info inside the
++ * per-CPU load image ("l" variables) and the one holding CPU 0's per-CPU
++ * copy ("r" variables).  If they are backed by different machine frames,
++ * the frames are swapped: both PTEs, the direct-mapping entries, the
++ * phys-to-machine entries and the machine-to-phys entries are updated.
++ * Afterwards the contents of the page surrounding vcpu_info are copied
++ * from the "l" page to the "r" page.
++ *
++ * Any failing hypercall or missing/non-present PTE results in BUG().
++ */
++void __init adjust_boot_vcpu_info(void)
++{
++      unsigned long lpfn, rpfn, lmfn, rmfn;
++      pte_t *lpte, *rpte;
++      unsigned int level;
++      mmu_update_t mmu[2];
++
++      /*
++       * setup_vcpu_info() cannot be used more than once for a given (v)CPU,
++       * hence we must swap the underlying MFNs of the two pages holding old
++       * and new vcpu_info of the boot CPU.
++       *
++       * Do *not* use __get_cpu_var() or percpu_{write,...}() here, as the per-
++       * CPU segment didn't get reloaded yet. Using percpu_read(), as in
++       * arch_use_lazy_mmu_mode(), though undesirable, is safe except for the
++       * accesses to variables that were updated in setup_percpu_areas().
++       */
++      lpte = lookup_address((unsigned long)&vcpu_info
++                            + (__per_cpu_load - __per_cpu_start),
++                            &level);
++      rpte = lookup_address((unsigned long)&per_cpu(vcpu_info, 0), &level);
++      BUG_ON(!lpte || !(pte_flags(*lpte) & _PAGE_PRESENT));
++      BUG_ON(!rpte || !(pte_flags(*rpte) & _PAGE_PRESENT));
++      lmfn = __pte_mfn(*lpte);
++      rmfn = __pte_mfn(*rpte);
++
++      /* Both copies already share one machine frame: nothing to swap. */
++      if (lmfn == rmfn)
++              return;
++
++      lpfn = mfn_to_local_pfn(lmfn);
++      rpfn = mfn_to_local_pfn(rmfn);
++
++      pr_info("Swapping MFNs for PFN %lx and %lx (MFN %lx and %lx)\n",
++              lpfn, rpfn, lmfn, rmfn);
++
++      /* Swap the frames in the two looked-up PTEs, keeping their protections. */
++      xen_l1_entry_update(lpte, pfn_pte_ma(rmfn, pte_pgprot(*lpte)));
++      xen_l1_entry_update(rpte, pfn_pte_ma(lmfn, pte_pgprot(*rpte)));
++#ifdef CONFIG_X86_64
++      if (HYPERVISOR_update_va_mapping((unsigned long)__va(lpfn<<PAGE_SHIFT),
++                                       pfn_pte_ma(rmfn, PAGE_KERNEL_RO), 0))
++              BUG();
++#endif
++      if (HYPERVISOR_update_va_mapping((unsigned long)__va(rpfn<<PAGE_SHIFT),
++                                       pfn_pte_ma(lmfn, PAGE_KERNEL),
++                                       UVMF_TLB_FLUSH))
++              BUG();
++
++      /* Record the swap in the phys-to-machine table ... */
++      set_phys_to_machine(lpfn, rmfn);
++      set_phys_to_machine(rpfn, lmfn);
++
++      /* ... and in the machine-to-phys table via MMU_MACHPHYS_UPDATE. */
++      mmu[0].ptr = ((uint64_t)lmfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
++      mmu[0].val = rpfn;
++      mmu[1].ptr = ((uint64_t)rmfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
++      mmu[1].val = lpfn;
++      if (HYPERVISOR_mmu_update(mmu, 2, NULL, DOMID_SELF))
++              BUG();
++
++      /*
++       * Copy over all contents of the page just replaced, except for the
++       * vcpu_info itself, as it may have got updated after having been
++       * copied from __per_cpu_load[].
++       */
++      memcpy(__va(rpfn << PAGE_SHIFT),
++             __va(lpfn << PAGE_SHIFT),
++             (unsigned long)&vcpu_info & (PAGE_SIZE - 1));
++      /* "level" is reused as the in-page offset just past vcpu_info. */
++      level = (unsigned long)(&vcpu_info + 1) & (PAGE_SIZE - 1);
++      if (level)
++              memcpy(__va(rpfn << PAGE_SHIFT) + level,
++                     __va(lpfn << PAGE_SHIFT) + level,
++                     PAGE_SIZE - level);
++}
++#endif
++
++/* Capacities of the three per-CPU arrays in struct lazy_mmu below. */
++#define NR_MC     BITS_PER_LONG
++#define NR_MMU    BITS_PER_LONG
++#define NR_MMUEXT (BITS_PER_LONG / 4)
++
++DEFINE_PER_CPU(bool, xen_lazy_mmu);
++/*
++ * Per-CPU queue of deferred hypercalls.
++ *
++ * nr_mc, nr_mmu and nr_mmuext count the entries currently in use in
++ * mc[], mmu[] and mmuext[] respectively.  mc[] holds the multicall
++ * entries themselves; mmu[] and mmuext[] hold the request arrays that
++ * queued mmu_update / mmuext_op multicall entries point at.
++ */
++struct lazy_mmu {
++      unsigned int nr_mc, nr_mmu, nr_mmuext;
++      multicall_entry_t mc[NR_MC];
++      mmu_update_t mmu[NR_MMU];
++      struct mmuext_op mmuext[NR_MMUEXT];
++};
++static DEFINE_PER_CPU(struct lazy_mmu, lazy_mmu);
++
++/*
++ * Decide whether hypercalls may be queued rather than issued at once.
++ * Returns false when CONFIG_PREEMPT is enabled and preempt_count() is
++ * zero, and false whenever irq_count() is non-zero; true otherwise.
++ */
++static inline bool use_lazy_mmu_mode(void)
++{
++#ifdef CONFIG_PREEMPT
++      if (preempt_count() == 0)
++              return false;
++#endif
++      if (irq_count())
++              return false;
++
++      return true;
++}
++
++/*
++ * Report a failed queued hypercall and BUG().
++ *
++ * @mc: the multicall entry that failed; its op number, first four
++ *      arguments and args[5] (logged as "caller") are printed.
++ * @rc: the error code to log.
++ */
++static void multicall_failed(const multicall_entry_t *mc, int rc)
++{
++      pr_emerg("hypercall#%lu(%lx, %lx, %lx, %lx) failed: %d"
++               " (caller %lx)\n",
++             mc->op, mc->args[0], mc->args[1], mc->args[2], mc->args[3],
++             rc, mc->args[5]);
++      BUG();
++}
++
++/*
++ * Issue all hypercalls queued on this CPU and reset the queue counters.
++ *
++ * @ret_last: when true, the result of the last queued entry is returned
++ *            to the caller instead of being treated as fatal.
++ *
++ * A single queued entry is issued as a plain hypercall; two or more go
++ * through HYPERVISOR_multicall().  Every entry whose result is non-zero
++ * (other than the last one when @ret_last is set) ends in
++ * multicall_failed(), i.e. BUG().
++ *
++ * Returns 0 if nothing was queued or @ret_last is false; otherwise the
++ * result of the last entry.
++ */
++static int _xen_multicall_flush(bool ret_last) {
++      struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
++      multicall_entry_t *mc = lazy->mc;
++      unsigned int count = lazy->nr_mc;
++
++      if (!count)
++              return 0;
++
++      /* The counters are reset before the hypercall(s) are issued. */
++      lazy->nr_mc = 0;
++      lazy->nr_mmu = 0;
++      lazy->nr_mmuext = 0;
++
++      if (count == 1) {
++              int rc = _hypercall(int, mc->op, mc->args[0], mc->args[1],
++                                  mc->args[2], mc->args[3], mc->args[4]);
++
++              if (unlikely(rc)) {
++                      if (ret_last)
++                              return rc;
++                      multicall_failed(mc, rc);
++              }
++      } else {
++              if (HYPERVISOR_multicall(mc, count))
++                      BUG();
++              /*
++               * Check every entry, or every entry but the last when
++               * ret_last is set; mc ends up pointing at the first
++               * unchecked entry.
++               */
++              while (count-- > ret_last)
++                      if (unlikely(mc++->result))
++                              multicall_failed(mc - 1, mc[-1].result);
++              if (ret_last)
++                      return mc->result;
++      }
++
++      return 0;
++}
++
++/*
++ * Flush this CPU's queued hypercalls, but only when use_lazy_mmu_mode()
++ * says queuing applies in the current context; otherwise do nothing.
++ */
++void xen_multicall_flush(void)
++{
++      if (!use_lazy_mmu_mode())
++              return;
++
++      _xen_multicall_flush(false);
++}
++
++/*
++ * Set the PTE mapping @va to @pte with flush flags @uvmf.
++ *
++ * Outside lazy mode the update_va_mapping hypercall is issued at once
++ * and its result returned.  Otherwise the request is appended to this
++ * CPU's multicall queue (flushing the queue first if it is full) and 0
++ * is returned.
++ */
++int xen_multi_update_va_mapping(unsigned long va, pte_t pte,
++                              unsigned long uvmf)
++{
++      struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
++      multicall_entry_t *mc;
++
++      if (unlikely(!use_lazy_mmu_mode()))
++#ifdef CONFIG_X86_PAE
++              return _hypercall4(int, update_va_mapping, va,
++                                 pte.pte_low, pte.pte_high, uvmf);
++#else
++              return _hypercall3(int, update_va_mapping, va,
++                                 pte.pte, uvmf);
++#endif
++
++      if (unlikely(lazy->nr_mc == NR_MC))
++              _xen_multicall_flush(false);
++
++      mc = lazy->mc + lazy->nr_mc++;
++      mc->op = __HYPERVISOR_update_va_mapping;
++      mc->args[0] = va;
++#ifndef CONFIG_X86_PAE
++      mc->args[1] = pte.pte;
++#else
++      mc->args[1] = pte.pte_low;
++      mc->args[2] = pte.pte_high;
++#endif
++      mc->args[MULTI_UVMFLAGS_INDEX] = uvmf;
++      /* Caller address; printed by multicall_failed() as "caller". */
++      mc->args[5] = (long)__builtin_return_address(0);
++
++      return 0;
++}
++
++/*
++ * Tell whether a new request for hypercall @op on behalf of @domid may be
++ * appended to the queued multicall entry @mc.  The entry must be for the
++ * same hypercall, have a zero args[2] (the slot the callers in this file
++ * fill with the success_count pointer) and carry @domid in args[3].
++ */
++static inline bool mmu_may_merge(const multicall_entry_t *mc,
++                               unsigned int op, domid_t domid)
++{
++      if (mc->op != op)
++              return false;
++      if (mc->args[2])
++              return false;
++
++      return mc->args[3] == domid;
++}
++
++/*
++ * Issue or queue @count mmu_update requests from @src for @domid.
++ *
++ * Outside lazy mode the hypercall is issued directly.  In lazy mode the
++ * requests are either merged into the immediately preceding queued
++ * mmu_update entry, or appended as a new multicall entry.  The queue is
++ * flushed right away ("commit") when
++ *  - the per-CPU request buffer would overflow,
++ *  - the caller passed a @success_count pointer, or
++ *  - any request's command (low bits of ->ptr) is something other than
++ *    MMU_NORMAL_PT_UPDATE or MMU_PT_UPDATE_PRESERVE_AD.
++ *
++ * Returns the hypercall result when issued directly or committed, and 0
++ * when the requests were merely queued.
++ */
++int xen_multi_mmu_update(mmu_update_t *src, unsigned int count,
++                       unsigned int *success_count, domid_t domid)
++{
++      struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
++      multicall_entry_t *mc = lazy->mc + lazy->nr_mc;
++      mmu_update_t *dst;
++      bool commit, merge;
++
++      if (unlikely(!use_lazy_mmu_mode()))
++              return _hypercall4(int, mmu_update, src, count,
++                                 success_count, domid);
++
++      commit = (lazy->nr_mmu + count) > NR_MMU || success_count;
++      merge = lazy->nr_mc && !commit
++              && mmu_may_merge(mc - 1, __HYPERVISOR_mmu_update, domid);
++      /* Queue full and a new entry is needed: flush, then start over. */
++      if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
++              _xen_multicall_flush(false);
++              mc = lazy->mc;
++              commit = count > NR_MMU || success_count;
++      }
++
++      /* Nothing queued to order against: issue the hypercall directly. */
++      if (!lazy->nr_mc && unlikely(commit))
++              return _hypercall4(int, mmu_update, src, count,
++                                 success_count, domid);
++
++      dst = lazy->mmu + lazy->nr_mmu;
++      lazy->nr_mmu += count;
++      if (merge) {
++              mc[-1].args[1] += count;
++              memcpy(dst, src, count * sizeof(*src));
++      } else {
++              ++lazy->nr_mc;
++              mc->op = __HYPERVISOR_mmu_update;
++              /*
++               * Deferred requests are copied into the per-CPU buffer;
++               * when committing, the entry points at the caller's array.
++               */
++              if (!commit) {
++                      mc->args[0] = (unsigned long)dst;
++                      memcpy(dst, src, count * sizeof(*src));
++              } else
++                      mc->args[0] = (unsigned long)src;
++              mc->args[1] = count;
++              mc->args[2] = (unsigned long)success_count;
++              mc->args[3] = domid;
++              mc->args[5] = (long)__builtin_return_address(0);
++      }
++
++      /* Any request type not listed below forces an immediate flush. */
++      while (!commit && count--)
++              switch (src++->ptr & (sizeof(pteval_t) - 1)) {
++              case MMU_NORMAL_PT_UPDATE:
++              case MMU_PT_UPDATE_PRESERVE_AD:
++                      break;
++              default:
++                      commit = true;
++                      break;
++              }
++
++      return commit ? _xen_multicall_flush(true) : 0;
++}
++
++/*
++ * Issue or queue @count mmuext_op requests from @src for @domid.
++ *
++ * Outside lazy mode the hypercall is issued directly.  In lazy mode the
++ * requests are either merged into the immediately preceding queued
++ * mmuext_op entry, or appended as a new multicall entry.  The queue is
++ * flushed right away ("commit") when
++ *  - the per-CPU request buffer would overflow,
++ *  - the caller passed a @success_count pointer, or
++ *  - any request's command is not one of the pin/unpin/TLB-flush/INVLPG
++ *    commands listed in the switch at the end of the function.
++ *
++ * Returns the hypercall result when issued directly or committed, and 0
++ * when the requests were merely queued.
++ */
++int xen_multi_mmuext_op(struct mmuext_op *src, unsigned int count,
++                      unsigned int *success_count, domid_t domid)
++{
++      struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
++      multicall_entry_t *mc;
++      struct mmuext_op *dst;
++      bool commit, merge;
++
++      if (unlikely(!use_lazy_mmu_mode()))
++              return _hypercall4(int, mmuext_op, src, count,
++                                 success_count, domid);
++
++      /*
++       * While it could be useful in theory, I've never seen the body of
++       * this conditional to be reached, hence it seems more reasonable
++       * to disable it for the time being.
++       */
++      /*
++       * Disabled by the leading "0 &&".  The body would try to fold a
++       * leading flush request from @src into the flush flags of the
++       * queued update_va_mapping entry that directly precedes it.
++       */
++      if (0 && likely(count)
++          && likely(!success_count)
++          && likely(domid == DOMID_SELF)
++          && likely(lazy->nr_mc)
++          && lazy->mc[lazy->nr_mc - 1].op == __HYPERVISOR_update_va_mapping) {
++              unsigned long oldf, newf = UVMF_NONE;
++
++              switch (src->cmd) {
++              case MMUEXT_TLB_FLUSH_ALL:
++                      newf = UVMF_TLB_FLUSH | UVMF_ALL;
++                      break;
++              case MMUEXT_INVLPG_ALL:
++                      newf = UVMF_INVLPG | UVMF_ALL;
++                      break;
++              case MMUEXT_TLB_FLUSH_MULTI:
++                      newf = UVMF_TLB_FLUSH | UVMF_MULTI
++                             | (unsigned long)src->arg2.vcpumask.p;
++                      break;
++              case MMUEXT_INVLPG_MULTI:
++                      newf = UVMF_INVLPG | UVMF_MULTI
++                             | (unsigned long)src->arg2.vcpumask.p;
++                      break;
++              case MMUEXT_TLB_FLUSH_LOCAL:
++                      newf = UVMF_TLB_FLUSH | UVMF_LOCAL;
++                      break;
++              case MMUEXT_INVLPG_LOCAL:
++                      newf = UVMF_INVLPG | UVMF_LOCAL;
++                      break;
++              }
++              mc = lazy->mc + lazy->nr_mc - 1;
++              oldf = mc->args[MULTI_UVMFLAGS_INDEX];
++              /* newf ends up UVMF_NONE whenever the two cannot be combined. */
++              if (newf == UVMF_NONE || oldf == UVMF_NONE
++                  || newf == (UVMF_TLB_FLUSH | UVMF_ALL))
++                      ;
++              else if (oldf == (UVMF_TLB_FLUSH | UVMF_ALL))
++                      newf = UVMF_TLB_FLUSH | UVMF_ALL;
++              else if ((newf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG
++                       && (oldf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG
++                       && ((src->arg1.linear_addr ^ mc->args[0])
++                           >> PAGE_SHIFT))
++                      newf = UVMF_NONE;
++              else if (((oldf | newf) & UVMF_ALL)
++                       && !((oldf ^ newf) & UVMF_FLUSHTYPE_MASK))
++                      newf |= UVMF_ALL;
++              else if ((oldf ^ newf) & ~UVMF_FLUSHTYPE_MASK)
++                      newf = UVMF_NONE;
++              else if ((oldf & UVMF_FLUSHTYPE_MASK) == UVMF_TLB_FLUSH)
++                      newf = (newf & ~UVMF_FLUSHTYPE_MASK) | UVMF_TLB_FLUSH;
++              else if ((newf & UVMF_FLUSHTYPE_MASK) != UVMF_TLB_FLUSH
++                       && ((newf ^ oldf) & UVMF_FLUSHTYPE_MASK))
++                      newf = UVMF_NONE;
++              if (newf != UVMF_NONE) {
++                      mc->args[MULTI_UVMFLAGS_INDEX] = newf;
++                      ++src;
++                      if (!--count)
++                              return 0;
++              }
++      }
++
++      mc = lazy->mc + lazy->nr_mc;
++      commit = (lazy->nr_mmuext + count) > NR_MMUEXT || success_count;
++      merge = lazy->nr_mc && !commit
++              && mmu_may_merge(mc - 1, __HYPERVISOR_mmuext_op, domid);
++      /* Queue full and a new entry is needed: flush, then start over. */
++      if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
++              _xen_multicall_flush(false);
++              mc = lazy->mc;
++              commit = count > NR_MMUEXT || success_count;
++      }
++
++      /* Nothing queued to order against: issue the hypercall directly. */
++      if (!lazy->nr_mc && unlikely(commit))
++              return _hypercall4(int, mmuext_op, src, count,
++                                 success_count, domid);
++
++      dst = lazy->mmuext + lazy->nr_mmuext;
++      lazy->nr_mmuext += count;
++      if (merge) {
++              mc[-1].args[1] += count;
++              memcpy(dst, src, count * sizeof(*src));
++      } else {
++              ++lazy->nr_mc;
++              mc->op = __HYPERVISOR_mmuext_op;
++              /*
++               * Deferred requests are copied into the per-CPU buffer;
++               * when committing, the entry points at the caller's array.
++               */
++              if (!commit) {
++                      mc->args[0] = (unsigned long)dst;
++                      memcpy(dst, src, count * sizeof(*src));
++              } else
++                      mc->args[0] = (unsigned long)src;
++              mc->args[1] = count;
++              mc->args[2] = (unsigned long)success_count;
++              mc->args[3] = domid;
++              mc->args[5] = (long)__builtin_return_address(0);
++      }
++
++      /* Any command not listed below forces an immediate flush. */
++      while (!commit && count--)
++              switch (src++->cmd) {
++              case MMUEXT_PIN_L1_TABLE:
++              case MMUEXT_PIN_L2_TABLE:
++              case MMUEXT_PIN_L3_TABLE:
++              case MMUEXT_PIN_L4_TABLE:
++              case MMUEXT_UNPIN_TABLE:
++              case MMUEXT_TLB_FLUSH_LOCAL:
++              case MMUEXT_INVLPG_LOCAL:
++              case MMUEXT_TLB_FLUSH_MULTI:
++              case MMUEXT_INVLPG_MULTI:
++              case MMUEXT_TLB_FLUSH_ALL:
++              case MMUEXT_INVLPG_ALL:
++                      break;
++              default:
++                      commit = true;
++                      break;
++              }
++
++      return commit ? _xen_multicall_flush(true) : 0;
++}
++
++/*
++ * Write @val into the L1 page table entry at @ptr with a single
++ * mmu_update request; BUG()s if the hypercall reports an error.
++ */
++void xen_l1_entry_update(pte_t *ptr, pte_t val)
++{
++      mmu_update_t req;
++
++      req.ptr = ptep_to_machine(ptr);
++      req.val = __pte_val(val);
++      if (unlikely(HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF) < 0))
++              BUG();
++}
++EXPORT_SYMBOL_GPL(xen_l1_entry_update);
++
++/*
++ * Common back end of the L2/L3/L4 entry update helpers.
++ *
++ * @mmu:       array of mmu_update requests to submit.
++ * @mmu_count: number of entries in @mmu.
++ * @page:      optional page referenced by the new entry.  When non-NULL,
++ *             its direct-mapping PTE is changed to PAGE_KERNEL_RO in the
++ *             same multicall as the mmu_update, and the page is marked
++ *             pinned.  When NULL only the mmu_update is issued.
++ *
++ * BUG()s on any hypercall failure.
++ */
++static void do_lN_entry_update(mmu_update_t *mmu, unsigned int mmu_count,
++                               struct page *page)
++{
++      if (likely(page)) {
++              multicall_entry_t mcl[2];
++              unsigned long pfn = page_to_pfn(page);
++
++              MULTI_update_va_mapping(mcl,
++                                      (unsigned long)__va(pfn << PAGE_SHIFT),
++                                      pfn_pte(pfn, PAGE_KERNEL_RO), 0);
++              SetPagePinned(page);
++              MULTI_mmu_update(mcl + 1, mmu, mmu_count, NULL, DOMID_SELF);
++              if (unlikely(HYPERVISOR_multicall_check(mcl, 2, NULL)))
++                      BUG();
++      } else if (unlikely(HYPERVISOR_mmu_update(mmu, mmu_count,
++                                                NULL, DOMID_SELF) < 0))
++              BUG();
++}
++
++/*
++ * Write @val into the L2 (pmd) entry at @ptr.
++ *
++ * If @val is present and not a large mapping, mem_map is set up and the
++ * page containing @ptr is pinned, the page @val refers to is looked up.
++ * Unless that page is already pinned or is a highmem page, it is handed
++ * to do_lN_entry_update() along with the update.  For a highmem page,
++ * kmap_flush_unused() is called instead (and BUG() is hit unless
++ * CONFIG_HIGHPTE is enabled).
++ */
++void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
++{
++      mmu_update_t u;
++      struct page *page = NULL;
++
++      if (likely(pmd_present(val)) && likely(!pmd_large(val))
++          && likely(mem_map)
++          && likely(PagePinned(virt_to_page(ptr)))) {
++              page = pmd_page(val);
++              if (unlikely(PagePinned(page)))
++                      page = NULL;
++              else if (PageHighMem(page)) {
++#ifndef CONFIG_HIGHPTE
++                      BUG();
++#endif
++                      kmap_flush_unused();
++                      page = NULL;
++              }
++      }
++      u.ptr = virt_to_machine(ptr);
++      u.val = __pmd_val(val);
++      do_lN_entry_update(&u, 1, page);
++}
++
++#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
++/*
++ * Write @val into the L3 (pud) entry at @ptr.
++ *
++ * If @val is present (and, on x86-64, not a large mapping), mem_map is
++ * set up and the page containing @ptr is pinned, the page @val refers to
++ * is handed to do_lN_entry_update() along with the update, unless that
++ * page is already pinned.
++ */
++void xen_l3_entry_update(pud_t *ptr, pud_t val)
++{
++      mmu_update_t u;
++      struct page *page = NULL;
++
++      if (likely(pud_present(val))
++#ifdef CONFIG_X86_64
++          && likely(!pud_large(val))
++#endif
++          && likely(mem_map)
++          && likely(PagePinned(virt_to_page(ptr)))) {
++              page = pud_page(val);
++              if (unlikely(PagePinned(page)))
++                      page = NULL;
++      }
++      u.ptr = virt_to_machine(ptr);
++      u.val = __pud_val(val);
++      do_lN_entry_update(&u, 1, page);
++}
++#endif
++
++#ifdef CONFIG_X86_64
++/*
++ * Write @val into the L4 (pgd) entry at @ptr.
++ *
++ * If @val is present, mem_map is set up and the page containing @ptr is
++ * pinned, the page @val refers to is handed to do_lN_entry_update()
++ * along with the update, unless that page is already pinned.
++ *
++ * When the entry's offset within its page does not exceed that of the
++ * slot for TASK_SIZE_MAX, the same value is also written to the
++ * corresponding slot of the page returned by __user_pgd(); that page
++ * must exist (BUG_ON otherwise).
++ */
++void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
++{
++      mmu_update_t u[2];
++      struct page *page = NULL;
++
++      if (likely(pgd_present(val)) && likely(mem_map)
++          && likely(PagePinned(virt_to_page(ptr)))) {
++              page = pgd_page(val);
++              if (unlikely(PagePinned(page)))
++                      page = NULL;
++      }
++      u[0].ptr = virt_to_machine(ptr);
++      u[0].val = __pgd_val(val);
++      if (((unsigned long)ptr & ~PAGE_MASK)
++          <= pgd_index(TASK_SIZE_MAX) * sizeof(*ptr)) {
++              ptr = __user_pgd(ptr);
++              BUG_ON(!ptr);
++              u[1].ptr = virt_to_machine(ptr);
++              u[1].val = __pgd_val(val);
++              do_lN_entry_update(u, 2, page);
++      } else
++              do_lN_entry_update(u, 1, page);
++}
++#endif /* CONFIG_X86_64 */
++
++#ifdef CONFIG_X86_64
++/*
++ * Issue an MMUEXT_NEW_BASEPTR request naming the machine frame of @pgd;
++ * BUG()s if the hypercall fails.
++ */
++void xen_pt_switch(pgd_t *pgd)
++{
++      struct mmuext_op req;
++
++      req.cmd = MMUEXT_NEW_BASEPTR;
++      req.arg1.mfn = virt_to_mfn(pgd);
++      if (unlikely(HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF) < 0))
++              BUG();
++}
++
++/*
++ * Issue an MMUEXT_NEW_USER_BASEPTR request naming the machine frame of
++ * the page returned by __user_pgd(@pgd), or frame 0 when that returns
++ * NULL.  BUG()s if the hypercall fails.
++ */
++void xen_new_user_pt(pgd_t *pgd)
++{
++      struct mmuext_op req;
++      pgd_t *user_pgd = __user_pgd(pgd);
++
++      req.cmd = MMUEXT_NEW_USER_BASEPTR;
++      if (user_pgd)
++              req.arg1.mfn = virt_to_mfn(user_pgd);
++      else
++              req.arg1.mfn = 0;
++      if (unlikely(HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF) < 0))
++              BUG();
++}
++#endif
++
++/*
++ * Issue an MMUEXT_TLB_FLUSH_LOCAL request; BUG()s if the hypercall
++ * fails.
++ */
++void xen_tlb_flush(void)
++{
++      struct mmuext_op req;
++
++      req.cmd = MMUEXT_TLB_FLUSH_LOCAL;
++      if (unlikely(HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF) < 0))
++              BUG();
++}
++EXPORT_SYMBOL(xen_tlb_flush);
++
++/*
++ * Issue an MMUEXT_INVLPG_LOCAL request for the page containing @ptr
++ * (the address is masked with PAGE_MASK); BUG()s if the hypercall
++ * fails.
++ */
++void xen_invlpg(unsigned long ptr)
++{
++      struct mmuext_op req;
++
++      req.cmd = MMUEXT_INVLPG_LOCAL;
++      req.arg1.linear_addr = ptr & PAGE_MASK;
++      if (unlikely(HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF) < 0))
++              BUG();
++}
++EXPORT_SYMBOL(xen_invlpg);
++
++#ifdef CONFIG_SMP
++
++/*
++ * Issue an MMUEXT_TLB_FLUSH_ALL request; BUG()s if the hypercall fails.
++ */
++void xen_tlb_flush_all(void)
++{
++      struct mmuext_op req;
++
++      req.cmd = MMUEXT_TLB_FLUSH_ALL;
++      if (unlikely(HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF) < 0))
++              BUG();
++}
++EXPORT_SYMBOL_GPL(xen_tlb_flush_all);
++
++/*
++ * Issue an MMUEXT_TLB_FLUSH_MULTI request whose vcpumask handle points
++ * at the bits of @mask.  Does nothing when @mask is empty; BUG()s if the
++ * hypercall fails.
++ */
++void xen_tlb_flush_mask(const cpumask_t *mask)
++{
++      struct mmuext_op req;
++
++      if (!cpus_empty(*mask)) {
++              req.cmd = MMUEXT_TLB_FLUSH_MULTI;
++              set_xen_guest_handle(req.arg2.vcpumask, cpus_addr(*mask));
++              if (unlikely(HYPERVISOR_mmuext_op(&req, 1, NULL,
++                                                DOMID_SELF) < 0))
++                      BUG();
++      }
++}
++EXPORT_SYMBOL_GPL(xen_tlb_flush_mask);
++
++/*
++ * Issue an MMUEXT_INVLPG_ALL request for the page containing @ptr (the
++ * address is masked with PAGE_MASK); BUG()s if the hypercall fails.
++ */
++void xen_invlpg_all(unsigned long ptr)
++{
++      struct mmuext_op req;
++
++      req.cmd = MMUEXT_INVLPG_ALL;
++      req.arg1.linear_addr = ptr & PAGE_MASK;
++      if (unlikely(HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF) < 0))
++              BUG();
++}
++EXPORT_SYMBOL_GPL(xen_invlpg_all);
++
++/*
++ * Issue an MMUEXT_INVLPG_MULTI request for the page containing @ptr
++ * (masked with PAGE_MASK), with the vcpumask handle pointing at the bits
++ * of @mask.  Does nothing when @mask is empty; BUG()s if the hypercall
++ * fails.
++ */
++void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr)
++{
++      struct mmuext_op req;
++
++      if (!cpus_empty(*mask)) {
++              req.cmd = MMUEXT_INVLPG_MULTI;
++              req.arg1.linear_addr = ptr & PAGE_MASK;
++              set_xen_guest_handle(req.arg2.vcpumask, cpus_addr(*mask));
++              if (unlikely(HYPERVISOR_mmuext_op(&req, 1, NULL,
++                                                DOMID_SELF) < 0))
++                      BUG();
++      }
++}
++EXPORT_SYMBOL_GPL(xen_invlpg_mask);
++
++#endif /* CONFIG_SMP */
++
++/*
++ * Number of mmuext ops used to pin or unpin a pgd: two on x86-64 (the
++ * pgd itself plus the second table chosen in xen_pgd_pin() /
++ * xen_pgd_unpin()), one otherwise.
++ */
++#ifdef CONFIG_X86_64
++#define NR_PGD_PIN_OPS 2
++#else
++#define NR_PGD_PIN_OPS 1
++#endif
++
++/*
++ * Pin the page table rooted at @pgd.
++ *
++ * The first op pins @pgd as an L3 table, or as an L4 table on x86-64.
++ * On x86-64 a second op pins the page returned by __user_pgd(@pgd) as an
++ * L4 table; if that returns NULL, level3_user_pgt is pinned as an L3
++ * table instead.  BUG()s if the hypercall fails.
++ */
++void xen_pgd_pin(pgd_t *pgd)
++{
++      struct mmuext_op op[NR_PGD_PIN_OPS];
++
++      op[0].cmd = MMUEXT_PIN_L3_TABLE;
++      op[0].arg1.mfn = virt_to_mfn(pgd);
++#ifdef CONFIG_X86_64
++      /* Overrides the L3 command set above for both ops. */
++      op[1].cmd = op[0].cmd = MMUEXT_PIN_L4_TABLE;
++      pgd = __user_pgd(pgd);
++      if (pgd)
++              op[1].arg1.mfn = virt_to_mfn(pgd);
++      else {
++              op[1].cmd = MMUEXT_PIN_L3_TABLE;
++              op[1].arg1.mfn = pfn_to_mfn(__pa_symbol(level3_user_pgt)
++                                          >> PAGE_SHIFT);
++      }
++#endif
++      if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0)
++              BUG();
++}
++
++/*
++ * Unpin the page table rooted at @pgd with MMUEXT_UNPIN_TABLE.  On
++ * x86-64 the page returned by __user_pgd(@pgd) is unpinned in the same
++ * hypercall and must exist.  BUG()s if the hypercall fails.
++ */
++void xen_pgd_unpin(pgd_t *pgd)
++{
++      struct mmuext_op reqs[NR_PGD_PIN_OPS];
++
++      reqs[0].cmd = MMUEXT_UNPIN_TABLE;
++      reqs[0].arg1.mfn = virt_to_mfn(pgd);
++#ifdef CONFIG_X86_64
++      pgd = __user_pgd(pgd);
++      BUG_ON(!pgd);
++      reqs[1].cmd = MMUEXT_UNPIN_TABLE;
++      reqs[1].arg1.mfn = virt_to_mfn(pgd);
++#endif
++      BUG_ON(HYPERVISOR_mmuext_op(reqs, NR_PGD_PIN_OPS, NULL,
++                                  DOMID_SELF) < 0);
++}
++
++/*
++ * Issue an MMUEXT_SET_LDT request with @ptr as the linear address and
++ * @ents as the number of entries; BUG()s if the hypercall fails.
++ */
++void xen_set_ldt(const void *ptr, unsigned int ents)
++{
++      struct mmuext_op req;
++
++      req.cmd = MMUEXT_SET_LDT;
++      req.arg1.linear_addr = (unsigned long)ptr;
++      req.arg2.nr_ents = ents;
++      if (unlikely(HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF) < 0))
++              BUG();
++}
++
++/* Protected by balloon_lock. */
++#define MAX_CONTIG_ORDER 9 /* 2MB */
++/* Scratch arrays shared by the contiguous-region helpers below. */
++static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
++static unsigned long limited_frames[1<<MAX_CONTIG_ORDER];
++static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
++
++/* Ensure multi-page extents are contiguous in machine memory. */
++/*
++ * @vstart:       kernel virtual address of the first page of the region.
++ * @order:        log2 of the number of pages in the region.
++ * @address_bits: passed to Xen as the address width restriction of the
++ *                replacement extent (exchange.out.address_bits).
++ *
++ * The current mappings are zapped, the backing frames are exchanged for
++ * a single order-@order extent via XENMEM_exchange, and the range is
++ * mapped again (onto the new extent on success, onto the old frames
++ * otherwise).  The phys-to-machine table is updated along the way.
++ *
++ * Returns 0 on success or for auto-translated guests, -ENOMEM if @order
++ * is too large or the exchange did not succeed, and -EINVAL if the range
++ * lies outside the accepted areas.
++ */
++int xen_create_contiguous_region(
++      unsigned long vstart, unsigned int order, unsigned int address_bits)
++{
++      unsigned long *in_frames = discontig_frames, out_frame;
++      unsigned long  frame, flags;
++      unsigned int   i;
++      int            rc, success;
++#ifdef CONFIG_64BIT
++      pte_t         *ptep = NULL;
++#endif
++      struct xen_memory_exchange exchange = {
++              .in = {
++                      .nr_extents   = 1UL << order,
++                      .extent_order = 0,
++                      .domid        = DOMID_SELF
++              },
++              .out = {
++                      .nr_extents   = 1,
++                      .extent_order = order,
++                      .address_bits = address_bits,
++                      .domid        = DOMID_SELF
++              }
++      };
++
++      /*
++       * Currently an auto-translated guest will not perform I/O, nor will
++       * it require PAE page directories below 4GB. Therefore any calls to
++       * this function are redundant and can be ignored.
++       */
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              return 0;
++
++      if (unlikely(order > MAX_CONTIG_ORDER))
++              return -ENOMEM;
++
++#ifdef CONFIG_64BIT
++      /*
++       * vstart above PAGE_OFFSET + MAXMEM is only accepted inside
++       * [__START_KERNEL_map, _brk_end).  ptep is left non-NULL when the
++       * alias of the range at __va(__pa(vstart)) is mapped; that alias
++       * then gets its own multicall entry per page, which is why the
++       * order limit is one lower here.
++       */
++      if (unlikely(vstart > PAGE_OFFSET + MAXMEM)) {
++              unsigned int level;
++
++              if (vstart < __START_KERNEL_map
++                  || vstart + (PAGE_SIZE << order) > _brk_end)
++                      return -EINVAL;
++              ptep = lookup_address((unsigned long)__va(__pa(vstart)),
++                                    &level);
++              if (ptep && pte_none(*ptep))
++                      ptep = NULL;
++              if (vstart < __START_KERNEL && ptep)
++                      return -EINVAL;
++              if (order > MAX_CONTIG_ORDER - 1)
++                      return -ENOMEM;
++      }
++#else
++      if (unlikely(vstart + (PAGE_SIZE << order) > (unsigned long)high_memory))
++              return -EINVAL;
++#endif
++
++      set_xen_guest_handle(exchange.in.extent_start, in_frames);
++      set_xen_guest_handle(exchange.out.extent_start, &out_frame);
++
++      scrub_pages((void *)vstart, 1 << order);
++
++      balloon_lock(flags);
++
++      /* 1. Zap current PTEs, remembering MFNs. */
++      for (i = 0; i < (1U<<order); i++) {
++              in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
++              MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
++                                      __pte_ma(0), 0);
++#ifdef CONFIG_64BIT
++              if (ptep)
++                      MULTI_update_va_mapping(cr_mcl + i + (1U << order),
++                              (unsigned long)__va(__pa(vstart)) + (i*PAGE_SIZE),
++                              __pte_ma(0), 0);
++#endif
++              set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
++                      INVALID_P2M_ENTRY);
++      }
++#ifdef CONFIG_64BIT
++      /* Twice as many multicall entries were filled in when ptep is set. */
++      if (ptep)
++              i += i;
++#endif
++      if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
++              BUG();
++
++      /* 2. Get a new contiguous memory extent. */
++      out_frame = __pa(vstart) >> PAGE_SHIFT;
++      rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
++      success = (exchange.nr_exchanged == (1UL << order));
++      BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
++      BUG_ON(success && (rc != 0));
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (unlikely(rc == -ENOSYS)) {
++              /* Compatibility when XENMEM_exchange is unsupported. */
++              if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
++                                       &exchange.in) != (1UL << order))
++                      BUG();
++              success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
++                                              &exchange.out) == 1);
++              if (!success) {
++                      /* Couldn't get special memory: fall back to normal. */
++                      for (i = 0; i < (1U<<order); i++)
++                              in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
++                      if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
++                                               &exchange.in) != (1UL<<order))
++                              BUG();
++              }
++      }
++#endif
++
++      /* 3. Map the new extent in place of old pages. */
++      for (i = 0; i < (1U<<order); i++) {
++              frame = success ? (out_frame + i) : in_frames[i];
++              MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
++                                      pfn_pte_ma(frame, PAGE_KERNEL), 0);
++#ifdef CONFIG_64BIT
++              if (ptep)
++                      MULTI_update_va_mapping(cr_mcl + i + (1U << order),
++                              (unsigned long)__va(__pa(vstart)) + (i*PAGE_SIZE),
++                              pfn_pte_ma(frame, PAGE_KERNEL_RO), 0);
++#endif
++              set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
++      }
++#ifdef CONFIG_64BIT
++      if (ptep)
++              i += i;
++#endif
++      /* Only the last entry of the batch carries flush flags. */
++      cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
++                                                 ? UVMF_TLB_FLUSH|UVMF_ALL
++                                                 : UVMF_INVLPG|UVMF_ALL;
++      if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
++              BUG();
++
++      balloon_unlock(flags);
++
++      return success ? 0 : -ENOMEM;
++}
++EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
++
++/*
++ * Exchange the machine-contiguous extent of 2^order pages mapped at vstart
++ * for 2^order individual order-0 frames and remap vstart onto them.
++ *
++ * @vstart: kernel virtual address of the start of the region
++ * @order:  log2 of the number of pages in the region
++ *
++ * Returns silently without doing anything when the guest uses an
++ * auto-translated physmap or when order exceeds MAX_CONTIG_ORDER.
++ * If the exchange fails, the original frames are mapped back first and
++ * the region is then handed back to Xen one page at a time (see below).
++ */
++void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
++{
++      unsigned long *out_frames = discontig_frames, in_frame;
++      unsigned long  frame, flags;
++      unsigned int   i;
++      int            rc, success;
++      struct xen_memory_exchange exchange = {
++              .in = {
++                      .nr_extents   = 1,
++                      .extent_order = order,
++                      .domid        = DOMID_SELF
++              },
++              .out = {
++                      .nr_extents   = 1UL << order,
++                      .extent_order = 0,
++                      .domid        = DOMID_SELF
++              }
++      };
++
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              return;
++
++      if (unlikely(order > MAX_CONTIG_ORDER))
++              return;
++
++      set_xen_guest_handle(exchange.in.extent_start, &in_frame);
++      set_xen_guest_handle(exchange.out.extent_start, out_frames);
++
++      scrub_pages((void *)vstart, 1 << order);
++
++      balloon_lock(flags);
++
++      /* 1. Find start MFN of contiguous extent. */
++      in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
++
++      /* 2. Zap current PTEs. */
++      for (i = 0; i < (1U<<order); i++) {
++              MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
++                                      __pte_ma(0), 0);
++              set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
++                      INVALID_P2M_ENTRY);
++              out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
++      }
++      if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
++              BUG();
++
++      /* 3. Do the exchange for non-contiguous MFNs. */
++      rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
++      success = (exchange.nr_exchanged == 1);
++      BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
++      BUG_ON(success && (rc != 0));
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (unlikely(rc == -ENOSYS)) {
++              /* Compatibility when XENMEM_exchange is unsupported. */
++              if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
++                                       &exchange.in) != 1)
++                      BUG();
++              if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
++                                       &exchange.out) != (1UL << order))
++                      BUG();
++              success = 1;
++      }
++#endif
++
++      /* 4. Map new pages in place of old pages. */
++      for (i = 0; i < (1U<<order); i++) {
++              frame = success ? out_frames[i] : (in_frame + i);
++              MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
++                                      pfn_pte_ma(frame, PAGE_KERNEL), 0);
++              set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
++      }
++
++      cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
++                                                 ? UVMF_TLB_FLUSH|UVMF_ALL
++                                                 : UVMF_INVLPG|UVMF_ALL;
++      if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
++              BUG();
++
++      balloon_unlock(flags);
++
++      if (unlikely(!success)) {
++              /*
++               * Try hard to get the special memory back to Xen: one page
++               * at a time, move the frame of a freshly allocated page
++               * under vstart and release the old frame (in_frame) via
++               * XENMEM_decrease_reservation. Stops at the first page
++               * allocation failure.
++               */
++              exchange.in.extent_order = 0;
++              set_xen_guest_handle(exchange.in.extent_start, &in_frame);
++
++              for (i = 0; i < (1U<<order); i++) {
++                      struct page *page = alloc_page(__GFP_HIGHMEM|__GFP_COLD);
++                      unsigned long pfn;
++                      mmu_update_t mmu;
++                      unsigned int j = 0;
++
++                      if (!page) {
++                              pr_warn("Xen and kernel out of memory"
++                                      " while trying to release an order"
++                                      " %u contiguous region\n", order);
++                              break;
++                      }
++                      pfn = page_to_pfn(page);
++
++                      balloon_lock(flags);
++
++                      if (!PageHighMem(page)) {
++                              void *v = __va(pfn << PAGE_SHIFT);
++
++                              scrub_pages(v, 1);
++                              MULTI_update_va_mapping(cr_mcl + j, (unsigned long)v,
++                                                      __pte_ma(0), UVMF_INVLPG|UVMF_ALL);
++                              ++j;
++                      }
++#ifdef CONFIG_XEN_SCRUB_PAGES
++                      else {
++                              scrub_pages(kmap(page), 1);
++                              kunmap(page);
++                              kmap_flush_unused();
++                      }
++#endif
++
++                      frame = pfn_to_mfn(pfn);
++                      set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++
++                      MULTI_update_va_mapping(cr_mcl + j, vstart,
++                                              pfn_pte_ma(frame, PAGE_KERNEL),
++                                              UVMF_INVLPG|UVMF_ALL);
++                      ++j;
++
++                      pfn = __pa(vstart) >> PAGE_SHIFT;
++                      set_phys_to_machine(pfn, frame);
++                      if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                              mmu.ptr = ((uint64_t)frame << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
++                              mmu.val = pfn;
++                              cr_mcl[j].op = __HYPERVISOR_mmu_update;
++                              cr_mcl[j].args[0] = (unsigned long)&mmu;
++                              cr_mcl[j].args[1] = 1;
++                              cr_mcl[j].args[2] = 0;
++                              cr_mcl[j].args[3] = DOMID_SELF;
++                              ++j;
++                      }
++
++                      cr_mcl[j].op = __HYPERVISOR_memory_op;
++                      cr_mcl[j].args[0] = XENMEM_decrease_reservation;
++                      cr_mcl[j].args[1] = (unsigned long)&exchange.in;
++
++                      if (HYPERVISOR_multicall(cr_mcl, j + 1))
++                              BUG();
++                      BUG_ON(cr_mcl[j].result != 1);
++                      while (j--)
++                              BUG_ON(cr_mcl[j].result != 0);
++
++                      balloon_unlock(flags);
++
++                      free_empty_pages(&page, 1);
++
++                      in_frame++;
++                      vstart += PAGE_SIZE;
++              }
++      }
++}
++EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
++
++/*
++ * Boot-time helper: exchange the 2^order frames currently backing
++ * pfn .. pfn + 2^order - 1 for a single machine-contiguous extent that
++ * satisfies address_bits.
++ *
++ * Only the P2M entries are updated here; no PTEs are rewritten and the
++ * balloon lock is not taken.
++ *
++ * Returns 0 on success or when the guest uses an auto-translated physmap,
++ * -ENOMEM when order exceeds MAX_CONTIG_ORDER or the exchange fails. On
++ * failure the P2M entries are restored from in_frames[].
++ */
++int __init early_create_contiguous_region(unsigned long pfn,
++                                        unsigned int order,
++                                        unsigned int address_bits)
++{
++      unsigned long *in_frames = discontig_frames, out_frame = pfn;
++      unsigned int i;
++      int rc, success;
++      struct xen_memory_exchange exchange = {
++              .in = {
++                      .nr_extents   = 1UL << order,
++                      .extent_order = 0,
++                      .domid        = DOMID_SELF
++              },
++              .out = {
++                      .nr_extents   = 1,
++                      .extent_order = order,
++                      .address_bits = address_bits,
++                      .domid        = DOMID_SELF
++              }
++      };
++
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              return 0;
++
++      if (unlikely(order > MAX_CONTIG_ORDER))
++              return -ENOMEM;
++
++      for (i = 0; i < (1U << order); ++i) {
++              in_frames[i] = pfn_to_mfn(pfn + i);
++              set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
++      }
++
++      set_xen_guest_handle(exchange.in.extent_start, in_frames);
++      set_xen_guest_handle(exchange.out.extent_start, &out_frame);
++
++      rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
++      success = (exchange.nr_exchanged == (1UL << order));
++      BUG_ON(!success && (exchange.nr_exchanged || !rc));
++      BUG_ON(success && rc);
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (unlikely(rc == -ENOSYS)) {
++              /* Compatibility when XENMEM_exchange is unavailable. */
++              if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
++                                       &exchange.in) != (1UL << order))
++                      BUG();
++              success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
++                                              &exchange.out) == 1);
++              if (!success) {
++                      for (i = 0; i < (1U << order); ++i)
++                              in_frames[i] = pfn + i;
++                      if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
++                                               &exchange.in) != (1UL << order))
++                              BUG();
++              }
++      }
++#endif
++
++      /* Record the resulting frames: new extent on success, in_frames[] otherwise. */
++      for (i = 0; i < (1U << order); ++i, ++out_frame) {
++              if (!success)
++                      out_frame = in_frames[i];
++              set_phys_to_machine(pfn + i, out_frame);
++      }
++
++      return success ? 0 : -ENOMEM;
++}
++
++/*
++ * Callback registered through SetPageForeign() by
++ * xen_limit_pages_to_max_mfn(): re-runs the exchange with address_bits == 0,
++ * clears the foreign marking and frees the 2^order pages.
++ */
++static void undo_limit_pages(struct page *pages, unsigned int order)
++{
++      BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++      BUG_ON(order > MAX_CONTIG_ORDER);
++      xen_limit_pages_to_max_mfn(pages, order, 0);
++      ClearPageForeign(pages);
++      __free_pages(pages, order);
++}
++
++/*
++ * Exchange selected pages of the 2^order allocation starting at @pages
++ * for new machine frames obtained with XENMEM_exchange.
++ *
++ * address_bits != 0: every page whose MFN has bits set at or above
++ *   (address_bits - PAGE_SHIFT) is selected and exchanged with
++ *   .address_bits = address_bits. On success the selection bitmap is
++ *   saved in pages[1..].index (order > 0 only) and the head page is
++ *   marked foreign with undo_limit_pages() as its callback.
++ * address_bits == 0: the selection is read back from pages[1..].index
++ *   (for order 0 the single page is selected) and those pages are
++ *   exchanged with .address_bits = 0.
++ *
++ * Returns 0 on success, when nothing was selected, or when the guest uses
++ * an auto-translated physmap; -EINVAL if 0 < address_bits < PAGE_SHIFT;
++ * -ENOMEM if order exceeds MAX_CONTIG_ORDER or the exchange fails (in
++ * which case the original frames are mapped back).
++ */
++int xen_limit_pages_to_max_mfn(
++      struct page *pages, unsigned int order, unsigned int address_bits)
++{
++      unsigned long flags, frame;
++      unsigned long *in_frames = discontig_frames, *out_frames = limited_frames;
++      struct page *page;
++      unsigned int i, n, nr_mcl;
++      int rc, success;
++      DECLARE_BITMAP(limit_map, 1 << MAX_CONTIG_ORDER);
++
++      struct xen_memory_exchange exchange = {
++              .in = {
++                      .extent_order = 0,
++                      .domid        = DOMID_SELF
++              },
++              .out = {
++                      .extent_order = 0,
++                      .address_bits = address_bits,
++                      .domid        = DOMID_SELF
++              }
++      };
++
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              return 0;
++
++      if (unlikely(order > MAX_CONTIG_ORDER))
++              return -ENOMEM;
++
++      if (address_bits) {
++              if (address_bits < PAGE_SHIFT)
++                      return -EINVAL;
++              bitmap_zero(limit_map, 1U << order);
++      } else if (order) {
++              BUILD_BUG_ON(sizeof(pages->index) != sizeof(*limit_map));
++              for (i = 0; i < BITS_TO_LONGS(1U << order); ++i)
++                      limit_map[i] = pages[i + 1].index;
++      } else
++              __set_bit(0, limit_map);
++
++      set_xen_guest_handle(exchange.in.extent_start, in_frames);
++      set_xen_guest_handle(exchange.out.extent_start, out_frames);
++
++      /* 0. Scrub the pages. */
++      for (i = 0, n = 0; i < 1U<<order ; i++) {
++              page = &pages[i];
++              if (address_bits) {
++                      if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT)))
++                              continue;
++                      __set_bit(i, limit_map);
++              }
++
++              if (!PageHighMem(page))
++                      scrub_pages(page_address(page), 1);
++#ifdef CONFIG_XEN_SCRUB_PAGES
++              else {
++                      scrub_pages(kmap(page), 1);
++                      kunmap(page);
++                      ++n;
++              }
++#endif
++      }
++      if (bitmap_empty(limit_map, 1U << order))
++              return 0;
++
++      if (n)
++              kmap_flush_unused();
++
++      balloon_lock(flags);
++
++      /* 1. Zap current PTEs (if any), remembering MFNs. */
++      for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
++              if(!test_bit(i, limit_map))
++                      continue;
++              page = &pages[i];
++
++              out_frames[n] = page_to_pfn(page);
++              in_frames[n] = pfn_to_mfn(out_frames[n]);
++
++              if (!PageHighMem(page))
++                      MULTI_update_va_mapping(cr_mcl + nr_mcl++,
++                                              (unsigned long)page_address(page),
++                                              __pte_ma(0), 0);
++
++              set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY);
++              ++n;
++      }
++      if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
++              BUG();
++
++      /* 2. Get new memory below the required limit. */
++      exchange.in.nr_extents = n;
++      exchange.out.nr_extents = n;
++      rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
++      success = (exchange.nr_exchanged == n);
++      BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
++      BUG_ON(success && (rc != 0));
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (unlikely(rc == -ENOSYS)) {
++              /* Compatibility when XENMEM_exchange is unsupported. */
++              if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
++                                       &exchange.in) != n)
++                      BUG();
++              if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
++                                       &exchange.out) != n)
++                      BUG();
++              success = 1;
++      }
++#endif
++
++      /* 3. Map the new pages in place of old pages. */
++      for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
++              if(!test_bit(i, limit_map))
++                      continue;
++              page = &pages[i];
++
++              frame = success ? out_frames[n] : in_frames[n];
++
++              if (!PageHighMem(page))
++                      MULTI_update_va_mapping(cr_mcl + nr_mcl++,
++                                              (unsigned long)page_address(page),
++                                              pfn_pte_ma(frame, PAGE_KERNEL), 0);
++
++              set_phys_to_machine(page_to_pfn(page), frame);
++              ++n;
++      }
++      if (nr_mcl) {
++              cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order
++                                                              ? UVMF_TLB_FLUSH|UVMF_ALL
++                                                              : UVMF_INVLPG|UVMF_ALL;
++              if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
++                      BUG();
++      }
++
++      balloon_unlock(flags);
++
++      if (!success)
++              return -ENOMEM;
++
++      if (address_bits) {
++              if (order) {
++                      BUILD_BUG_ON(sizeof(*limit_map) != sizeof(pages->index));
++                      for (i = 0; i < BITS_TO_LONGS(1U << order); ++i)
++                              pages[i + 1].index = limit_map[i];
++              }
++              SetPageForeign(pages, undo_limit_pages);
++      }
++
++      return 0;
++}
++EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
++
++/*
++ * Emit a one-time warning that the hypervisor is out of memory.
++ * Currently always returns false.
++ */
++bool hypervisor_oom(void)
++{
++      WARN_ONCE(1, "Hypervisor is out of memory");
++      return false; /* temporary: unconditionally false for now */
++}
++
++/*
++ * Invoke @func once on the part of [start_pfn, start_pfn + nr_pages) that
++ * lies below max_pfn, passing @arg through.
++ *
++ * Returns whatever @func returns, or -1 without calling it when the range
++ * is empty or starts at or beyond max_pfn.
++ */
++int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
++                        void *arg, int (*func)(unsigned long, unsigned long,
++                                               void *))
++{
++      unsigned long count;
++
++      if (start_pfn >= max_pfn || !nr_pages)
++              return -1;
++
++      /* Clamp the walk so it never extends past max_pfn. */
++      count = min(max_pfn - start_pfn, nr_pages);
++
++      return func(start_pfn, count, arg);
++}
++
++/*
++ * Write the 8-byte descriptor at @desc into slot @entry of @ldt by asking
++ * the hypervisor to update it; returns the hypercall's result.
++ */
++int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
++{
++      const u64 *val = desc;
++
++      return HYPERVISOR_update_descriptor(
++              arbitrary_virt_to_machine(ldt + entry), *val);
++}
++
++/*
++ * Write the 8-byte descriptor at @desc into slot @entry of @gdt by asking
++ * the hypervisor to update it; returns the hypercall's result.
++ * @type is accepted but not used here.
++ */
++int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
++                  int type)
++{
++      const u64 *val = desc;
++
++      return HYPERVISOR_update_descriptor(
++              arbitrary_virt_to_machine(gdt + entry), *val);
++}
diff --cc arch/x86/mm/init-xen.c

index 0000000,0000000..64314c3

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/init-xen.c
@@@ -1,0 -1,0 +1,474 @@@
++#include <linux/gfp.h>
++#include <linux/initrd.h>
++#include <linux/ioport.h>
++#include <linux/swap.h>
++#include <linux/memblock.h>
++#include <linux/bootmem.h>
++
++#include <asm/cacheflush.h>
++#include <asm/e820.h>
++#include <asm/init.h>
++#include <asm/page.h>
++#include <asm/page_types.h>
++#include <asm/sections.h>
++#include <asm/setup.h>
++#include <asm/system.h>
++#include <asm/tlbflush.h>
++#include <asm/tlb.h>
++#include <asm/proto.h>
++
++DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
++
++/*
++ * Early page-table buffer window, kept as page frame numbers
++ * (find_early_table_space() shifts them by PAGE_SHIFT to print addresses):
++ * start of the window, next frame to hand out, and end of the window.
++ */
++unsigned long __meminitdata pgt_buf_start;
++unsigned long __meminitdata pgt_buf_end;
++unsigned long __meminitdata pgt_buf_top;
++
++/* Tested by init_memory_mapping() to skip the early-boot-only steps. */
++int after_bootmem;
++
++#if !defined(CONFIG_XEN)
++int direct_gbpages
++#ifdef CONFIG_DIRECT_GBPAGES
++                              = 1
++#endif
++;
++#elif defined(CONFIG_X86_32)
++/* Xen, 32-bit: direct_gbpages is a constant 0. */
++#define direct_gbpages 0
++extern unsigned long extend_init_mapping(unsigned long tables_space);
++#else
++/* Xen, 64-bit. */
++extern void xen_finish_init_mapping(void);
++#endif
++
++/*
++ * Estimate how much page-table memory is needed to map [0, end) and set up
++ * the pgt_buf_start / pgt_buf_end / pgt_buf_top window accordingly.
++ *
++ * @end:         end of the range to be mapped
++ * @use_pse:     when set, only the tail not covered by PMD-sized pages is
++ *               counted for PTEs
++ * @use_gbpages: when set, only the tail not covered by PUD-sized pages is
++ *               counted for PMDs
++ *
++ * Panics if no start for the window could be determined.
++ */
++static void __init find_early_table_space(unsigned long end, int use_pse,
++                                        int use_gbpages)
++{
++      unsigned long puds, pmds, ptes, tables;
++
++      puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
++      tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
++
++      if (use_gbpages) {
++              unsigned long extra;
++
++              extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
++              pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
++      } else
++              pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
++
++      tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
++
++      if (use_pse) {
++              unsigned long extra;
++
++              extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
++#ifdef CONFIG_X86_32
++              extra += PMD_SIZE;
++#endif
++              ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
++      } else
++              ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
++
++      tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
++
++#ifdef CONFIG_X86_32
++      /* for fixmap */
++      tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
++
++      pgt_buf_start = extend_init_mapping(tables);
++      pgt_buf_end = pgt_buf_start;
++#else /* CONFIG_X86_64 */
++      if (!pgt_buf_top) {
++              pgt_buf_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
++                      xen_start_info->nr_pt_frames;
++              pgt_buf_end = pgt_buf_start;
++      } else {
++              /*
++               * [pgt_buf_start, pgt_buf_top) later gets reserved by
++               * init_memory_mapping(), so we must not restart at
++               * pgt_buf_end here, despite continuing to allocate from
++               * there. pgt_buf_end possibly being below pgt_buf_start
++               * is, on the other hand, not a problem.
++               */
++              pgt_buf_start = pgt_buf_top;
++      }
++#endif
++      if (pgt_buf_start == -1UL)
++              panic("Cannot find space for the kernel page tables");
++
++      pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
++
++      printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
++              end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
++}
++
++/* Reserve [start, end) with memblock under the "PGTABLE" label. */
++void __init xen_pagetable_reserve(u64 start, u64 end)
++{
++      memblock_x86_reserve_range(start, end, "PGTABLE");
++}
++
++/*
++ * One range to be mapped by init_memory_mapping(): byte addresses
++ * [start, end) plus a mask of (1 << PG_LEVEL_*) bits naming the large
++ * page sizes that may be used for it (0 means small pages only).
++ */
++struct map_range {
++      unsigned long start;
++      unsigned long end;
++      unsigned page_size_mask;
++};
++
++/* Capacity of the map_range array; save_mr() panics beyond this. */
++#ifdef CONFIG_X86_32
++#define NR_RANGE_MR 3
++#else /* CONFIG_X86_64 */
++#define NR_RANGE_MR 5
++#endif
++
++/*
++ * Record the pfn range [start_pfn, end_pfn) with @page_size_mask in slot
++ * @nr_range of @mr and return the new number of used slots.
++ *
++ * An empty range is ignored and @nr_range is returned unchanged. Panics
++ * when a non-empty range would not fit into NR_RANGE_MR slots.
++ */
++static int __meminit save_mr(struct map_range *mr, int nr_range,
++                           unsigned long start_pfn, unsigned long end_pfn,
++                           unsigned long page_size_mask)
++{
++      struct map_range *slot;
++
++      if (start_pfn >= end_pfn)
++              return nr_range;
++
++      if (nr_range >= NR_RANGE_MR)
++              panic("run out of range for init_memory_mapping\n");
++
++      /* Ranges are stored as byte addresses, not frame numbers. */
++      slot = &mr[nr_range];
++      slot->start = start_pfn << PAGE_SHIFT;
++      slot->end = end_pfn << PAGE_SHIFT;
++      slot->page_size_mask = page_size_mask;
++
++      return nr_range + 1;
++}
++
++/*
++ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
++ * This runs before bootmem is initialized and gets pages directly from
++ * the physical memory. To access them they are temporarily mapped.
++ *
++ * [start, end) is split into up to NR_RANGE_MR sub-ranges by alignment
++ * (small-page head, 2M range, on 64-bit a 1G range and 2M tail, small-page
++ * tail); adjacent sub-ranges with the same page size mask are merged and
++ * each one is handed to kernel_physical_mapping_init().
++ *
++ * Returns the value of the last kernel_physical_mapping_init() call,
++ * shifted right by PAGE_SHIFT (0 if no range was mapped).
++ */
++unsigned long __init_refok init_memory_mapping(unsigned long start,
++                                             unsigned long end)
++{
++      unsigned long page_size_mask = 0;
++      unsigned long start_pfn, end_pfn;
++      unsigned long ret = 0;
++      unsigned long pos;
++
++      struct map_range mr[NR_RANGE_MR];
++      int nr_range, i;
++      int use_pse, use_gbpages;
++
++      printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
++
++#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
++      /*
++       * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
++       * This will simplify cpa(), which otherwise needs to support splitting
++       * large pages into small in interrupt context, etc.
++       */
++      use_pse = use_gbpages = 0;
++#else
++      use_pse = cpu_has_pse;
++      use_gbpages = direct_gbpages;
++#endif
++
++      /* Enable PSE if available */
++      if (cpu_has_pse)
++              set_in_cr4(X86_CR4_PSE);
++
++      /* Enable PGE if available */
++      if (cpu_has_pge) {
++              set_in_cr4(X86_CR4_PGE);
++              __supported_pte_mask |= _PAGE_GLOBAL;
++      }
++
++      if (use_gbpages)
++              page_size_mask |= 1 << PG_LEVEL_1G;
++      if (use_pse)
++              page_size_mask |= 1 << PG_LEVEL_2M;
++
++      memset(mr, 0, sizeof(mr));
++      nr_range = 0;
++
++      /* head if not big page alignment ? */
++      start_pfn = start >> PAGE_SHIFT;
++      pos = start_pfn << PAGE_SHIFT;
++#ifdef CONFIG_X86_32
++      /*
++       * Don't use a large page for the first 2/4MB of memory
++       * because there are often fixed size MTRRs in there
++       * and overlapping MTRRs into large pages can cause
++       * slowdowns.
++       */
++      if (pos == 0)
++              end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
++      else
++              end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
++                               << (PMD_SHIFT - PAGE_SHIFT);
++#else /* CONFIG_X86_64 */
++      end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
++                      << (PMD_SHIFT - PAGE_SHIFT);
++#endif
++      if (end_pfn > (end >> PAGE_SHIFT))
++              end_pfn = end >> PAGE_SHIFT;
++      if (start_pfn < end_pfn) {
++              nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
++              pos = end_pfn << PAGE_SHIFT;
++      }
++
++      /* big page (2M) range */
++      start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
++                       << (PMD_SHIFT - PAGE_SHIFT);
++#ifdef CONFIG_X86_32
++      end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
++#else /* CONFIG_X86_64 */
++      end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
++                       << (PUD_SHIFT - PAGE_SHIFT);
++      if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
++              end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
++#endif
++
++      if (start_pfn < end_pfn) {
++              nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
++                              page_size_mask & (1<<PG_LEVEL_2M));
++              pos = end_pfn << PAGE_SHIFT;
++      }
++
++#ifdef CONFIG_X86_64
++      /* big page (1G) range */
++      start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
++                       << (PUD_SHIFT - PAGE_SHIFT);
++      end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
++      if (start_pfn < end_pfn) {
++              nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
++                              page_size_mask &
++                               ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
++              pos = end_pfn << PAGE_SHIFT;
++      }
++
++      /* tail is not big page (1G) alignment */
++      start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
++                       << (PMD_SHIFT - PAGE_SHIFT);
++      end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
++      if (start_pfn < end_pfn) {
++              nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
++                              page_size_mask & (1<<PG_LEVEL_2M));
++              pos = end_pfn << PAGE_SHIFT;
++      }
++#endif
++
++      /* tail is not big page (2M) alignment */
++      start_pfn = pos>>PAGE_SHIFT;
++      end_pfn = end>>PAGE_SHIFT;
++      nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
++
++      /* try to merge same page size and continuous */
++      for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
++              unsigned long old_start;
++              if (mr[i].end != mr[i+1].start ||
++                  mr[i].page_size_mask != mr[i+1].page_size_mask)
++                      continue;
++              /* move it */
++              old_start = mr[i].start;
++              memmove(&mr[i], &mr[i+1],
++                      (nr_range - 1 - i) * sizeof(struct map_range));
++              mr[i--].start = old_start;
++              nr_range--;
++      }
++
++      for (i = 0; i < nr_range; i++)
++              printk(KERN_DEBUG " %010lx - %010lx page %s\n",
++                              mr[i].start, mr[i].end,
++                      (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
++                       (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
++
++      /*
++       * Find space for the kernel direct mapping tables.
++       *
++       * Later we should allocate these tables in the local node of the
++       * memory mapped. Unfortunately this is done currently before the
++       * nodes are discovered.
++       */
++      if (!after_bootmem)
++              find_early_table_space(end, use_pse, use_gbpages);
++
++#ifdef CONFIG_X86_64
++#define addr_to_page(addr)                                            \
++      ((unsigned long *)                                              \
++       ((mfn_to_pfn(((addr) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)      \
++         << PAGE_SHIFT) + __START_KERNEL_map))
++
++      if (!start) {
++              unsigned long addr, va = __START_KERNEL_map;
++              unsigned long *page = (unsigned long *)init_level4_pgt;
++
++              /* Kill mapping of memory below _text. */
++              while (va < (unsigned long)&_text) {
++                      if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
++                              BUG();
++                      va += PAGE_SIZE;
++              }
++
++              /* Blow away any spurious initial mappings. */
++              va = __START_KERNEL_map + (pgt_buf_start << PAGE_SHIFT);
++
++              addr = page[pgd_index(va)];
++              page = addr_to_page(addr);
++              addr = page[pud_index(va)];
++              page = addr_to_page(addr);
++              while (pmd_index(va) | pte_index(va)) {
++                      if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
++                              break;
++                      if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
++                              BUG();
++                      va += PAGE_SIZE;
++              }
++      }
++#undef addr_to_page
++#endif
++
++      for (i = 0; i < nr_range; i++)
++              ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
++                                                 mr[i].page_size_mask);
++
++#ifdef CONFIG_X86_32
++      early_ioremap_page_table_range_init();
++#endif
++
++#ifdef CONFIG_X86_64
++      BUG_ON(pgt_buf_end > pgt_buf_top);
++      if (!start)
++              xen_finish_init_mapping();
++      else
++#endif
++      if (pgt_buf_end < pgt_buf_top)
++              /* Disable the 'pgt_buf_end' allocator. */
++              pgt_buf_top = pgt_buf_end;
++
++      __flush_tlb_all();
++
++      /*
++       * Reserve the kernel pagetable pages we used (pgt_buf_start -
++       * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
++       * so that they can be reused for other purposes.
++       *
++       * On native it just means calling memblock_x86_reserve_range, on Xen it
++       * also means marking RW the pagetable pages that we allocated before
++       * but that haven't been used.
++       *
++       * In fact on xen we mark RO the whole range pgt_buf_start -
++       * pgt_buf_top, because we have to make sure that when
++       * init_memory_mapping reaches the pagetable pages area, it maps
++       * RO all the pagetable pages, including the ones that are beyond
++       * pgt_buf_end at that time.
++       */
++      if (!after_bootmem && pgt_buf_top > pgt_buf_start) {
++#ifdef CONFIG_X86_64
++              reserve_pgtable_low();
++#endif
++              x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
++                              PFN_PHYS(pgt_buf_top));
++      }
++
++      if (!after_bootmem)
++              early_memtest(start, end);
++
++      return ret >> PAGE_SHIFT;
++}
++
++
++/*
++ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
++ * is valid. The argument is a physical page number.
++ *
++ *
++ * On x86, access has to be given to the first megabyte of ram because that area
++ * contains bios code and data regions used by X and dosemu and similar apps.
++ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
++ * mmio resources as well as potential bios/acpi data regions.
++ *
++ * Returns 1 if access to the page is allowed, 0 otherwise.
++ */
++int devmem_is_allowed(unsigned long pagenr)
++{
++      /* Pages 0..255 are the first megabyte; page 256 is already beyond it. */
++      if (pagenr < 256)
++              return 1;
++      /*
++       * Widen before shifting: on 32-bit a frame number at or above 4GB
++       * would otherwise be truncated to the wrong address.
++       */
++      if (iomem_is_exclusive((u64)pagenr << PAGE_SHIFT))
++              return 0;
++      if (mfn_to_local_pfn(pagenr) >= max_pfn)
++              return 1;
++      return 0;
++}
++
++/*
++ * Release the kernel virtual range [begin, end) to the page allocator.
++ *
++ * @what:  label used in the "Freeing ..." log message
++ * @begin: start address; rounded up to a page boundary (with a warning)
++ *         if unaligned
++ * @end:   end address; rounded down to a page boundary (with a warning)
++ *         if unaligned
++ *
++ * With CONFIG_DEBUG_PAGEALLOC the range is only marked not-present
++ * instead of being freed.
++ */
++void free_init_pages(char *what, unsigned long begin, unsigned long end)
++{
++      unsigned long addr;
++      unsigned long begin_aligned, end_aligned;
++
++      /* Make sure boundaries are page aligned */
++      begin_aligned = PAGE_ALIGN(begin);
++      end_aligned   = end & PAGE_MASK;
++
++      if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
++              begin = begin_aligned;
++              end   = end_aligned;
++      }
++
++      if (begin >= end)
++              return;
++
++      addr = begin;
++
++      /*
++       * If debugging page accesses then do not free this memory but
++       * mark them not present - any buggy init-section access will
++       * create a kernel page fault:
++       */
++#ifdef CONFIG_DEBUG_PAGEALLOC
++      printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
++              begin, end);
++      set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
++#else
++      /*
++       * We just marked the kernel text read only above, now that
++       * we are going to free part of that, we need to make that
++       * writeable and non-executable first.
++       */
++      set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
++      set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
++
++      printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
++
++      for (; addr < end; addr += PAGE_SIZE) {
++              ClearPageReserved(virt_to_page(addr));
++              init_page_count(virt_to_page(addr));
++              memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
++#ifdef CONFIG_X86_64
++              if (addr >= __START_KERNEL_map) {
++                      /* make_readonly() reports all kernel addresses. */
++                      if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
++                                                       pfn_pte(__pa(addr) >> PAGE_SHIFT,
++                                                               PAGE_KERNEL),
++                                                       0))
++                              BUG();
++                      /* Drop the kernel-image alias of the page. */
++                      if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
++                              BUG();
++              }
++#endif
++              free_page(addr);
++              totalram_pages++;
++      }
++#endif
++}
++
++/* Free the range between __init_begin and __init_end via free_init_pages(). */
++void free_initmem(void)
++{
++      free_init_pages("unused kernel memory",
++                      (unsigned long)(&__init_begin),
++                      (unsigned long)(&__init_end));
++}
++
++#ifdef CONFIG_BLK_DEV_INITRD
++/* Free the initrd range [start, end), rounding end up to a page boundary. */
++void free_initrd_mem(unsigned long start, unsigned long end)
++{
++      /*
++       * end may not be page aligned, and we cannot align initrd_end
++       * itself because the decompressor could be confused by an aligned
++       * initrd_end. The partial page at the end has already been
++       * reserved in
++       *   - i386_start_kernel()
++       *   - x86_64_start_kernel()
++       *   - relocate_initrd()
++       * so here we can safely use PAGE_ALIGN() to free that partial page
++       * as well.
++       */
++      free_init_pages("initrd memory", start, PAGE_ALIGN(end));
++}
++#endif
diff --cc arch/x86/mm/init_32-xen.c

index 0000000,0000000..c8aaffc

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/init_32-xen.c
@@@ -1,0 -1,0 +1,1038 @@@
++/*
++ *
++ *  Copyright (C) 1995  Linus Torvalds
++ *
++ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
++ */
++
++#include <linux/module.h>
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/string.h>
++#include <linux/types.h>
++#include <linux/ptrace.h>
++#include <linux/mman.h>
++#include <linux/mm.h>
++#include <linux/hugetlb.h>
++#include <linux/swap.h>
++#include <linux/smp.h>
++#include <linux/init.h>
++#include <linux/highmem.h>
++#include <linux/pagemap.h>
++#include <linux/pci.h>
++#include <linux/pfn.h>
++#include <linux/poison.h>
++#include <linux/bootmem.h>
++#include <linux/memblock.h>
++#include <linux/proc_fs.h>
++#include <linux/memory_hotplug.h>
++#include <linux/initrd.h>
++#include <linux/cpumask.h>
++#include <linux/gfp.h>
++#include <linux/dma-mapping.h>
++#include <linux/scatterlist.h>
++
++#include <asm/asm.h>
++#include <asm/bios_ebda.h>
++#include <asm/processor.h>
++#include <asm/system.h>
++#include <asm/uaccess.h>
++#include <asm/pgtable.h>
++#include <asm/dma.h>
++#include <asm/fixmap.h>
++#include <asm/e820.h>
++#include <asm/apic.h>
++#include <asm/bugs.h>
++#include <asm/tlb.h>
++#include <asm/tlbflush.h>
++#include <asm/olpc_ofw.h>
++#include <asm/pgalloc.h>
++#include <asm/sections.h>
++#include <asm/hypervisor.h>
++#include <asm/swiotlb.h>
++#include <asm/setup.h>
++#include <asm/cacheflush.h>
++#include <asm/page_types.h>
++#include <asm/init.h>
++
++unsigned long highstart_pfn, highend_pfn;
++
++static noinline int do_test_wp_bit(void);
++
++bool __read_mostly __vmalloc_start_set = false;
++
++/*
++ * Take the page frame at the pgt_buf_end cursor, zero it and return its
++ * __va() address.  Panics once the cursor has reached pgt_buf_top.
++ */
++static __init void *alloc_low_page(void)
++{
++      unsigned long frame = pgt_buf_end;
++      void *page;
++
++      /* The cursor is advanced before the bounds check is made. */
++      pgt_buf_end++;
++      if (frame >= pgt_buf_top)
++              panic("alloc_low_page: ran out of memory");
++
++      page = __va(frame * PAGE_SIZE);
++      clear_page(page);
++      return page;
++}
++
++/*
++ * Creates a middle page table and puts a pointer to it in the
++ * given global directory entry. This only returns the gd entry
++ * in non-PAE compilation mode, since the middle layer is folded.
++ *
++ * With CONFIG_X86_PAE, a pgd entry lacking _PAGE_PRESENT gets a newly
++ * allocated pmd page: from alloc_bootmem_pages() once after_bootmem is
++ * set, from alloc_low_page() before that.  The page is announced via
++ * paravirt_alloc_pmd() and made read-only with
++ * make_lowmem_page_readonly() before set_pgd() installs it.
++ * In every other case the pmd table already reachable through the
++ * entry is returned.
++ */
++static pmd_t * __init one_md_table_init(pgd_t *pgd)
++{
++      pud_t *pud;
++      pmd_t *pmd_table;
++
++#ifdef CONFIG_X86_PAE
++      if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
++              if (after_bootmem)
++                      pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
++              else
++                      pmd_table = (pmd_t *)alloc_low_page();
++              paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
++              make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
++              set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
++              pud = pud_offset(pgd, 0);
++              BUG_ON(pmd_table != pmd_offset(pud, 0)); /* entry must resolve to the new page */
++
++              return pmd_table;
++      }
++#endif
++      pud = pud_offset(pgd, 0);
++      pmd_table = pmd_offset(pud, 0);
++
++      return pmd_table;
++}
++
++/*
++ * Create a page table and place a pointer to it in a middle page
++ * directory entry:
++ *
++ * The pmd entry is treated as empty when pmd_none() is true
++ * (CONFIG_XEN_COMPAT <= 0x030002) or when _PAGE_PRESENT is clear
++ * (otherwise).  Only an empty entry gets a new pte page, taken from
++ * alloc_bootmem_pages() once after_bootmem is set and from
++ * alloc_low_page() before that.  The page is made read-only before
++ * set_pmd() installs it with _PAGE_TABLE.
++ *
++ * Returns the first pte of the table the entry points to.
++ *
++ * NOTE(review): under CONFIG_DEBUG_PAGEALLOC/CONFIG_KMEMCHECK the first
++ * allocation calls the same allocator as the !page_table fallback, so
++ * the two branches look redundant - confirm before simplifying.
++ */
++static pte_t * __init one_page_table_init(pmd_t *pmd)
++{
++#if CONFIG_XEN_COMPAT <= 0x030002
++      if (pmd_none(*pmd)) {
++#else
++      if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) {
++#endif
++              pte_t *page_table = NULL;
++
++              if (after_bootmem) {
++#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
++                      page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
++#endif
++                      if (!page_table)
++                              page_table =
++                              (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
++              } else
++                      page_table = (pte_t *)alloc_low_page();
++
++              paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
++              make_lowmem_page_readonly(page_table,
++                                        XENFEAT_writable_page_tables);
++              set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
++              BUG_ON(page_table != pte_offset_kernel(pmd, 0));
++      }
++
++      return pte_offset_kernel(pmd, 0);
++}
++
++/*
++ * Return the pmd entry covering vaddr under swapper_pg_dir; the pmd
++ * table itself comes from one_md_table_init().
++ */
++pmd_t * __init populate_extra_pmd(unsigned long vaddr)
++{
++      int pgd_slot = pgd_index(vaddr);
++      int pmd_slot = pmd_index(vaddr);
++      pmd_t *pmd_table = one_md_table_init(swapper_pg_dir + pgd_slot);
++
++      return pmd_table + pmd_slot;
++}
++
++/*
++ * Return the pte covering vaddr, going through populate_extra_pmd() and
++ * one_page_table_init() to obtain the intermediate levels.
++ */
++pte_t * __init populate_extra_pte(unsigned long vaddr)
++{
++      int pte_slot = pte_index(vaddr);
++      pte_t *pte_table = one_page_table_init(populate_extra_pmd(vaddr));
++
++      return pte_table + pte_slot;
++}
++
++/*
++ * Keep the pte pages of the kmap fixmap range contiguous (CONFIG_HIGHMEM).
++ *
++ * @pte:     pte page currently installed in @pmd
++ * @pmd:     pmd entry covering @vaddr
++ * @vaddr:   virtual address being set up
++ * @lastpte: pte page returned for the previous pmd, or NULL
++ *
++ * Returns the pte page in use afterwards: a fresh copy if the page had
++ * to be relocated, @pte otherwise (always @pte without CONFIG_HIGHMEM).
++ */
++static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
++                                         unsigned long vaddr, pte_t *lastpte)
++{
++#ifdef CONFIG_HIGHMEM
++      /*
++       * Something (early fixmap) may already have put a pte
++       * page here, which causes the page table allocation
++       * to become nonlinear. Attempt to fix it, and if it
++       * is still nonlinear then we have to bug.
++       */
++      int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
++      int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
++
++      /*
++       * Relocate only if the kmap range spans more than one pmd, vaddr
++       * falls inside it, and the pte page lies outside
++       * [pgt_buf_start, pgt_buf_end).
++       */
++      if (pmd_idx_kmap_begin != pmd_idx_kmap_end
++          && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
++          && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
++          && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
++              || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
++              pte_t *newpte;
++              int i;
++
++              BUG_ON(after_bootmem);
++              newpte = alloc_low_page();
++              for (i = 0; i < PTRS_PER_PTE; i++)
++                      set_pte(newpte + i, pte[i]);
++
++              paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
++              make_lowmem_page_readonly(newpte,
++                                        XENFEAT_writable_page_tables);
++              set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
++              BUG_ON(newpte != pte_offset_kernel(pmd, 0));
++              __flush_tlb_all();
++
++              /* The old page is no longer a page table: release it. */
++              paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
++              make_lowmem_page_writable(pte,
++                                        XENFEAT_writable_page_tables);
++              pte = newpte;
++      }
++      BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
++             && vaddr > fix_to_virt(FIX_KMAP_END)
++             && lastpte && lastpte + PTRS_PER_PTE != pte);
++#endif
++      return pte;
++}
++
++/*
++ * This function initializes a certain range of kernel virtual memory
++ * with new bootmem page tables, everywhere page tables are missing in
++ * the given range.
++ *
++ * NOTE: The pagetables are allocated contiguous on the physical space
++ * so we can cache the place of the first one and move around without
++ * checking the pgd every time.
++ *
++ * @start:    first virtual address to cover
++ * @end:      virtual address at which to stop; the loops compare with
++ *            "vaddr != end", so end has to be reachable from start in
++ *            PMD_SIZE steps
++ * @pgd_base: page directory to populate
++ *
++ * No pte page is set up for addresses at or above hypervisor_virt_start.
++ */
++static void __init
++page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
++{
++      int pgd_idx, pmd_idx;
++      unsigned long vaddr;
++      pgd_t *pgd;
++      pmd_t *pmd;
++      pte_t *pte = NULL;
++
++      vaddr = start;
++      pgd_idx = pgd_index(vaddr);
++      pmd_idx = pmd_index(vaddr);
++      pgd = pgd_base + pgd_idx;
++
++      for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
++              pmd = one_md_table_init(pgd);
++              pmd = pmd + pmd_index(vaddr);
++              for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
++                                                      pmd++, pmd_idx++) {
++                      if (vaddr >= hypervisor_virt_start)
++                              break;
++                      pte = page_table_kmap_check(one_page_table_init(pmd),
++                                                  pmd, vaddr, pte);
++
++                      vaddr += PMD_SIZE;
++              }
++              pmd_idx = 0; /* following pgd entries start at their first pmd */
++      }
++}
++
++/*
++ * Return 1 when addr lies in the closed interval [_text, __init_end],
++ * 0 otherwise.
++ */
++static inline int is_kernel_text(unsigned long addr)
++{
++      unsigned long lo = (unsigned long)_text;
++      unsigned long hi = (unsigned long)__init_end;
++
++      return (addr >= lo && addr <= hi) ? 1 : 0;
++}
++
++/*
++ * This maps the physical memory to kernel virtual address space, a total
++ * of max_low_pfn pages, by creating page tables starting from address
++ * PAGE_OFFSET:
++ *
++ * @start, @end:    physical range to map; converted to page frame
++ *                  numbers with >> PAGE_SHIFT
++ * @page_size_mask: large (PSE) mappings are used only when this equals
++ *                  (1 << PG_LEVEL_2M) and cpu_has_pse is true
++ *
++ * Returns last_map_addr, which starts out as @end and, on the 4k path of
++ * the first pass, is set to the physical address just past the last pte
++ * written.
++ */
++unsigned long __init
++kernel_physical_mapping_init(unsigned long start,
++                           unsigned long end,
++                           unsigned long page_size_mask)
++{
++      int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
++      unsigned long last_map_addr = end;
++      unsigned long start_pfn, end_pfn;
++      pgd_t *pgd_base = swapper_pg_dir;
++      int pgd_idx, pmd_idx, pte_ofs;
++      unsigned long pfn;
++      pgd_t *pgd;
++      pmd_t *pmd;
++      pte_t *pte;
++      unsigned pages_2m, pages_4k;
++      int mapping_iter;
++
++      start_pfn = start >> PAGE_SHIFT;
++      end_pfn = end >> PAGE_SHIFT;
++
++      /*
++       * First iteration will setup identity mapping using large/small pages
++       * based on use_pse, with other attributes same as set by
++       * the early code in head_32.S
++       *
++       * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
++       * as desired for the kernel identity mapping.
++       *
++       * This two pass mechanism conforms to the TLB app note which says:
++       *
++       *     "Software should not write to a paging-structure entry in a way
++       *      that would change, for any linear address, both the page size
++       *      and either the page frame or attributes."
++       *
++       * mapping_iter is 1 in the first pass and 2 in the second; without
++       * PSE it is 0 and only a single pass, using the final attributes,
++       * is made.
++       */
++      mapping_iter = 1;
++
++      if (!cpu_has_pse) {
++              use_pse = 0;
++              mapping_iter = 0;
++      }
++
++repeat:
++      pages_2m = pages_4k = 0;
++      pfn = start_pfn;
++      pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
++      pgd = pgd_base + pgd_idx;
++      for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
++#ifdef CONFIG_XEN
++              /*
++               * Native linux hasn't PAE-paging enabled yet at this
++               * point.  When running as xen domain we are in PAE
++               * mode already, thus we can't simply hook an empty
++               * pmd.  That would kill the mappings we are currently
++               * using ...
++               */
++              pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
++#else
++              pmd = one_md_table_init(pgd);
++#endif
++
++              if (pfn >= end_pfn)
++                      continue;
++#ifdef CONFIG_X86_PAE
++              pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
++              pmd += pmd_idx;
++#else
++              pmd_idx = 0;
++#endif
++              for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
++                   pmd++, pmd_idx++) {
++                      unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
++
++                      if (addr >= hypervisor_virt_start)
++                              continue;
++
++                      /*
++                       * Map with big pages if possible, otherwise
++                       * create normal page tables:
++                       */
++                      if (use_pse) {
++                              unsigned int addr2;
++                              pgprot_t prot = PAGE_KERNEL_LARGE;
++                              /*
++                               * first pass will use the same initial
++                               * identity mapping attribute + _PAGE_PSE.
++                               */
++                              pgprot_t init_prot =
++                                      __pgprot(PTE_IDENT_ATTR |
++                                               _PAGE_PSE);
++
++                              addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
++                                      PAGE_OFFSET + PAGE_SIZE-1;
++
++                              if (is_kernel_text(addr) ||
++                                  is_kernel_text(addr2))
++                                      prot = PAGE_KERNEL_LARGE_EXEC;
++
++                              pages_2m++;
++                              if (mapping_iter == 1)
++                                      set_pmd(pmd, pfn_pmd(pfn, init_prot));
++                              else
++                                      set_pmd(pmd, pfn_pmd(pfn, prot));
++
++                              pfn += PTRS_PER_PTE;
++                              continue;
++                      }
++                      pte = one_page_table_init(pmd);
++
++                      pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
++                      pte += pte_ofs;
++                      for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
++                           pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
++                              pgprot_t prot = PAGE_KERNEL;
++                              /*
++                               * first pass will use the same initial
++                               * identity mapping attribute.
++                               */
++                              pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
++
++                              /* XEN: Only map initial RAM allocation. */
++                              if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
++                                      continue;
++                              if (is_kernel_text(addr))
++                                      prot = PAGE_KERNEL_EXEC;
++
++                              pages_4k++;
++                              if (mapping_iter == 1) {
++                                      set_pte(pte, pfn_pte(pfn, init_prot));
++                                      last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
++                              } else
++                                      set_pte(pte, pfn_pte(pfn, prot));
++                      }
++              }
++      }
++      if (mapping_iter <= 1) {
++              /*
++               * update direct mapping page count only in the first
++               * iteration.
++               */
++              update_page_count(PG_LEVEL_2M, pages_2m);
++              update_page_count(PG_LEVEL_4K, pages_4k);
++      }
++      if (mapping_iter == 1) {
++              /*
++               * local global flush tlb, which will flush the previous
++               * mappings present in both small and large page TLB's.
++               */
++              __flush_tlb_all();
++
++              /*
++               * Second iteration will set the actual desired PTE attributes.
++               */
++              mapping_iter = 2;
++              goto repeat;
++      }
++      return last_map_addr;
++}
++
++pte_t *kmap_pte;
++pgprot_t kmap_prot;
++
++/*
++ * Walk the kernel page tables (pgd, pud, pmd) and return the pte that
++ * maps vaddr.
++ */
++static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
++{
++      pgd_t *pgd = pgd_offset_k(vaddr);
++      pud_t *pud = pud_offset(pgd, vaddr);
++      pmd_t *pmd = pmd_offset(pud, vaddr);
++
++      return pte_offset_kernel(pmd, vaddr);
++}
++
++/*
++ * Cache the pte of the FIX_KMAP_BEGIN fixmap slot in kmap_pte and set
++ * kmap_prot to PAGE_KERNEL.
++ */
++static void __init kmap_init(void)
++{
++      /* Virtual address of the first kmap slot. */
++      unsigned long first_kmap_va = __fix_to_virt(FIX_KMAP_BEGIN);
++
++      kmap_pte = kmap_get_fixmap_pte(first_kmap_va);
++      kmap_prot = PAGE_KERNEL;
++}
++
++#ifdef CONFIG_HIGHMEM
++/*
++ * Create page tables for the LAST_PKMAP pages starting at PKMAP_BASE and
++ * record the pte of the first of them in pkmap_page_table.
++ */
++static void __init permanent_kmaps_init(pgd_t *pgd_base)
++{
++      unsigned long base = PKMAP_BASE;
++      pgd_t *pgd;
++      pud_t *pud;
++      pmd_t *pmd;
++
++      page_table_range_init(base, base + PAGE_SIZE*LAST_PKMAP, pgd_base);
++
++      /* The lookup goes through swapper_pg_dir, not pgd_base. */
++      pgd = swapper_pg_dir + pgd_index(base);
++      pud = pud_offset(pgd, base);
++      pmd = pmd_offset(pud, base);
++      pkmap_page_table = pte_offset_kernel(pmd, base);
++}
++
++/*
++ * Hand one page to the page allocator and count it in totalhigh_pages:
++ * clear its reserved flag, initialise its reference count, then free it.
++ */
++static void __init add_one_highpage_init(struct page *page)
++{
++      ClearPageReserved(page);
++      init_page_count(page);
++      __free_page(page);
++      totalhigh_pages++;
++}
++
++/*
++ * Pass every valid pfn in the ranges that __get_free_all_memory_range()
++ * reports for node @nid within [@start_pfn, @end_pfn) to
++ * add_one_highpage_init().
++ */
++void __init add_highpages_with_active_regions(int nid,
++                       unsigned long start_pfn, unsigned long end_pfn)
++{
++      struct range *range;
++      int nr_range;
++      int i;
++
++      nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn);
++
++      for (i = 0; i < nr_range; i++) {
++              struct page *page;
++              int node_pfn;
++
++              for (node_pfn = range[i].start; node_pfn < range[i].end;
++                   node_pfn++) {
++                      if (!pfn_valid(node_pfn)) /* skip pfns without a valid page */
++                              continue;
++                      page = pfn_to_page(node_pfn);
++                      add_one_highpage_init(page);
++              }
++      }
++}
++#else
++/* Without CONFIG_HIGHMEM there is nothing to set up; empty stub. */
++static inline void permanent_kmaps_init(pgd_t *pgd_base)
++{
++}
++#endif /* CONFIG_HIGHMEM */
++
++pgd_t *swapper_pg_dir;
++
++/*
++ * Build a proper pagetable for the kernel mappings.  Up until this
++ * point, we've been running on some set of pagetables constructed by
++ * the boot process.
++ *
++ * If we're booting on native hardware, this will be a pagetable
++ * constructed in arch/x86/kernel/head_32.S.  The root of the
++ * pagetable will be swapper_pg_dir.
++ *
++ * If we're booting paravirtualized under a hypervisor, then there are
++ * more options: we may already be running PAE, and the pagetable may
++ * or may not be based in swapper_pg_dir.  In any case,
++ * paravirt_pagetable_setup_start() will set up swapper_pg_dir
++ * appropriately for the rest of the initialization to work.
++ *
++ * In general, pagetable_init() assumes that the pagetable may already
++ * be partially populated, and so it avoids stomping on any existing
++ * mappings.
++ */
++void __init early_ioremap_page_table_range_init(void)
++{
++      pgd_t *pgd_base = swapper_pg_dir;
++      unsigned long vaddr, end;
++
++      /*
++       * Fixed mappings, only the page table structure has to be
++       * created - mappings will be set by set_fixmap():
++       *
++       * The range runs from the last fixmap slot, rounded down to a
++       * pmd boundary, to FIXADDR_TOP rounded up to a pmd boundary.
++       */
++      vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
++      end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
++      page_table_range_init(vaddr, end, pgd_base);
++      early_ioremap_reset();
++}
++
++/*
++ * Set up the permanent kmap page tables, using the page directory that
++ * xen_start_info->pt_base points at as the pgd base.
++ */
++static void __init pagetable_init(void)
++{
++      permanent_kmaps_init((pgd_t *)xen_start_info->pt_base);
++}
++
++pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
++EXPORT_SYMBOL_GPL(__supported_pte_mask);
++
++/* user-defined highmem size */
++static unsigned int highmem_pages = -1;
++
++/*
++ * highmem=size forces highmem to be exactly 'size' bytes.
++ * This works even on boxes that have no highmem otherwise.
++ * This also works to reduce highmem size on bigger boxes.
++ *
++ * The size is parsed with memparse() and stored in highmem_pages as a
++ * page count.  Returns -EINVAL when no argument is given, 0 otherwise.
++ */
++static int __init parse_highmem(char *arg)
++{
++      if (!arg)
++              return -EINVAL;
++
++      highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
++      return 0;
++}
++early_param("highmem", parse_highmem);
++
++#define MSG_HIGHMEM_TOO_BIG \
++      "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
++
++#define MSG_LOWMEM_TOO_SMALL \
++      "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
++/*
++ * All of RAM fits into lowmem - but if user wants highmem
++ * artificially via the highmem=x boot parameter then create
++ * it:
++ *
++ * highmem_pages == -1 means no size was requested and is turned into 0.
++ * With CONFIG_HIGHMEM, a request of max_pfn pages or more, or one that
++ * would leave less than 64MB of lowmem, is reported and dropped;
++ * otherwise max_low_pfn is reduced by the requested page count.
++ * Without CONFIG_HIGHMEM a request is only reported as ignored.
++ */
++void __init lowmem_pfn_init(void)
++{
++      /* max_low_pfn is 0, we already have early_res support */
++      max_low_pfn = max_pfn;
++
++      if (highmem_pages == -1)
++              highmem_pages = 0;
++#ifdef CONFIG_HIGHMEM
++      if (highmem_pages >= max_pfn) {
++              printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
++                      pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
++              highmem_pages = 0;
++      }
++      if (highmem_pages) {
++              if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
++                      printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
++                              pages_to_mb(highmem_pages));
++                      highmem_pages = 0;
++              }
++              max_low_pfn -= highmem_pages; /* subtracts 0 if the request was just dropped */
++      }
++#else
++      if (highmem_pages)
++              printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
++#endif
++}
++
++#define MSG_HIGHMEM_TOO_SMALL \
++      "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
++
++#define MSG_HIGHMEM_TRIMMED \
++      "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
++/*
++ * We have more RAM than fits into lowmem - we try to put it into
++ * highmem, also taking the highmem=x boot parameter into account:
++ *
++ * max_low_pfn is fixed at MAXMEM_PFN.  Without a highmem= request
++ * (highmem_pages == -1) everything above MAXMEM_PFN becomes highmem.
++ * A smaller request trims max_pfn; a request larger than what is
++ * available is reported and highmem_pages is reset to 0.
++ * Finally max_pfn is clamped: to MAXMEM_PFN without CONFIG_HIGHMEM, or
++ * to MAX_NONPAE_PFN with CONFIG_HIGHMEM but without CONFIG_HIGHMEM64G.
++ */
++void __init highmem_pfn_init(void)
++{
++      max_low_pfn = MAXMEM_PFN;
++
++      if (highmem_pages == -1)
++              highmem_pages = max_pfn - MAXMEM_PFN;
++
++      if (highmem_pages + MAXMEM_PFN < max_pfn)
++              max_pfn = MAXMEM_PFN + highmem_pages;
++
++      if (highmem_pages + MAXMEM_PFN > max_pfn) {
++              printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
++                      pages_to_mb(max_pfn - MAXMEM_PFN),
++                      pages_to_mb(highmem_pages));
++              highmem_pages = 0;
++      }
++#ifndef CONFIG_HIGHMEM
++      /* Maximum memory usable is what is directly addressable */
++      printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
++      if (max_pfn > MAX_NONPAE_PFN)
++              printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
++      else
++              printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
++      max_pfn = MAXMEM_PFN;
++#else /* CONFIG_HIGHMEM */
++#ifndef CONFIG_HIGHMEM64G
++      if (max_pfn > MAX_NONPAE_PFN) {
++              max_pfn = MAX_NONPAE_PFN;
++              printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
++      }
++#endif /* !CONFIG_HIGHMEM64G */
++#endif /* !CONFIG_HIGHMEM */
++}
++
++/*
++ * Determine low and high memory ranges: dispatch on whether all of RAM
++ * (max_pfn) fits below MAXMEM_PFN.
++ */
++void __init find_low_pfn_range(void)
++{
++      /* highmem_pfn_init() may lower max_pfn. */
++      if (max_pfn > MAXMEM_PFN) {
++              highmem_pfn_init();
++              return;
++      }
++
++      lowmem_pfn_init();
++}
++
++#ifndef CONFIG_NEED_MULTIPLE_NODES
++/*
++ * Single-node memory setup: register node 0's active regions, derive
++ * highstart_pfn/highend_pfn (CONFIG_HIGHMEM), num_physpages and
++ * high_memory, then call setup_bootmem_allocator().
++ */
++void __init initmem_init(void)
++{
++#ifdef CONFIG_HIGHMEM
++      /* Highmem spans [max_low_pfn, max_pfn); it is empty if they coincide. */
++      highstart_pfn = highend_pfn = max_pfn;
++      if (max_pfn > max_low_pfn)
++              highstart_pfn = max_low_pfn;
++      memblock_x86_register_active_regions(0, 0, highend_pfn);
++      sparse_memory_present_with_active_regions(0);
++      printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
++              pages_to_mb(highend_pfn - highstart_pfn));
++      num_physpages = highend_pfn;
++      high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
++#else
++      memblock_x86_register_active_regions(0, 0, max_low_pfn);
++      sparse_memory_present_with_active_regions(0);
++      num_physpages = max_low_pfn;
++      high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
++#endif
++#ifdef CONFIG_FLATMEM
++      max_mapnr = num_physpages;
++#endif
++      __vmalloc_start_set = true;
++
++      printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
++                      pages_to_mb(max_low_pfn));
++
++      setup_bootmem_allocator();
++}
++#endif /* !CONFIG_NEED_MULTIPLE_NODES */
++
++/*
++ * Fill in the upper pfn limit of each zone (DMA, NORMAL and, with
++ * CONFIG_HIGHMEM, HIGHMEM), pass the table to free_area_init_nodes()
++ * and then call xen_init_pgd_pin().
++ */
++static void __init zone_sizes_init(void)
++{
++      unsigned long max_zone_pfns[MAX_NR_ZONES];
++      memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
++      max_zone_pfns[ZONE_DMA] =
++              virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
++      max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
++#ifdef CONFIG_HIGHMEM
++      max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
++#endif
++
++      free_area_init_nodes(max_zone_pfns);
++
++      xen_init_pgd_pin();
++}
++
++/*
++ * Under CONFIG_XEN, reserve the part of lowmem above the initial
++ * allocation (xen_start_info->nr_pages up to max_low_pfn) under the
++ * name "BALLOON".  Then print the mapped and low RAM ranges and set
++ * after_bootmem.
++ */
++void __init setup_bootmem_allocator(void)
++{
++#ifdef CONFIG_XEN
++      if (max_low_pfn > xen_start_info->nr_pages)
++              memblock_x86_reserve_range(xen_start_info->nr_pages << PAGE_SHIFT,
++                                         max_low_pfn << PAGE_SHIFT, "BALLOON");
++#endif
++
++      printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
++               max_pfn_mapped<<PAGE_SHIFT);
++      printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
++
++      after_bootmem = 1;
++}
++
++/*
++ * Extend the initial mappings so that they cover everything from _text
++ * up to @tables_space bytes past the end of the initial page tables
++ * (xen_start_info->pt_base plus nr_pt_frames), then remove whatever is
++ * still mapped beyond that point.
++ *
++ * Page frames directly following the initial page tables are used for
++ * new pte pages; the consumed frames are reserved as "INITMAP".
++ *
++ * Returns the first page frame number after the frames used here.
++ */
++unsigned long __init extend_init_mapping(unsigned long tables_space)
++{
++      unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
++                                + xen_start_info->nr_pt_frames;
++      unsigned long start = start_pfn, va = (unsigned long)&_text;
++      pgd_t *pgd;
++      pud_t *pud;
++      pmd_t *pmd;
++      pte_t *pte;
++
++      /* Ensure init mappings cover kernel text/data and initial tables. */
++      while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
++              pgd = pgd_offset_k(va);
++              pud = pud_offset(pgd, va);
++              pmd = pmd_offset(pud, va);
++              if (pmd_none(*pmd)) {
++                      /* Consume the next free frame as a pte page. */
++                      unsigned long pa = start_pfn++ << PAGE_SHIFT;
++
++                      clear_page(__va(pa));
++                      make_lowmem_page_readonly(__va(pa),
++                                                XENFEAT_writable_page_tables);
++                      xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
++              }
++              pte = pte_offset_kernel(pmd, va);
++              if (pte_none(*pte)) {
++                      pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
++
++                      if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
++                              BUG();
++              }
++              va += PAGE_SIZE;
++      }
++
++      /* Finally, blow away any spurious initial mappings. */
++      while (1) {
++              pgd = pgd_offset_k(va);
++              pud = pud_offset(pgd, va);
++              pmd = pmd_offset(pud, va);
++              if (pmd_none(*pmd)) /* stop at the first empty pmd entry */
++                      break;
++              if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
++                      BUG();
++              va += PAGE_SIZE;
++      }
++
++      if (start_pfn > start)
++              memblock_x86_reserve_range(start << PAGE_SHIFT,
++                                         start_pfn << PAGE_SHIFT, "INITMAP");
++
++      return start_pfn;
++}
++
++/*
++ * paging_init() sets up the page tables - note that the first 8MB are
++ * already mapped by head.S.
++ *
++ * This routine also unmaps the page at virtual kernel address 0, so
++ * that we can trap those pesky NULL-reference errors in the kernel.
++ *
++ * NOTE(review): neither the 8MB head.S mapping nor the unmapping of
++ * address 0 is visible in this function body; the text above may be
++ * inherited from the native implementation - confirm.
++ *
++ * Steps performed here: pagetable_init(), a full TLB flush,
++ * kmap_init(), olpc_dt_build_devicetree(), sparse_init() and
++ * zone_sizes_init().
++ */
++void __init paging_init(void)
++{
++      pagetable_init();
++
++      __flush_tlb_all();
++
++      kmap_init();
++
++      /*
++       * NOTE: at this point the bootmem allocator is fully available.
++       */
++      olpc_dt_build_devicetree();
++      sparse_init();
++      zone_sizes_init();
++}
++
++/*
++ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
++ * and also on some strange 486's. All 586+'s are OK. This used to involve
++ * black magic jumps to work around some nasty CPU bugs, but fortunately the
++ * switch to using exceptions got rid of all that.
++ *
++ * The FIX_WP_TEST fixmap slot is mapped read-only, do_test_wp_bit()
++ * attempts a write through it, and the result is stored in
++ * boot_cpu_data.wp_works_ok.  With CONFIG_X86_WP_WORKS_OK a failing
++ * test is fatal.
++ *
++ * NOTE(review): swapper_pg_dir is declared as a pointer in this file,
++ * so &swapper_pg_dir is the address of the pointer variable rather than
++ * of a page directory - confirm this is the intended target.
++ */
++static void __init test_wp_bit(void)
++{
++      printk(KERN_INFO
++  "Checking if this processor honours the WP bit even in supervisor mode...");
++
++      /* Any page-aligned address will do, the test is non-destructive */
++      __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
++      boot_cpu_data.wp_works_ok = do_test_wp_bit();
++      clear_fixmap(FIX_WP_TEST);
++
++      if (!boot_cpu_data.wp_works_ok) {
++              printk(KERN_CONT "No.\n");
++#ifdef CONFIG_X86_WP_WORKS_OK
++              panic(
++  "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
++#endif
++      } else {
++              printk(KERN_CONT "Ok.\n");
++      }
++}
++
++/*
++ * Late memory initialisation: release bootmem into the page allocator,
++ * prepare the lowmem pages outside the initial Xen allocation, count
++ * reserved pages, initialise highmem, print the memory summary and the
++ * virtual memory layout, and sanity-check the layout at build time and
++ * at run time.  Finally run the WP-bit test if its result is not yet
++ * known (wp_works_ok < 0).
++ */
++void __init mem_init(void)
++{
++      int codesize, reservedpages, datasize, initsize;
++      int tmp;
++      unsigned long pfn;
++
++      pci_iommu_alloc();
++
++#ifdef CONFIG_FLATMEM
++      BUG_ON(!mem_map);
++#endif
++      /* this will put all low memory onto the freelists */
++      totalram_pages += free_all_bootmem();
++      /* XEN: init low-mem pages outside initial allocation. */
++      for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
++              ClearPageReserved(pfn_to_page(pfn));
++              init_page_count(pfn_to_page(pfn));
++      }
++
++      reservedpages = 0;
++      for (tmp = 0; tmp < max_low_pfn; tmp++)
++              /*
++               * Only count reserved RAM pages:
++               */
++              if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
++                      reservedpages++;
++
++      set_highmem_pages_init();
++
++      codesize =  (unsigned long) &_etext - (unsigned long) &_text;
++      datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
++      initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
++
++      printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
++                      "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
++              nr_free_pages() << (PAGE_SHIFT-10),
++              num_physpages << (PAGE_SHIFT-10),
++              codesize >> 10,
++              reservedpages << (PAGE_SHIFT-10),
++              datasize >> 10,
++              initsize >> 10,
++              totalhigh_pages << (PAGE_SHIFT-10));
++
++      printk(KERN_INFO "virtual kernel memory layout:\n"
++              "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
++#ifdef CONFIG_HIGHMEM
++              "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
++#endif
++              "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
++              "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
++              "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
++              "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
++              "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
++              FIXADDR_START, FIXADDR_TOP,
++              (FIXADDR_TOP - FIXADDR_START) >> 10,
++
++#ifdef CONFIG_HIGHMEM
++              PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
++              (LAST_PKMAP*PAGE_SIZE) >> 10,
++#endif
++
++              VMALLOC_START, VMALLOC_END,
++              (VMALLOC_END - VMALLOC_START) >> 20,
++
++              (unsigned long)__va(0), (unsigned long)high_memory,
++              ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
++
++              (unsigned long)&__init_begin, (unsigned long)&__init_end,
++              ((unsigned long)&__init_end -
++               (unsigned long)&__init_begin) >> 10,
++
++              (unsigned long)&_etext, (unsigned long)&_edata,
++              ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
++
++              (unsigned long)&_text, (unsigned long)&_etext,
++              ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
++
++      /*
++       * Check boundaries twice: Some fundamental inconsistencies can
++       * be detected at build time already.
++       *
++       * For the build-time checks __FIXADDR_TOP and high_memory are
++       * temporarily redefined as compile-time constants and undefined
++       * again right afterwards.
++       */
++#define __FIXADDR_TOP (-PAGE_SIZE)
++#ifdef CONFIG_HIGHMEM
++      BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE  > FIXADDR_START);
++      BUILD_BUG_ON(VMALLOC_END                        > PKMAP_BASE);
++#endif
++#define high_memory (-128UL << 20)
++      BUILD_BUG_ON(VMALLOC_START                      >= VMALLOC_END);
++#undef high_memory
++#undef __FIXADDR_TOP
++
++#ifdef CONFIG_HIGHMEM
++      BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE        > FIXADDR_START);
++      BUG_ON(VMALLOC_END                              > PKMAP_BASE);
++#endif
++      BUG_ON(VMALLOC_START                            >= VMALLOC_END);
++      BUG_ON((unsigned long)high_memory               > VMALLOC_START);
++
++      if (boot_cpu_data.wp_works_ok < 0)
++              test_wp_bit();
++}
++
++#ifdef CONFIG_MEMORY_HOTPLUG
++/*
++ * Add the memory range [start, start + size) to node nid.  The pages are
++ * always placed in that node's ZONE_HIGHMEM.  Returns the result of
++ * __add_pages().
++ */
++int arch_add_memory(int nid, u64 start, u64 size)
++{
++      unsigned long first_pfn = start >> PAGE_SHIFT;
++      unsigned long pfn_count = size >> PAGE_SHIFT;
++      struct pglist_data *node = NODE_DATA(nid);
++
++      return __add_pages(nid, node->node_zones + ZONE_HIGHMEM,
++                         first_pfn, pfn_count);
++}
++#endif
++
++/*
++ * This function cannot be __init, since exceptions don't work in that
++ * section.  Put this after the callers, so that it cannot be inlined.
++ *
++ * Reads one byte through the FIX_WP_TEST fixmap address and writes the
++ * same byte back (label 1).  flag enters the asm as 1.  If the write
++ * completes, the xorl clears flag to 0.  If the write faults, the
++ * exception table entry resumes at label 2, skipping the xorl, and flag
++ * stays 1.
++ *
++ * Returns flag: 1 if the write faulted, 0 if it succeeded.
++ */
++static noinline int do_test_wp_bit(void)
++{
++      char tmp_reg;
++      int flag;
++
++      __asm__ __volatile__(
++              "       movb %0, %1     \n"
++              "1:     movb %1, %0     \n"
++              "       xorl %2, %2     \n"
++              "2:                     \n"
++              _ASM_EXTABLE(1b,2b)
++              :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
++               "=q" (tmp_reg),
++               "=r" (flag)
++              :"2" (1)
++              :"memory");
++
++      return flag;
++}
++
++#ifdef CONFIG_DEBUG_RODATA
++const int rodata_test_data = 0xC3;
++EXPORT_SYMBOL_GPL(rodata_test_data);
++
++int kernel_set_to_readonly __read_mostly;
++
++/*
++ * Make the kernel text, [_text, _etext) rounded to page boundaries with
++ * PFN_ALIGN(), writable again.  Does nothing unless
++ * kernel_set_to_readonly is set.
++ */
++void set_kernel_text_rw(void)
++{
++      unsigned long start = PFN_ALIGN(_text);
++      unsigned long size = PFN_ALIGN(_etext) - start;
++
++      if (!kernel_set_to_readonly)
++              return;
++
++      pr_debug("Set kernel text: %lx - %lx for read write\n",
++               start, start+size);
++
++      set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
++}
++
++/*
++ * Make the kernel text, [_text, _etext) rounded to page boundaries with
++ * PFN_ALIGN(), read-only.  Does nothing unless kernel_set_to_readonly
++ * is set.
++ */
++void set_kernel_text_ro(void)
++{
++      unsigned long start = PFN_ALIGN(_text);
++      unsigned long size = PFN_ALIGN(_etext) - start;
++
++      if (!kernel_set_to_readonly)
++              return;
++
++      pr_debug("Set kernel text: %lx - %lx for read only\n",
++               start, start+size);
++
++      set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
++}
++
++/*
++ * Mark everything from the page-aligned _etext up to the huge-page
++ * boundary following __init_end as non-executable.
++ */
++static void mark_nxdata_nx(void)
++{
++      /*
++       * When this is called, init has already been executed and released,
++       * so everything past _etext should be NX.
++       */
++      unsigned long start = PFN_ALIGN(_etext);
++      /*
++       * This comes from is_kernel_text upper limit. Also HPAGE where used:
++       */
++      unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
++
++      /* Only the message depends on NX support; set_pages_nx() is always called. */
++      if (__supported_pte_mask & _PAGE_NX)
++              printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
++      set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
++}
++
++/*
++ * Write-protect the kernel text and then the read-only data up to
++ * __end_rodata, set kernel_set_to_readonly, run rodata_test() and
++ * finally apply NX via mark_nxdata_nx().  With CONFIG_CPA_DEBUG each
++ * protection change is additionally reverted and re-applied once.
++ */
++void mark_rodata_ro(void)
++{
++      unsigned long start = PFN_ALIGN(_text);
++      unsigned long size = PFN_ALIGN(_etext) - start;
++
++      set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
++      printk(KERN_INFO "Write protecting the kernel text: %luk\n",
++              size >> 10);
++
++      kernel_set_to_readonly = 1;
++
++#ifdef CONFIG_CPA_DEBUG
++      printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
++              start, start+size);
++      set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
++
++      printk(KERN_INFO "Testing CPA: write protecting again\n");
++      set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
++#endif
++
++      /* Continue with the region that follows the text, up to __end_rodata. */
++      start += size;
++      size = (unsigned long)__end_rodata - start;
++      set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
++      printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
++              size >> 10);
++      rodata_test();
++
++#ifdef CONFIG_CPA_DEBUG
++      printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
++      set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
++
++      printk(KERN_INFO "Testing CPA: write protecting again\n");
++      set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
++#endif
++      mark_nxdata_nx();
++}
++#endif
++
diff --cc arch/x86/mm/init_64-xen.c

index 0000000,0000000..523f071

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/init_64-xen.c
@@@ -1,0 -1,0 +1,1397 @@@
++/*
++ *  linux/arch/x86_64/mm/init.c
++ *
++ *  Copyright (C) 1995  Linus Torvalds
++ *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
++ *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
++ *
++ *  Jun Nakajima <jun.nakajima@intel.com>
++ *    Modified for Xen.
++ */
++
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/string.h>
++#include <linux/types.h>
++#include <linux/ptrace.h>
++#include <linux/mman.h>
++#include <linux/mm.h>
++#include <linux/swap.h>
++#include <linux/smp.h>
++#include <linux/init.h>
++#include <linux/initrd.h>
++#include <linux/pagemap.h>
++#include <linux/bootmem.h>
++#include <linux/memblock.h>
++#include <linux/proc_fs.h>
++#include <linux/pci.h>
++#include <linux/pfn.h>
++#include <linux/poison.h>
++#include <linux/dma-mapping.h>
++#include <linux/module.h>
++#include <linux/memory_hotplug.h>
++#include <linux/nmi.h>
++#include <linux/gfp.h>
++
++#include <asm/processor.h>
++#include <asm/bios_ebda.h>
++#include <asm/system.h>
++#include <asm/uaccess.h>
++#include <asm/pgtable.h>
++#include <asm/pgalloc.h>
++#include <asm/dma.h>
++#include <asm/fixmap.h>
++#include <asm/e820.h>
++#include <asm/apic.h>
++#include <asm/tlb.h>
++#include <asm/mmu_context.h>
++#include <asm/proto.h>
++#include <asm/smp.h>
++#include <asm/sections.h>
++#include <asm/kdebug.h>
++#include <asm/numa.h>
++#include <asm/cacheflush.h>
++#include <asm/init.h>
++#include <asm/setup.h>
++
++#include <xen/features.h>
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++unsigned int __kernel_page_user;
++EXPORT_SYMBOL(__kernel_page_user);
++#endif
++
++extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
++extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
++
++/*
++ * Use this until direct mapping is established, i.e. before __va() is 
++ * available in init_memory_mapping().
++ */
++
++#define addr_to_page(addr, page)                              \
++      (addr) &= PHYSICAL_PAGE_MASK;                           \
++      (page) = ((unsigned long *) ((unsigned long)            \
++      (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
++      __START_KERNEL_map)))
++
++/*
++ * Hand-walk init_level4_pgt with addr_to_page() and return a pointer to
++ * the pmd slot that covers @va.
++ */
++pmd_t *__init early_get_pmd(unsigned long va)
++{
++      unsigned long *table = (unsigned long *)init_level4_pgt;
++      unsigned long entry = table[pgd_index(va)];
++
++      /* Top level entry -> pud level table. */
++      addr_to_page(entry, table);
++
++      /* pud entry -> pmd level table. */
++      entry = table[pud_index(va)];
++      addr_to_page(entry, table);
++
++      return (pmd_t *)(table + pmd_index(va));
++}
++
++/*
++ * Remove _PAGE_RW from the pte mapping @va, locating the pte by walking
++ * init_level4_pgt by hand. Does nothing when the Xen feature @feature is
++ * available. Must not be called once after_bootmem is set.
++ */
++void __meminit early_make_page_readonly(void *va, unsigned int feature)
++{
++      unsigned long vaddr = (unsigned long)va;
++      unsigned long *table = (unsigned long *) init_level4_pgt;
++      unsigned long entry;
++      pte_t *slot, ro_pte;
++
++      BUG_ON(after_bootmem);
++
++      if (xen_feature(feature))
++              return;
++
++      /* pgd -> pud -> pmd -> pte table */
++      entry = (unsigned long) table[pgd_index(vaddr)];
++      addr_to_page(entry, table);
++      entry = table[pud_index(vaddr)];
++      addr_to_page(entry, table);
++      entry = table[pmd_index(vaddr)];
++      addr_to_page(entry, table);
++
++      slot = (pte_t *)(table + pte_index(vaddr));
++      ro_pte.pte = slot->pte & ~_PAGE_RW;
++
++      if (HYPERVISOR_update_va_mapping(vaddr, ro_pte, 0) != 0)
++              BUG();
++}
++
++/*
++ * Look up the frame number stored in the pte that maps @v, walking the
++ * tables found through xen_read_cr3() by hand. @v must not lie below
++ * __START_KERNEL_map.
++ */
++unsigned long __init early_arbitrary_virt_to_mfn(void *v)
++{
++      unsigned long vaddr = (unsigned long)v;
++      unsigned long entry, *table;
++
++      BUG_ON(vaddr < __START_KERNEL_map);
++
++      table = (void *)(xen_read_cr3() + __START_KERNEL_map);
++
++      /* pgd -> pud -> pmd -> pte table */
++      entry = table[pgd_index(vaddr)];
++      addr_to_page(entry, table);
++      entry = table[pud_index(vaddr)];
++      addr_to_page(entry, table);
++      entry = table[pmd_index(vaddr)];
++      addr_to_page(entry, table);
++
++      return (table[pte_index(vaddr)] & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT;
++}
++
++#ifndef CONFIG_XEN
++static int __init parse_direct_gbpages_off(char *arg)
++{
++      direct_gbpages = 0; /* "nogbpages" early parameter: clear the flag; arg is not looked at */
++      return 0;
++}
++early_param("nogbpages", parse_direct_gbpages_off);
++/* Early parameter "gbpages": sets direct_gbpages to 1; @arg is not looked at. */
++static int __init parse_direct_gbpages_on(char *arg)
++{
++      direct_gbpages = 1;
++      return 0;
++}
++early_param("gbpages", parse_direct_gbpages_on);
++#endif
++
++/*
++ * NOTE: pagetable_init allocates all the fixmap page tables contiguously in
++ * physical space, so we can cache the location of the first one and move
++ * around without checking the pgd every time.
++ */
++
++pteval_t __supported_pte_mask __read_mostly = ~0UL; /* ANDed into pte values before they are written (see phys_pte_init()); initially no bit is masked off */
++EXPORT_SYMBOL_GPL(__supported_pte_mask);
++
++int force_personality32; /* READ_IMPLIES_EXEC is set/cleared here by the "noexec32=" option */
++
++/*
++ * noexec32=on|off
++ * Control non executable heap for 32bit processes.
++ * To control the stack too use noexec=off
++ *
++ * on         PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
++ * off        PROT_READ implies PROT_EXEC
++ *
++ * Any other value leaves force_personality32 untouched. Always returns 1.
++ */
++static int __init nonx32_setup(char *str)
++{
++      if (strcmp(str, "on") == 0) {
++              force_personality32 &= ~READ_IMPLIES_EXEC;
++              return 1;
++      }
++
++      if (strcmp(str, "off") == 0)
++              force_personality32 |= READ_IMPLIES_EXEC;
++
++      return 1;
++}
++__setup("noexec32=", nonx32_setup);
++
++/*
++ * When memory was added/removed make sure all the processes MM have
++ * suitable PGD entries in the local PGD level page.
++ *
++ * Steps through [start, end] in PGDIR_SIZE increments. For every populated
++ * kernel reference entry (pgd_offset_k()), each page on pgd_list gets that
++ * entry copied in if its own slot is still empty; a slot that is already
++ * populated must point at the same page as the reference, otherwise BUG.
++ * pgd_lock is held around the list walk and, nested inside it, the
++ * page_table_lock of the mm that owns the pgd page being updated.
++ */
++void sync_global_pgds(unsigned long start, unsigned long end)
++{
++      unsigned long address;
++
++      for (address = start; address <= end; address += PGDIR_SIZE) {
++              const pgd_t *pgd_ref = pgd_offset_k(address);
++              struct page *page;
++
++              if (pgd_none(*pgd_ref)) /* nothing to propagate for this slot */
++                      continue;
++
++              spin_lock(&pgd_lock);
++              list_for_each_entry(page, &pgd_list, lru) {
++                      pgd_t *pgd;
++                      spinlock_t *pgt_lock;
++
++                      pgd = (pgd_t *)page_address(page) + pgd_index(address);
++                      /* the pgt_lock only for Xen */
++                      pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
++                      spin_lock(pgt_lock);
++
++                      if (pgd_none(*pgd))
++                              set_pgd(pgd, *pgd_ref);
++                      else
++                              BUG_ON(pgd_page_vaddr(*pgd)
++                                     != pgd_page_vaddr(*pgd_ref));
++
++                      spin_unlock(pgt_lock);
++              }
++              spin_unlock(&pgd_lock);
++      }
++}
++/*
++ * Table of up to three page frame ranges (first frame @pfn, length @nr
++ * frames) filled in by reserve_pfn_range(). A slot with nr == 0 is unused.
++ */
++static struct reserved_pfn_range {
++      unsigned long pfn, nr;
++} reserved_pfn_ranges[3] __meminitdata;
++/*
++ * Record the frame range [pfn, pfn + nr) in reserved_pfn_ranges, keeping
++ * the table ordered by ascending start frame, and reserve the matching
++ * physical range through memblock_x86_reserve_range() under @name.
++ * BUGs if the new range overlaps a recorded one or if no slot is free.
++ */
++void __init reserve_pfn_range(unsigned long pfn, unsigned long nr, char *name)
++{
++      unsigned int i;
++
++      for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
++              struct reserved_pfn_range *range = reserved_pfn_ranges + i;
++
++              if (!range->nr) { /* first unused slot: append here */
++                      range->pfn = pfn;
++                      range->nr = nr;
++                      break;
++              }
++              BUG_ON(range->pfn < pfn + nr && pfn < range->pfn + range->nr);
++              if (range->pfn > pfn) { /* new range sorts before this one: insert */
++                      i = ARRAY_SIZE(reserved_pfn_ranges) - 1;
++                      if (reserved_pfn_ranges[i].nr)
++                              continue; /* table full: ++i ends the loop and the BUG_ON below fires */
++                      for (; reserved_pfn_ranges + i > range; --i) /* shift later entries up by one */
++                              reserved_pfn_ranges[i]
++                                       = reserved_pfn_ranges[i - 1];
++                      range->pfn = pfn;
++                      range->nr = nr;
++                      break;
++              }
++      }
++      BUG_ON(i >= ARRAY_SIZE(reserved_pfn_ranges));
++      memblock_x86_reserve_range(pfn << PAGE_SHIFT,
++                                 (pfn + nr) << PAGE_SHIFT, name);
++}
++
++/*
++ * For every recorded reserved range that starts inside
++ * [pgt_buf_start, pgt_buf_top): hand the part of the page table buffer in
++ * front of it to x86_init.mapping.pagetable_reserve() and move
++ * pgt_buf_start past the range.
++ */
++void __init reserve_pgtable_low(void)
++{
++      struct reserved_pfn_range *r = reserved_pfn_ranges;
++      struct reserved_pfn_range *stop = r + ARRAY_SIZE(reserved_pfn_ranges);
++
++      /* The table is filled from the front; the first empty slot ends it. */
++      for (; r < stop && r->nr; ++r) {
++              if (r->pfn < pgt_buf_start || r->pfn >= pgt_buf_top)
++                      continue;
++
++              x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
++                              PFN_PHYS(r->pfn));
++              pgt_buf_start = r->pfn + r->nr;
++      }
++}
++
++/*
++ * Hand out the next frame of the page table buffer: returns the current
++ * pgt_buf_end and advances it by one. Beforehand, any recorded reserved
++ * range that starts exactly at pgt_buf_end is stepped over, moving both
++ * pgt_buf_end and pgt_buf_top up by the length of that range.
++ */
++static __init unsigned long get_table_end(void)
++{
++      const struct reserved_pfn_range *r = reserved_pfn_ranges;
++      const struct reserved_pfn_range *stop = r + ARRAY_SIZE(reserved_pfn_ranges);
++
++      BUG_ON(!pgt_buf_end);
++
++      for (; r < stop && r->nr; ++r) {
++              if (r->pfn != pgt_buf_end)
++                      continue;
++
++              pgt_buf_end += r->nr;
++              pgt_buf_top += r->nr;
++      }
++
++      return pgt_buf_end++;
++}
++
++/*
++ * Get one zeroed, page aligned page for use as a page table.
++ *
++ * NOTE: This function is marked __ref because it calls __init function
++ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
++ *
++ * Panics if no page could be obtained or the result is not page aligned.
++ */
++static __ref void *spp_getpage(void)
++{
++      void *page;
++
++      if (after_bootmem) {
++              page = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
++      } else if (pgt_buf_end >= pgt_buf_top) {
++              /* Page table buffer used up: fall back to bootmem. */
++              page = alloc_bootmem_pages(PAGE_SIZE);
++      } else {
++              page = __va(get_table_end() << PAGE_SHIFT);
++              clear_page(page);
++      }
++
++      if (page == NULL || ((unsigned long)page & ~PAGE_MASK) != 0)
++              panic("set_pte_phys: cannot allocate page data %s\n",
++                      after_bootmem ? "after bootmem" : "");
++
++      pr_debug("spp_getpage %p\n", page);
++
++      return page;
++}
++
++/*
++ * Return the pud slot for @vaddr below @pgd, first allocating and
++ * installing a pud page if @pgd is still empty.
++ */
++static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
++{
++      pud_t *fresh;
++
++      if (!pgd_none(*pgd))
++              return pud_offset(pgd, vaddr);
++
++      fresh = (pud_t *)spp_getpage();
++      if (after_bootmem) {
++              pgd_populate(&init_mm, pgd, fresh);
++      } else {
++              /* Early boot: write-protect the page, then install it via Xen. */
++              make_page_readonly(fresh, XENFEAT_writable_page_tables);
++              xen_l4_entry_update(pgd, __pgd(__pa(fresh) | _PAGE_TABLE));
++      }
++
++      if (pud_offset(pgd, 0) != fresh)
++              printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
++                     fresh, pud_offset(pgd, 0));
++
++      return pud_offset(pgd, vaddr);
++}
++
++/*
++ * Return the pmd slot for @vaddr below @pud, first allocating and
++ * installing a pmd page if @pud is still empty.
++ */
++static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
++{
++      pmd_t *fresh;
++
++      if (!pud_none(*pud))
++              return pmd_offset(pud, vaddr);
++
++      fresh = (pmd_t *) spp_getpage();
++      if (after_bootmem) {
++              pud_populate(&init_mm, pud, fresh);
++      } else {
++              /* Early boot: write-protect the page, then install it via Xen. */
++              make_page_readonly(fresh, XENFEAT_writable_page_tables);
++              xen_l3_entry_update(pud, __pud(__pa(fresh) | _PAGE_TABLE));
++      }
++
++      if (pmd_offset(pud, 0) != fresh)
++              printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
++                     fresh, pmd_offset(pud, 0));
++
++      return pmd_offset(pud, vaddr);
++}
++
++/*
++ * Return the pte slot for @vaddr below @pmd, first allocating a pte page,
++ * write-protecting it and installing it if @pmd is still empty.
++ */
++static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
++{
++      pte_t *fresh;
++
++      if (!pmd_none(*pmd))
++              return pte_offset_kernel(pmd, vaddr);
++
++      fresh = (pte_t *) spp_getpage();
++      make_page_readonly(fresh, XENFEAT_writable_page_tables);
++      pmd_populate_kernel(&init_mm, pmd, fresh);
++
++      if (pte_offset_kernel(pmd, 0) != fresh)
++              printk(KERN_ERR "PAGETABLE BUG #02!\n");
++
++      return pte_offset_kernel(pmd, vaddr);
++}
++
++/*
++ * Install @new_pte for @vaddr underneath the pud page @pud_page, creating
++ * the pmd and pte levels on the way if they are missing, then flush the
++ * TLB entry for @vaddr.
++ */
++void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
++{
++      pmd_t *pmd = fill_pmd(pud_page + pud_index(vaddr), vaddr);
++      pte_t *slot = fill_pte(pmd, vaddr);
++
++      set_pte(slot, new_pte);
++
++      /*
++       * Only this single mapping changed, so flushing it alone suffices
++       * (PGE mappings get flushed as well).
++       */
++      __flush_tlb_one(vaddr);
++}
++
++/*
++ * Install @pteval for the kernel virtual address @vaddr. The top level
++ * entry has to exist already; if it does not, an error is logged and
++ * nothing is changed.
++ */
++void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
++{
++      pgd_t *top;
++
++      pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
++
++      top = pgd_offset_k(vaddr);
++      if (!pgd_none(*top)) {
++              set_pte_vaddr_pud((pud_t*)pgd_page_vaddr(*top), vaddr, pteval);
++              return;
++      }
++
++      printk(KERN_ERR
++              "PGD FIXMAP MISSING, it should be setup in head.S!\n");
++}
++
++/*
++ * Make sure the pud and pmd levels exist for the kernel address @vaddr
++ * and return the pmd slot covering it.
++ */
++pmd_t * __init populate_extra_pmd(unsigned long vaddr)
++{
++      pud_t *pud = fill_pud(pgd_offset_k(vaddr), vaddr);
++
++      return fill_pmd(pud, vaddr);
++}
++
++/*
++ * Make sure all page table levels exist for the kernel address @vaddr
++ * and return the pte slot covering it.
++ */
++pte_t * __init populate_extra_pte(unsigned long vaddr)
++{
++      return fill_pte(populate_extra_pmd(vaddr), vaddr);
++}
++
++#ifndef CONFIG_XEN
++/*
++ * Create large page table mappings for a range of physical addresses.
++ *
++ * @phys and @size must both be PMD_SIZE aligned (BUG otherwise). The range
++ * is mapped at its __va() address, one pmd entry per PMD_SIZE step, using
++ * the protection bits in @prot. Missing pud/pmd pages are taken from
++ * spp_getpage(); a pmd slot that is already populated triggers a BUG.
++ */
++static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
++                                              pgprot_t prot)
++{
++      pgd_t *pgd;
++      pud_t *pud;
++      pmd_t *pmd;
++
++      BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
++      for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
++              pgd = pgd_offset_k((unsigned long)__va(phys));
++              if (pgd_none(*pgd)) {
++                      pud = (pud_t *) spp_getpage();
++                      set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
++                                              _PAGE_USER));
++              }
++              pud = pud_offset(pgd, (unsigned long)__va(phys));
++              if (pud_none(*pud)) {
++                      pmd = (pmd_t *) spp_getpage();
++                      set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
++                                              _PAGE_USER));
++              }
++              pmd = pmd_offset(pud, phys); /* NOTE(review): indexed with phys, not __va(phys) as the levels above — presumably equivalent for the pmd index; confirm */
++              BUG_ON(!pmd_none(*pmd));
++              set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
++      }
++}
++/* Map [phys, phys + size) via __init_extra_mapping() with PAGE_KERNEL_LARGE. */
++void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
++{
++      __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
++}
++/* Map [phys, phys + size) via __init_extra_mapping() with PAGE_KERNEL_LARGE_NOCACHE. */
++void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
++{
++      __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
++}
++
++/*
++ * The head.S code sets up the kernel high mapping:
++ *
++ *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
++ *
++ * phys_addr holds the negative offset to the kernel, which is added
++ * to the compile time generated pmds. This results in invalid pmds up
++ * to the point where we hit the physaddr 0 mapping.
++ *
++ * We limit the mappings to the region from _text to _brk_end.  _brk_end
++ * is rounded up to the 2MB boundary. This catches the invalid pmds as
++ * well, as they are located before _text:
++ */
++void __init cleanup_highmap(void)
++{
++      const unsigned long text_start = (unsigned long) _text;
++      const unsigned long keep_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
++      const unsigned long limit = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
++      unsigned long vaddr;
++      pmd_t *pmd;
++
++      for (vaddr = __START_KERNEL_map, pmd = level2_kernel_pgt;
++           vaddr + PMD_SIZE - 1 < limit;
++           vaddr += PMD_SIZE, pmd++) {
++              /* Entries within [_text, rounded-up _brk_end) stay. */
++              if (vaddr >= text_start && vaddr <= keep_end)
++                      continue;
++
++              /* Everything else that is populated gets cleared. */
++              if (!pmd_none(*pmd))
++                      set_pmd(pmd, __pmd(0));
++      }
++}
++#endif
++
++/*
++ * Allocate one zeroed page for a page table and store its physical
++ * address in *@phys. While after_bootmem is not set, the page is taken
++ * from the page table buffer and the returned pointer is a temporary
++ * early_memremap() mapping; panics when the buffer is exhausted.
++ * Afterwards the page comes from get_zeroed_page().
++ */
++static __ref void *alloc_low_page(unsigned long *phys)
++{
++      unsigned long frame;
++      void *page;
++
++      if (!after_bootmem) {
++              frame = get_table_end();
++              if (frame >= pgt_buf_top)
++                      panic("alloc_low_page: ran out of memory");
++
++              page = early_memremap(frame * PAGE_SIZE, PAGE_SIZE);
++              clear_page(page);
++              *phys  = frame * PAGE_SIZE;
++              return page;
++      }
++
++      page = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
++      *phys = __pa(page);
++
++      return page;
++}
++
++/*
++ * Return a pointer through which the page table memory at @virt can be
++ * accessed. Once after_bootmem is set this is @virt itself; before that
++ * the containing page is mapped with early_memremap_ro() and the offset
++ * of @virt within its page is preserved in the result.
++ */
++static __ref void *map_low_page(void *virt)
++{
++      unsigned long phys, offset, mapped;
++
++      if (after_bootmem)
++              return virt;
++
++      phys = __pa(virt);
++      offset = phys & (PAGE_SIZE - 1);
++      mapped = (unsigned long)early_memremap_ro(phys & PAGE_MASK, PAGE_SIZE);
++
++      return (void *)(mapped | offset);
++}
++
++/*
++ * Drop the temporary early mapping of the page containing @adr. Nothing
++ * to do once after_bootmem is set.
++ */
++static __ref void unmap_low_page(void *adr)
++{
++      if (!after_bootmem)
++              early_iounmap((void *)((unsigned long)adr & PAGE_MASK),
++                            PAGE_SIZE);
++}
++/*
++ * Decide whether the page at physical address @paddr must be mapped
++ * without write permission; returns 1 if so, 0 otherwise (phys_pte_init()
++ * clears _PAGE_RW on a non-zero result). Four cases are checked: new page
++ * tables (only while max_pfn_mapped is still 0), the old page tables from
++ * xen_start_info->pt_base up to pgt_buf_end, the P->M table when its
++ * mfn_list lies below __START_KERNEL_map, and the kernel image except for
++ * the vsyscall page. The first three cases apply only without
++ * XENFEAT_writable_page_tables.
++ */
++static inline int __meminit make_readonly(unsigned long paddr)
++{
++      extern char __vsyscall_0;
++      int readonly = 0;
++
++      /* Make new page tables read-only on the first pass. */
++      if (!xen_feature(XENFEAT_writable_page_tables)
++          && !max_pfn_mapped
++          && (paddr >= (pgt_buf_start << PAGE_SHIFT))) {
++              unsigned long top = pgt_buf_top;
++              unsigned int i;
++
++              /* Account for the ranges get_table_end() skips. */
++              for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
++                      const struct reserved_pfn_range *range;
++
++                      range = reserved_pfn_ranges + i;
++                      if (!range->nr)
++                              continue;
++                      if (pgt_buf_end <= range->pfn && top > range->pfn) {
++                              if (paddr > (range->pfn << PAGE_SHIFT) /* NOTE(review): strict '>' does not treat the very start of a skipped range as inside it — confirm this is intended */
++                                  && paddr < ((range->pfn + range->nr)
++                                              << PAGE_SHIFT))
++                                      break; /* inside a skipped range: i stays below ARRAY_SIZE, so readonly is not set below */
++                              top += range->nr;
++                      }
++              }
++              if (paddr < (top << PAGE_SHIFT))
++                      readonly = (i >= ARRAY_SIZE(reserved_pfn_ranges));
++      }
++      /* Make old page tables read-only. */
++      if (!xen_feature(XENFEAT_writable_page_tables)
++          && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
++          && (paddr < (pgt_buf_end << PAGE_SHIFT)))
++              readonly = 1;
++      /* Make P->M table (and its page tables) read-only. */
++      if (!xen_feature(XENFEAT_writable_page_tables)
++          && xen_start_info->mfn_list < __START_KERNEL_map
++          && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
++          && paddr < (xen_start_info->first_p2m_pfn
++                      + xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
++              readonly = 1;
++
++      /*
++       * No need for writable mapping of kernel image. This also ensures that
++       * page and descriptor tables embedded inside don't have writable
++       * mappings. Exclude the vsyscall area here, allowing alternative
++       * instruction patching to work. The range must be in sync with that
++       * passed to reserve_early() (as "TEXT DATA BSS"), since all other
++       * regions can be allocated from under CONFIG_NO_BOOTMEM and thus must
++       * be writable.
++       */
++      if ((paddr >= __pa_symbol(&_text))
++            && (paddr < (__pa_symbol(__bss_stop) & PAGE_MASK))
++          && !(paddr >= __pa_symbol(&__vsyscall_0)
++               && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
++              readonly = 1;
++
++      return readonly;
++}
++/*
++ * Fill the pte page @pte_page for [addr, end), starting at the slot for
++ * @addr and going no further than the end of this page table page. While
++ * after_bootmem is not set, frames at or beyond xen_start_info->nr_pages
++ * are not mapped. Slots that are already populated are left untouched.
++ * New entries get @prot, minus _PAGE_RW where make_readonly() asks for it.
++ * Returns the address just past the last page newly mapped here, or @end
++ * if nothing new was mapped.
++ */
++static unsigned long __meminit
++phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
++            pgprot_t prot)
++{
++      unsigned pages = 0;
++      unsigned long last_map_addr = end;
++      int i;
++
++      pte_t *pte = pte_page + pte_index(addr);
++
++      for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
++              unsigned long pteval = addr | pgprot_val(prot);
++
++              if (addr >= end ||
++                  (!after_bootmem &&
++                   (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages))
++                      break;
++
++              /*
++               * We will re-use the existing mapping.
++               * Xen for example has some special requirements, like mapping
++               * pagetable pages as RO. So assume someone who pre-setup
++               * these mappings are more intelligent.
++               */
++              if (__pte_val(*pte)) {
++                      pages++;
++                      continue;
++              }
++
++              if (make_readonly(addr))
++                      pteval &= ~_PAGE_RW;
++              if (0) /* debug output, compiled out */
++                      printk("   pte=%p addr=%lx pte=%016lx\n",
++                             pte, addr, pteval);
++              pages++;
++              if (!after_bootmem)
++                      *pte = __pte(pteval & __supported_pte_mask); /* early: store directly instead of going through set_pte() */
++              else
++                      set_pte(pte, __pte(pteval & __supported_pte_mask));
++              last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
++      }
++
++      update_page_count(PG_LEVEL_4K, pages);
++
++      return last_map_addr;
++}
++/*
++ * Populate the pmd page @pmd_page for [address, end), going no further
++ * than the end of this pmd page. A slot that already points to a pte page
++ * is descended into; an existing large mapping is kept when
++ * @page_size_mask allows 2M pages and is otherwise replaced by a pte page
++ * that reuses its protection bits. Empty slots become a 2M mapping if the
++ * mask allows it, else a freshly allocated pte page filled through
++ * phys_pte_init(). Returns the end address of the last mapping set up
++ * (initially @end).
++ */
++static unsigned long __meminit
++phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
++            unsigned long page_size_mask, pgprot_t prot)
++{
++      unsigned long pages = 0;
++      unsigned long last_map_addr = end;
++
++      int i = pmd_index(address);
++
++      for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
++              unsigned long pte_phys;
++              pmd_t *pmd = pmd_page + pmd_index(address);
++              pte_t *pte;
++              pgprot_t new_prot = prot;
++
++              if (address >= end)
++                      break;
++
++              if (__pmd_val(*pmd)) {
++                      if (!pmd_large(*pmd)) {
++                              spin_lock(&init_mm.page_table_lock);
++                              pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
++                              last_map_addr = phys_pte_init(pte, address,
++                                                              end, prot);
++                              unmap_low_page(pte);
++                              spin_unlock(&init_mm.page_table_lock);
++                              continue;
++                      }
++                      /*
++                       * If we are ok with PG_LEVEL_2M mapping, then we will
++                       * use the existing mapping,
++                       *
++                       * Otherwise, we will split the large page mapping but
++                       * use the same existing protection bits except for
++                       * large page, so that we don't violate Intel's TLB
++                       * Application note (317080) which says, while changing
++                       * the page sizes, new and old translations should
++                       * not differ with respect to page frame and
++                       * attributes.
++                       */
++                      if (page_size_mask & (1 << PG_LEVEL_2M)) {
++                              pages++;
++                              continue;
++                      }
++                      new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
++              }
++
++              if (page_size_mask & (1<<PG_LEVEL_2M)) {
++                      pages++;
++                      spin_lock(&init_mm.page_table_lock);
++                      set_pte((pte_t *)pmd,
++                              pfn_pte(address >> PAGE_SHIFT,
++                                      __pgprot(pgprot_val(prot) | _PAGE_PSE)));
++                      spin_unlock(&init_mm.page_table_lock);
++                      last_map_addr = (address & PMD_MASK) + PMD_SIZE;
++                      continue;
++              }
++
++              pte = alloc_low_page(&pte_phys);
++              last_map_addr = phys_pte_init(pte, address, end, new_prot);
++              unmap_low_page(pte);
++
++              if (!after_bootmem) {
++                      if (max_pfn_mapped) /* after the first pass the new pte page is write-protected explicitly */
++                              make_page_readonly(__va(pte_phys),
++                                                 XENFEAT_writable_page_tables);
++                      *pmd = __pmd(pte_phys | _PAGE_TABLE);
++              } else {
++                      spin_lock(&init_mm.page_table_lock);
++                      pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
++                      spin_unlock(&init_mm.page_table_lock);
++              }
++      }
++      update_page_count(PG_LEVEL_2M, pages);
++      return last_map_addr;
++}
++/*
++ * Populate the pud page @pud_page for [addr, end), going no further than
++ * the end of this pud page. A slot that already points to a pmd page is
++ * descended into; an existing 1G mapping is kept when @page_size_mask
++ * allows 1G pages and is otherwise replaced by a pmd page that reuses its
++ * protection bits. Empty slots become a 1G mapping if the mask allows it,
++ * else a freshly allocated pmd page filled through phys_pmd_init().
++ * While after_bootmem is not set, bit PG_LEVEL_NUM of @page_size_mask
++ * selects writing the pud entry through HYPERVISOR_mmu_update() instead
++ * of a direct store. Returns the end address of the last mapping set up
++ * (initially @end).
++ */
++static unsigned long __meminit
++phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
++                       unsigned long page_size_mask)
++{
++      unsigned long pages = 0;
++      unsigned long last_map_addr = end;
++      int i = pud_index(addr);
++
++      for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
++              unsigned long pmd_phys;
++              pud_t *pud = pud_page + pud_index(addr);
++              pmd_t *pmd;
++              pgprot_t prot = PAGE_KERNEL;
++
++              if (addr >= end)
++                      break;
++
++              if (__pud_val(*pud)) {
++                      if (!pud_large(*pud)) {
++                              pmd = map_low_page(pmd_offset(pud, 0));
++                              last_map_addr = phys_pmd_init(pmd, addr, end,
++                                                       page_size_mask, prot);
++                              unmap_low_page(pmd);
++                              __flush_tlb_all();
++                              continue;
++                      }
++                      /*
++                       * If we are ok with PG_LEVEL_1G mapping, then we will
++                       * use the existing mapping.
++                       *
++                       * Otherwise, we will split the gbpage mapping but use
++                       * the same existing protection  bits except for large
++                       * page, so that we don't violate Intel's TLB
++                       * Application note (317080) which says, while changing
++                       * the page sizes, new and old translations should
++                       * not differ with respect to page frame and
++                       * attributes.
++                       */
++                      if (page_size_mask & (1 << PG_LEVEL_1G)) {
++                              pages++;
++                              continue;
++                      }
++                      prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
++              }
++
++              if (page_size_mask & (1<<PG_LEVEL_1G)) {
++                      pages++;
++                      spin_lock(&init_mm.page_table_lock);
++                      set_pte((pte_t *)pud,
++                              pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
++                      spin_unlock(&init_mm.page_table_lock);
++                      last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
++                      continue;
++              }
++
++              pmd = alloc_low_page(&pmd_phys);
++              last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
++                                            prot);
++              unmap_low_page(pmd);
++
++              if (!after_bootmem) {
++                      if (max_pfn_mapped) /* after the first pass the new pmd page is write-protected explicitly */
++                              make_page_readonly(__va(pmd_phys),
++                                                 XENFEAT_writable_page_tables);
++                      if (page_size_mask & (1 << PG_LEVEL_NUM)) {
++                              mmu_update_t u;
++
++                              u.ptr = arbitrary_virt_to_machine(pud);
++                              u.val = phys_to_machine(pmd_phys) | _PAGE_TABLE;
++                              if (HYPERVISOR_mmu_update(&u, 1, NULL,
++                                                        DOMID_SELF) < 0)
++                                      BUG();
++                      } else
++                              *pud = __pud(pmd_phys | _PAGE_TABLE);
++              } else {
++                      spin_lock(&init_mm.page_table_lock);
++                      pud_populate(&init_mm, pud, __va(pmd_phys));
++                      spin_unlock(&init_mm.page_table_lock);
++              }
++      }
++      __flush_tlb_all();
++
++      update_page_count(PG_LEVEL_1G, pages);
++
++      return last_map_addr;
++}
++/*
++ * Build the kernel's own top level page tables (init_level4_pgt) from the
++ * ones handed over in xen_start_info->pt_base: copy the kernel-range pud
++ * entries into level3_kernel_pgt, carry over the top level entry of the
++ * P->M table if its index is below that of __START_KERNEL_map, hook up
++ * the fixmap tables, write-protect the statically allocated table pages
++ * and, without XENFEAT_writable_page_tables, pin init_level4_pgt.
++ * With CONFIG_XEN_COMPAT <= 0x030002 it also probes whether kernel PTEs
++ * need an explicit _PAGE_USER and records that in __kernel_page_user.
++ */
++void __init xen_init_pt(void)
++{
++      unsigned long addr, *page;
++
++      /* Find the initial pte page that was built for us. */
++      page = (unsigned long *)xen_start_info->pt_base;
++      addr = page[pgd_index(__START_KERNEL_map)];
++      addr_to_page(addr, page); /* page is indexed with pud_index() from here on */
++
++#if CONFIG_XEN_COMPAT <= 0x030002
++      /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
++         in kernel PTEs. We check that here. */
++      if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
++              unsigned long *pg;
++              pte_t pte;
++
++              /* Mess with the initial mapping of page 0. It's not needed. */
++              BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
++              addr = page[pud_index(__START_KERNEL_map)];
++              addr_to_page(addr, pg);
++              addr = pg[pmd_index(__START_KERNEL_map)];
++              addr_to_page(addr, pg);
++              pte.pte = pg[pte_index(__START_KERNEL_map)];
++              BUG_ON(!(pte.pte & _PAGE_PRESENT));
++
++              /* If _PAGE_USER isn't set, we obviously do not need it. */
++              if (pte.pte & _PAGE_USER) {
++                      /* _PAGE_USER is needed, but is it set implicitly? */
++                      pte.pte &= ~_PAGE_USER;
++                      if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
++                                                        pte, 0) != 0) ||
++                          !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
++                              /* We need to explicitly specify _PAGE_USER. */
++                              __kernel_page_user = _PAGE_USER;
++              }
++      }
++#endif
++
++      /* Construct mapping of initial pte page in our own directories. */
++      init_level4_pgt[pgd_index(__START_KERNEL_map)] = 
++              __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
++      memcpy(level3_kernel_pgt + pud_index(__START_KERNEL_map),
++             page + pud_index(__START_KERNEL_map),
++             (PTRS_PER_PUD - pud_index(__START_KERNEL_map))
++             * sizeof(*level3_kernel_pgt));
++
++      /* Copy the initial P->M table mappings if necessary. */
++      addr = pgd_index(xen_start_info->mfn_list); /* addr temporarily holds a top level index, not an address */
++      if (addr < pgd_index(__START_KERNEL_map))
++              init_level4_pgt[addr] =
++                      ((pgd_t *)xen_start_info->pt_base)[addr];
++
++      /* Do an early initialization of the fixmap area. */
++      addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
++      if (pud_present(level3_kernel_pgt[pud_index(addr)])) {
++              unsigned long adr = page[pud_index(addr)];
++
++              addr_to_page(adr, page);
++              copy_page(level2_fixmap_pgt, page); /* start from the pmd page that is already in place for this slot */
++      }
++      level3_kernel_pgt[pud_index(addr)] =
++              __pud(__pa_symbol(level2_fixmap_pgt) | _PAGE_TABLE);
++      level2_fixmap_pgt[pmd_index(addr)] =
++              __pmd(__pa_symbol(level1_fixmap_pgt) | _PAGE_TABLE);
++
++      early_make_page_readonly(init_level4_pgt,
++                               XENFEAT_writable_page_tables);
++      early_make_page_readonly(level3_kernel_pgt,
++                               XENFEAT_writable_page_tables);
++      early_make_page_readonly(level3_user_pgt,
++                               XENFEAT_writable_page_tables);
++      early_make_page_readonly(level2_fixmap_pgt,
++                               XENFEAT_writable_page_tables);
++      early_make_page_readonly(level1_fixmap_pgt,
++                               XENFEAT_writable_page_tables);
++
++      if (!xen_feature(XENFEAT_writable_page_tables))
++              xen_pgd_pin(init_level4_pgt);
++}
++
++/*
++ * Finish the switch from the Xen-provided initial mappings to the kernel's
++ * own: re-derive pointers into the start info via __va(__pa()), unpin the
++ * page tables Xen handed over, and clear the initial mappings from the
++ * page-aligned _brk_end up to the start of the page table buffer.
++ * Any hypercall failure here is fatal (BUG()).
++ */
++void __init xen_finish_init_mapping(void)
++{
++      unsigned long start, end;
++      struct mmuext_op mmuext;
++
++      /* Re-vector virtual addresses pointing into the initial
++         mapping to the just-established permanent ones. */
++      xen_start_info = __va(__pa(xen_start_info));
++      xen_start_info->pt_base = (unsigned long)
++              __va(__pa(xen_start_info->pt_base));
++      if (!xen_feature(XENFEAT_auto_translated_physmap)
++          && xen_start_info->mfn_list >= __START_KERNEL_map)
++              phys_to_machine_mapping =
++                      __va(__pa(xen_start_info->mfn_list));
++
++      /* Unpin the no longer used Xen provided page tables. */
++      mmuext.cmd = MMUEXT_UNPIN_TABLE;
++      mmuext.arg1.mfn = virt_to_mfn(xen_start_info->pt_base);
++      if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
++              BUG();
++
++      /* Destroy the Xen-created mappings beyond the kernel image. */
++      start = PAGE_ALIGN(_brk_end);
++      end   = __START_KERNEL_map + (pgt_buf_start << PAGE_SHIFT);
++      for (; start < end; start += PAGE_SIZE)
++              if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
++                      BUG();
++
++      /*
++       * Complain if the page table buffer was not consumed exactly, and
++       * raise pgt_buf_top to pgt_buf_end if the latter has run past it.
++       */
++      WARN(pgt_buf_end != pgt_buf_top, "start=%lx cur=%lx top=%lx\n",
++           pgt_buf_start, pgt_buf_end, pgt_buf_top);
++      if (pgt_buf_end > pgt_buf_top)
++              pgt_buf_top = pgt_buf_end;
++}
++
++/*
++ * Map the physical range [start, end) into the kernel address space
++ * obtained through __va(), working one PGD entry at a time.
++ *
++ * @start, @end:    physical addresses delimiting the range
++ * @page_size_mask: passed through to phys_pud_init()
++ *
++ * Returns the value of the last phys_pud_init() call, or @end (as passed
++ * in) if the loop body never runs.
++ */
++unsigned long __meminit
++kernel_physical_mapping_init(unsigned long start,
++                           unsigned long end,
++                           unsigned long page_size_mask)
++{
++      bool pgd_changed = false;
++      unsigned long next, last_map_addr = end;
++      unsigned long addr;
++
++      start = (unsigned long)__va(start);
++      end = (unsigned long)__va(end);
++      /* Remember the virtual start for the final sync_global_pgds(). */
++      addr = start;
++
++      for (; start < end; start = next) {
++              pgd_t *pgd = pgd_offset_k(start);
++              unsigned long pud_phys;
++              pud_t *pud;
++
++              /* Advance to the next PGD boundary, clamped to the end. */
++              next = (start + PGDIR_SIZE) & PGDIR_MASK;
++              if (next > end)
++                      next = end;
++
++              /*
++               * PGD entry already populated: extend the existing PUD page.
++               * NOTE(review): the extra (1 << PG_LEVEL_NUM) bit is passed
++               * only on this path; its meaning is defined by
++               * phys_pud_init(), which is not visible here.
++               */
++              if (__pgd_val(*pgd)) {
++                      pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
++                      last_map_addr = phys_pud_init(pud, __pa(start),
++                              __pa(end), page_size_mask | (1 << PG_LEVEL_NUM));
++                      unmap_low_page(pud);
++                      continue;
++              }
++
++              /* Otherwise build a fresh PUD page and hook it into the PGD. */
++              pud = alloc_low_page(&pud_phys);
++              last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
++                                               page_size_mask);
++              unmap_low_page(pud);
++
++              if (!after_bootmem) {
++                      /*
++                       * Early boot: once something is mapped
++                       * (max_pfn_mapped != 0) the new PUD page is made
++                       * read-only before the PGD entry is written.
++                       */
++                      if (max_pfn_mapped)
++                              make_page_readonly(__va(pud_phys),
++                                                 XENFEAT_writable_page_tables);
++                      xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
++              } else {
++                      /* Later: populate under init_mm's page table lock. */
++                      spin_lock(&init_mm.page_table_lock);
++                      pgd_populate(&init_mm, pgd, __va(pud_phys));
++                      spin_unlock(&init_mm.page_table_lock);
++                      pgd_changed = true;
++              }
++      }
++
++      /* Only PGD entries added after bootmem need syncing. */
++      if (pgd_changed)
++              sync_global_pgds(addr, end);
++
++      return last_map_addr;
++}
++
++#ifndef CONFIG_NUMA
++/*
++ * Non-NUMA memory setup: register the pfn range 0..max_pfn as active and,
++ * under Xen, reserve whatever lies beyond the domain's initial allocation.
++ */
++void __init initmem_init(void)
++{
++      memblock_x86_register_active_regions(0, 0, max_pfn);
++#ifdef CONFIG_XEN
++      /*
++       * Frames between xen_start_info->nr_pages and max_pfn are reserved
++       * under the name "BALLOON".
++       */
++      if (max_pfn > xen_start_info->nr_pages)
++              memblock_x86_reserve_range(xen_start_info->nr_pages << PAGE_SHIFT,
++                                         max_pfn << PAGE_SHIFT, "BALLOON");
++#endif
++}
++#endif
++
++/*
++ * Set up the zone limits and sparse memory model, initialise the free
++ * areas of all nodes and finally pin the kernel page directory for Xen.
++ */
++void __init paging_init(void)
++{
++      /* Upper pfn limit of each zone; zones not listed stay at zero. */
++      unsigned long zone_limits[MAX_NR_ZONES] = {
++              [ZONE_DMA]      = MAX_DMA_PFN,
++              [ZONE_DMA32]    = MAX_DMA32_PFN,
++              [ZONE_NORMAL]   = max_pfn,
++      };
++
++      sparse_memory_present_with_active_regions(MAX_NUMNODES);
++      sparse_init();
++
++      /*
++       * Drop the default N_NORMAL_MEMORY setting of node 0.
++       * nodes_clear() must not be used for this: without NUMA support
++       * it really clears the state, and a later node_set_state() would
++       * not set it back.
++       */
++      node_clear_state(0, N_NORMAL_MEMORY);
++
++      free_area_init_nodes(zone_limits);
++
++      xen_init_pgd_pin();
++}
++
++/*
++ * Memory hotplug specific functions
++ */
++#ifdef CONFIG_MEMORY_HOTPLUG
++/*
++ * Memory hotplug may extend memory past the current end: grow max_pfn,
++ * max_low_pfn and high_memory when [start, start + size) ends beyond
++ * max_pfn, and leave them untouched otherwise.
++ */
++static void  update_end_of_memory_vars(u64 start, u64 size)
++{
++      unsigned long new_end_pfn = PFN_UP(start + size);
++
++      if (new_end_pfn <= max_pfn)
++              return;
++
++      max_pfn = new_end_pfn;
++      max_low_pfn = new_end_pfn;
++      high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
++}
++
++/*
++ * Hot-add the physical range [start, start + size) to node @nid.
++ * The memory always goes into the NORMAL zone, so no additional
++ * DMA/DMA32 memory can ever be gained this way.
++ *
++ * Returns the result of __add_pages() (a warning is issued once if it is
++ * non-zero).
++ */
++int arch_add_memory(int nid, u64 start, u64 size)
++{
++      struct zone *normal_zone = NODE_DATA(nid)->node_zones + ZONE_NORMAL;
++      unsigned long first_pfn = start >> PAGE_SHIFT;
++      unsigned long page_count = size >> PAGE_SHIFT;
++      unsigned long mapped_end_pfn;
++      int err;
++
++      /* Extend the kernel mapping over the new range first. */
++      mapped_end_pfn = init_memory_mapping(start, start + size);
++      if (max_pfn_mapped < mapped_end_pfn)
++              max_pfn_mapped = mapped_end_pfn;
++
++      err = __add_pages(nid, normal_zone, first_pfn, page_count);
++      WARN_ON_ONCE(err);
++
++      /* max_pfn, max_low_pfn and high_memory may have to grow */
++      update_end_of_memory_vars(start, size);
++
++      return err;
++}
++EXPORT_SYMBOL_GPL(arch_add_memory);
++
++#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
++/*
++ * NUMA kernel without ACPI NUMA support: every hot-added physical address
++ * is attributed to node 0, regardless of @start.
++ */
++int memory_add_physaddr_to_nid(u64 start)
++{
++      return 0;
++}
++EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
++#endif
++
++#endif /* CONFIG_MEMORY_HOTPLUG */
++
++static struct kcore_list kcore_vsyscall;
++
++/*
++ * Release boot memory to the page allocator, initialise the page structs
++ * of frames beyond the initial Xen allocation, register the vsyscall area
++ * with /proc/kcore and print the boot-time memory summary.
++ */
++void __init mem_init(void)
++{
++      long codesize, reservedpages, datasize, initsize;
++      unsigned long absent_pages;
++      unsigned long pfn;
++
++      pci_iommu_alloc();
++
++      /* clear_bss() has already cleared the empty_zero_page */
++
++      reservedpages = 0;
++
++      /* this will put all low memory onto the freelists */
++#ifdef CONFIG_NUMA
++      totalram_pages = numa_free_all_bootmem();
++#else
++      totalram_pages = free_all_bootmem();
++#endif
++
++      /* XEN: init pages outside initial allocation. */
++      for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
++              ClearPageReserved(pfn_to_page(pfn));
++              init_page_count(pfn_to_page(pfn));
++      }
++
++      absent_pages = absent_pages_in_range(0, max_pfn);
++      reservedpages = max_pfn - totalram_pages - absent_pages;
++      after_bootmem = 1;
++
++      codesize =  (unsigned long) &_etext - (unsigned long) &_text;
++      datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
++      initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
++
++      /* Register memory areas for /proc/kcore */
++      kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
++                       VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
++
++      /*
++       * absent_pages is unsigned long, so it is printed with %lu; the
++       * long-typed sizes keep %ld.
++       */
++      printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
++                       "%luk absent, %ldk reserved, %ldk data, %ldk init)\n",
++              nr_free_pages() << (PAGE_SHIFT-10),
++              max_pfn << (PAGE_SHIFT-10),
++              codesize >> 10,
++              absent_pages << (PAGE_SHIFT-10),
++              reservedpages << (PAGE_SHIFT-10),
++              datasize >> 10,
++              initsize >> 10);
++}
++
++#ifdef CONFIG_DEBUG_RODATA
++/*
++ * Exported read-only constant.
++ * NOTE(review): presumably the probe target of rodata_test() - confirm.
++ */
++const int rodata_test_data = 0xC3;
++EXPORT_SYMBOL_GPL(rodata_test_data);
++
++/*
++ * Set to 1 by mark_rodata_ro(); until then set_kernel_text_rw() and
++ * set_kernel_text_ro() return without doing anything.
++ */
++int kernel_set_to_readonly;
++
++/*
++ * Make the kernel text, from _text up to __stop___ex_table (both rounded
++ * with PFN_ALIGN()), writable.  A no-op until the kernel has been set
++ * read-only.
++ */
++void set_kernel_text_rw(void)
++{
++      unsigned long text_begin, text_end;
++
++      if (!kernel_set_to_readonly)
++              return;
++
++      text_begin = PFN_ALIGN(_text);
++      text_end = PFN_ALIGN(__stop___ex_table);
++
++      pr_debug("Set kernel text: %lx - %lx for read write\n",
++               text_begin, text_end);
++
++      /*
++       * Only the kernel identity mapping for text becomes RW; the kernel
++       * text mapping itself always stays RO.  See the comment in
++       * static_protections() in pageattr.c
++       */
++      set_memory_rw(text_begin, (text_end - text_begin) >> PAGE_SHIFT);
++}
++
++/*
++ * Counterpart of set_kernel_text_rw(): write-protect the kernel text, from
++ * _text up to __stop___ex_table (both rounded with PFN_ALIGN()), again.
++ * A no-op until the kernel has been set read-only.
++ */
++void set_kernel_text_ro(void)
++{
++      unsigned long text_begin, text_end;
++
++      if (!kernel_set_to_readonly)
++              return;
++
++      text_begin = PFN_ALIGN(_text);
++      text_end = PFN_ALIGN(__stop___ex_table);
++
++      pr_debug("Set kernel text: %lx - %lx for read only\n",
++               text_begin, text_end);
++
++      /* Put the kernel identity mapping for text back to RO. */
++      set_memory_ro(text_begin, (text_end - text_begin) >> PAGE_SHIFT);
++}
++
++/*
++ * Write-protect the kernel from _text up to __end_rodata, make the rodata
++ * part non-executable, run the rodata self test and free the unused pages
++ * in the gaps after the text and after the rodata sections.
++ */
++void mark_rodata_ro(void)
++{
++      unsigned long start = PFN_ALIGN(_text);
++      /* __start_rodata rounded up to the next page boundary */
++      unsigned long rodata_start =
++              ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
++      unsigned long end = (unsigned long) &__end_rodata;
++      unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
++      unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
++      unsigned long data_start = (unsigned long) &_sdata;
++
++      printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
++             (end - start) >> 10);
++      set_memory_ro(start, (end - start) >> PAGE_SHIFT);
++
++      /* From here on set_kernel_text_rw()/set_kernel_text_ro() take effect. */
++      kernel_set_to_readonly = 1;
++
++      /*
++       * The rodata section (but not the kernel text!) should also be
++       * not-executable.
++       */
++      set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
++
++      rodata_test();
++
++#ifdef CONFIG_CPA_DEBUG
++      /* Exercise the attribute change code: undo and redo the protection. */
++      printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
++      set_memory_rw(start, (end-start) >> PAGE_SHIFT);
++
++      printk(KERN_INFO "Testing CPA: again\n");
++      set_memory_ro(start, (end-start) >> PAGE_SHIFT);
++#endif
++
++      /* Free the gap between the end of text and the start of rodata ... */
++      free_init_pages("unused kernel memory",
++                      (unsigned long) page_address(virt_to_page(text_end)),
++                      (unsigned long)
++                               page_address(virt_to_page(rodata_start)));
++      /* ... and the gap between the end of rodata and the start of data. */
++      free_init_pages("unused kernel memory",
++                      (unsigned long) page_address(virt_to_page(rodata_end)),
++                      (unsigned long) page_address(virt_to_page(data_start)));
++}
++
++#endif
++
++/*
++ * Tell whether the kernel virtual address @addr is currently mapped to a
++ * valid page frame.  Returns 0 when it is not; otherwise the result of
++ * pfn_valid() on the mapped frame (or 1 for addresses inside the Xen M2P
++ * table).
++ */
++int kern_addr_valid(unsigned long addr)
++{
++      unsigned long sign_bits = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
++      pgd_t *pgdp;
++      pud_t *pudp;
++      pmd_t *pmdp;
++      pte_t *ptep;
++
++      /* The bits above the virtual mask must be all zeroes or all ones. */
++      if (!(sign_bits == 0 || sign_bits == -1UL))
++              return 0;
++
++#ifdef CONFIG_XEN
++      /*
++       * Hypervisor addresses are never walked, with one exception: the
++       * M2P table stays accessible, e.g. through /proc/kcore.
++       */
++      if (addr >= (unsigned long)machine_to_phys_mapping &&
++          addr < (unsigned long)(machine_to_phys_mapping +
++                                 (1UL << machine_to_phys_order)))
++              return 1;
++      if (addr >= HYPERVISOR_VIRT_START && addr < HYPERVISOR_VIRT_END)
++              return 0;
++#endif
++
++      /* Walk down the kernel page tables, bailing out at the first hole. */
++      pgdp = pgd_offset_k(addr);
++      if (pgd_none(*pgdp))
++              return 0;
++
++      pudp = pud_offset(pgdp, addr);
++      if (pud_none(*pudp))
++              return 0;
++
++      pmdp = pmd_offset(pudp, addr);
++      if (pmd_none(*pmdp))
++              return 0;
++
++      /* A large PMD maps the frame directly; there is no PTE level. */
++      if (pmd_large(*pmdp))
++              return pfn_valid(pmd_pfn(*pmdp));
++
++      ptep = pte_offset_kernel(pmdp, addr);
++      return pte_none(*ptep) ? 0 : pfn_valid(pte_pfn(*ptep));
++}
++
++/*
++ * A pseudo VMA to allow ptrace access to the vsyscall page.  It covers
++ * only the 64-bit vsyscall page; 32-bit has a real VMA and does not need
++ * special handling anymore.
++ *
++ * The range spans VSYSCALL_MAPPED_PAGES pages starting at VSYSCALL_START
++ * and is flagged readable and executable, but not writable.
++ */
++static struct vm_area_struct gate_vma = {
++      .vm_start       = VSYSCALL_START,
++      .vm_end         = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
++      .vm_page_prot   = PAGE_READONLY_EXEC,
++      .vm_flags       = VM_READ | VM_EXEC
++};
++
++/*
++ * Return the vsyscall pseudo VMA applicable to @mm.  With IA32 emulation
++ * configured, NULL is returned when @mm is NULL or is an ia32-compat
++ * address space.
++ */
++struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
++{
++      struct vm_area_struct *gate = &gate_vma;
++
++#ifdef CONFIG_IA32_EMULATION
++      if (mm == NULL || mm->context.ia32_compat != 0)
++              gate = NULL;
++#endif
++      return gate;
++}
++
++/*
++ * Return 1 if @addr lies inside the gate VMA that applies to @mm, and 0
++ * otherwise (including when @mm has no gate VMA at all).
++ */
++int in_gate_area(struct mm_struct *mm, unsigned long addr)
++{
++      const struct vm_area_struct *gate = get_gate_vma(mm);
++
++      return gate && addr >= gate->vm_start && addr < gate->vm_end;
++}
++
++/*
++ * Variant of in_gate_area() for callers without a reliable mm, typically
++ * interrupt context.  It only checks @addr against the vsyscall range, so
++ * it is less reliable than using a task's mm and may give false
++ * positives.
++ */
++int in_gate_area_no_mm(unsigned long addr)
++{
++      if (addr < VSYSCALL_START)
++              return 0;
++
++      return addr < VSYSCALL_END;
++}
++
++/*
++ * Name the architecture-specific mappings: "[vdso]" for the VMA starting
++ * at its mm's vdso address, "[vsyscall]" for the gate VMA, and NULL for
++ * everything else.
++ */
++const char *arch_vma_name(struct vm_area_struct *vma)
++{
++      struct mm_struct *owner = vma->vm_mm;
++
++      if (owner != NULL && vma->vm_start == (long)owner->context.vdso)
++              return "[vdso]";
++
++      return vma == &gate_vma ? "[vsyscall]" : NULL;
++}
++
++#ifdef CONFIG_X86_UV
++#define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)
++
++/*
++ * Size of a memory block in bytes: 2GB on UV systems (announced with a
++ * log message on every call), one sparsemem section everywhere else.
++ */
++unsigned long memory_block_size_bytes(void)
++{
++      if (!is_uv_system())
++              return MIN_MEMORY_BLOCK_SIZE;
++
++      printk(KERN_INFO "UV: memory block size 2GB\n");
++      return 2UL * 1024 * 1024 * 1024;
++}
++#endif
++
++#ifdef CONFIG_SPARSEMEM_VMEMMAP
++/*
++ * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
++ *
++ * The statics below remember the most recent run of contiguous mappings
++ * (virtual range, backing block range and node) so that it can be printed
++ * as a single line once the run is broken.
++ */
++static long __meminitdata addr_start, addr_end;
++static void __meminitdata *p_start, *p_end;
++static int __meminitdata node_start;
++
++/*
++ * Populate the vmemmap for @size struct pages starting at @start_page,
++ * allocating the backing memory on @node.
++ *
++ * Without PSE the range is populated one 4k page at a time; with PSE each
++ * empty PMD gets a PMD_SIZE block mapped with PAGE_KERNEL_LARGE, and
++ * already populated PMDs are only verified.
++ *
++ * Returns 0 on success or -ENOMEM if any allocation fails.
++ */
++int __meminit
++vmemmap_populate(struct page *start_page, unsigned long size, int node)
++{
++      unsigned long addr = (unsigned long)start_page;
++      unsigned long end = (unsigned long)(start_page + size);
++      unsigned long next;
++      pgd_t *pgd;
++      pud_t *pud;
++      pmd_t *pmd;
++
++      for (; addr < end; addr = next) {
++              void *p = NULL;
++
++              pgd = vmemmap_pgd_populate(addr, node);
++              if (!pgd)
++                      return -ENOMEM;
++
++              pud = vmemmap_pud_populate(pgd, addr, node);
++              if (!pud)
++                      return -ENOMEM;
++
++              if (!cpu_has_pse) {
++                      /* No large pages: populate a single PTE per step. */
++                      next = (addr + PAGE_SIZE) & PAGE_MASK;
++                      pmd = vmemmap_pmd_populate(pud, addr, node);
++
++                      if (!pmd)
++                              return -ENOMEM;
++
++                      p = vmemmap_pte_populate(pmd, addr, node);
++
++                      if (!p)
++                              return -ENOMEM;
++
++                      addr_end = addr + PAGE_SIZE;
++                      p_end = p + PAGE_SIZE;
++              } else {
++                      next = pmd_addr_end(addr, end);
++
++                      pmd = pmd_offset(pud, addr);
++                      if (pmd_none(*pmd)) {
++                              pte_t entry;
++
++                              p = vmemmap_alloc_block_buf(PMD_SIZE, node);
++                              if (!p)
++                                      return -ENOMEM;
++
++                              /* Build a large-page entry and install it as PMD. */
++                              entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
++                                              PAGE_KERNEL_LARGE);
++                              set_pmd(pmd, __pmd_ma(__pte_val(entry)));
++
++                              /* check to see if we have contiguous blocks */
++                              if (p_end != p || node_start != node) {
++                                      /* Run broken: report the previous one, start anew. */
++                                      if (p_start)
++                                              printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
++                                                     addr_start, addr_end-1, p_start, p_end-1, node_start);
++                                      addr_start = addr;
++                                      node_start = node;
++                                      p_start = p;
++                              }
++
++                              addr_end = addr + PMD_SIZE;
++                              p_end = p + PMD_SIZE;
++                      } else
++                              vmemmap_verify((pte_t *)pmd, node, addr, next);
++              }
++
++      }
++      sync_global_pgds((unsigned long)start_page, end);
++      return 0;
++}
++
++/*
++ * Print the run of vmemmap mappings still being tracked, if there is one,
++ * and reset the tracking state afterwards.
++ */
++void __meminit vmemmap_populate_print_last(void)
++{
++      if (!p_start)
++              return;
++
++      printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
++              addr_start, addr_end-1, p_start, p_end-1, node_start);
++
++      /* Forget the range that has just been reported. */
++      node_start = 0;
++      p_end = NULL;
++      p_start = NULL;
++}
++#endif
diff --cc arch/x86/mm/iomap_32-xen.c

index 0000000,0000000..bbd4134

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/iomap_32-xen.c
@@@ -1,0 -1,0 +1,121 @@@
++/*
++ * Copyright © 2008 Ingo Molnar
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with this program; if not, write to the Free Software Foundation, Inc.,
++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
++ */
++
++#include <asm/iomap.h>
++#include <asm/pat.h>
++#include <linux/bitops.h>
++#include <linux/module.h>
++#include <linux/highmem.h>
++
++/*
++ * Check whether the range [base, base + size) can be mapped at all.
++ * Returns 1 if so and 0 otherwise; a range is only ever rejected on
++ * non-PAE kernels built with a 64-bit phys_addr_t.
++ */
++static int is_io_mapping_possible(resource_size_t base, unsigned long size)
++{
++#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
++      /* There is no way to map greater than 1 << 32 address without PAE */
++      if (base + size > 0x100000000ULL)
++              return 0;
++#endif
++      return 1;
++}
++
++/*
++ * Reserve the memory type for [base, base + size), asking for
++ * write-combining, and store the resulting kernel page protection in
++ * *@prot.
++ *
++ * Returns 0 on success, -EINVAL if the range cannot be mapped, or the
++ * error from io_reserve_memtype(); *@prot is only written on success.
++ */
++int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
++{
++      unsigned long memtype = _PAGE_CACHE_WC;
++      int err;
++
++      if (!is_io_mapping_possible(base, size))
++              return -EINVAL;
++
++      err = io_reserve_memtype(base, base + size, &memtype);
++      if (!err)
++              *prot = __pgprot(__PAGE_KERNEL | memtype);
++
++      return err;
++}
++EXPORT_SYMBOL_GPL(iomap_create_wc);
++
++/*
++ * Release the memory type reservation for [base, base + size); the
++ * counterpart of iomap_create_wc().
++ */
++void iomap_free(resource_size_t base, unsigned long size)
++{
++      io_free_memtype(base, base + size);
++}
++EXPORT_SYMBOL_GPL(iomap_free);
++
++/*
++ * Map page frame @pfn with protections @prot into a per-CPU atomic kmap
++ * fixmap slot (index taken from kmap_atomic_idx_push()) and return the
++ * slot's virtual address.  Page faults are disabled here and are still
++ * disabled on return.
++ */
++void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
++{
++      unsigned long vaddr;
++      int idx, type;
++
++      pagefault_disable();
++
++      type = kmap_atomic_idx_push();
++      /* Each CPU owns KM_TYPE_NR consecutive slots. */
++      idx = type + KM_TYPE_NR * smp_processor_id();
++      vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
++      set_pte_at(&init_mm, vaddr, kmap_pte - idx, pfn_pte(pfn, prot));
++      /*arch_flush_lazy_mmu_mode();*/
++
++      return (void *)vaddr;
++}
++
++/*
++ * Map 'mfn' using protections 'prot'
++ *
++ * _PAGE_IOMAP is always added to the protections.  Returns the address
++ * obtained from kmap_atomic_prot_pfn(), to be released with
++ * iounmap_atomic().
++ */
++void __iomem *
++iomap_atomic_prot_pfn(unsigned long mfn, pgprot_t prot)
++{
++      /*
++       * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
++       * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
++       * MTRR is UC or WC.  UC_MINUS gets the real intention, of the
++       * user, which is "WC if the MTRR is WC, UC if you can't do that."
++       */
++      if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
++              prot = PAGE_KERNEL_UC_MINUS;
++
++      pgprot_val(prot) |= _PAGE_IOMAP;
++      return (void __force __iomem *) kmap_atomic_prot_pfn(mfn, prot);
++}
++EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
++
++/*
++ * Undo an atomic I/O mapping: clear and flush the PTE behind @kvaddr if
++ * it lies inside the atomic kmap fixmap window, then re-enable page
++ * faults (which happens in either case).
++ */
++void
++iounmap_atomic(void __iomem *kvaddr)
++{
++      unsigned long page_va = (unsigned long) kvaddr & PAGE_MASK;
++      int slot, depth;
++
++      /* Nothing to tear down outside the atomic kmap fixmap window. */
++      if (page_va < __fix_to_virt(FIX_KMAP_END) ||
++          page_va > __fix_to_virt(FIX_KMAP_BEGIN))
++              goto out;
++
++      depth = kmap_atomic_idx();
++      slot = depth + KM_TYPE_NR * smp_processor_id();
++
++#ifdef CONFIG_DEBUG_HIGHMEM
++      WARN_ON_ONCE(page_va != __fix_to_virt(FIX_KMAP_BEGIN + slot));
++#endif
++      /*
++       * Clearing the pte makes any later access that did not remap it
++       * first Oops.  Stale mappings are a bad idea anyway, in case the
++       * page changes cacheability attributes or becomes a protected page
++       * in a hypervisor.
++       */
++      kpte_clear_flush(kmap_pte-slot, page_va);
++      kmap_atomic_idx_pop();
++
++out:
++      pagefault_enable();
++}
++EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --cc arch/x86/mm/ioremap-xen.c

index 0000000,0000000..64463b0

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/ioremap-xen.c
@@@ -1,0 -1,0 +1,827 @@@
++/*
++ * Re-map IO memory to kernel address space so that we can access it.
++ * This is needed for high PCI addresses that aren't mapped in the
++ * 640k-1MB IO memory area on PC's
++ *
++ * (C) Copyright 1995 1996 Linus Torvalds
++ */
++
++#include <linux/bootmem.h>
++#include <linux/init.h>
++#include <linux/io.h>
++#include <linux/module.h>
++#include <linux/pfn.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/mmiotrace.h>
++
++#include <asm/cacheflush.h>
++#include <asm/e820.h>
++#include <asm/fixmap.h>
++#include <asm/pgtable.h>
++#include <asm/tlbflush.h>
++#include <asm/pgalloc.h>
++#include <asm/pat.h>
++
++#include "physaddr.h"
++
++/*
++ * apply_to_page_range() callback: store the machine address of the PTE
++ * slot @pte (located in page @pmd_page) into the current mmu_update_t
++ * request and advance the request cursor that @data points to.
++ * The PTE must still be empty.  Always returns 0.
++ */
++static int direct_remap_area_pte_fn(pte_t *pte,
++                                  struct page *pmd_page,
++                                  unsigned long address,
++                                  void *data)
++{
++      mmu_update_t **cursor = (mmu_update_t **)data;
++      mmu_update_t *req;
++
++      BUG_ON(!pte_none(*pte));
++
++      req = *cursor;
++      /* Machine address of the page table page plus the offset within it. */
++      req->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) << PAGE_SHIFT) |
++                 ((unsigned long)pte & ~PAGE_MASK);
++      *cursor = req + 1;
++
++      return 0;
++}
++
++/*
++ * Map @size bytes of machine frames starting at @mfn to the virtual range
++ * starting at @address in @mm, on behalf of domain @domid.
++ *
++ * New PTE values are collected in a one-page buffer of mmu_update_t
++ * requests.  Each time the buffer fills up, and once more at the end,
++ * apply_to_page_range() fills in the machine address of every PTE slot
++ * and the batch is submitted with HYPERVISOR_mmu_update().
++ *
++ * Returns -ENOMEM if the request buffer cannot be allocated; otherwise the
++ * result of the last apply_to_page_range() / HYPERVISOR_mmu_update() call
++ * made (0 if nothing had to be mapped).
++ */
++static int __direct_remap_pfn_range(struct mm_struct *mm,
++                                  unsigned long address,
++                                  phys_addr_t mfn,
++                                  unsigned long size,
++                                  pgprot_t prot,
++                                  domid_t  domid)
++{
++      int rc = 0;
++      unsigned long i, start_address;
++      /*
++       * u: start of the request buffer
++       * v: next request whose PTE value gets filled in
++       * w: next request whose PTE pointer gets filled in (by the callback)
++       */
++      mmu_update_t *u, *v, *w;
++
++      u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
++      if (u == NULL)
++              return -ENOMEM;
++
++      start_address = address;
++
++      flush_cache_all();
++
++      /* Every PTE created here is an I/O mapping; set the bit just once. */
++      pgprot_val(prot) |= _PAGE_IOMAP;
++
++      for (i = 0; i < size; i += PAGE_SIZE) {
++              if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
++                      /* Flush a full batch after filling in the PTE ptrs. */
++                      rc = apply_to_page_range(mm, start_address,
++                                               address - start_address,
++                                               direct_remap_area_pte_fn, &w);
++                      if (rc)
++                              goto out;
++                      rc = HYPERVISOR_mmu_update(u, v - u, NULL, domid);
++                      if (rc < 0)
++                              goto out;
++                      v = w = u;
++                      start_address = address;
++              }
++
++              /*
++               * Fill in the machine address: PTE ptr is done later by
++               * apply_to_page_range().
++               */
++              v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot)));
++
++              mfn++;
++              address += PAGE_SIZE;
++              v++;
++      }
++
++      if (v != u) {
++              /* Final batch. */
++              rc = apply_to_page_range(mm, start_address,
++                                       address - start_address,
++                                       direct_remap_area_pte_fn, &w);
++              if (rc)
++                      goto out;
++              rc = HYPERVISOR_mmu_update(u, v - u, NULL, domid);
++      }
++
++ out:
++      flush_tlb_all();
++
++      free_page((unsigned long)u);
++
++      return rc;
++}
++
++/*
++ * Map @size bytes of machine frames starting at @mfn, belonging to domain
++ * @domid, into the user mapping @vma at @address.
++ *
++ * Auto-translated guests simply use remap_pfn_range().  Otherwise
++ * DOMID_SELF is rejected with -EINVAL, the VMA is flagged as an I/O PFN
++ * mapping and the owning mm is marked as having foreign mappings before
++ * the range is mapped.
++ */
++int direct_remap_pfn_range(struct vm_area_struct *vma,
++                         unsigned long address,
++                         phys_addr_t mfn,
++                         unsigned long size,
++                         pgprot_t prot,
++                         domid_t  domid)
++{
++      struct mm_struct *mm;
++
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              return remap_pfn_range(vma, address, mfn, size, prot);
++
++      if (domid == DOMID_SELF)
++              return -EINVAL;
++
++      vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
++
++      mm = vma->vm_mm;
++      mm->context.has_foreign_mappings = 1;
++
++      return __direct_remap_pfn_range(mm, address, mfn, size, prot, domid);
++}
++EXPORT_SYMBOL(direct_remap_pfn_range);
++
++/*
++ * Kernel-space variant of direct_remap_pfn_range(): map @size bytes of
++ * machine frames starting at @mfn, owned by @domid, at @address in
++ * init_mm.  Returns the result of __direct_remap_pfn_range().
++ */
++int direct_kernel_remap_pfn_range(unsigned long address,
++                                unsigned long mfn,
++                                unsigned long size,
++                                pgprot_t prot,
++                                domid_t  domid)
++{
++      return __direct_remap_pfn_range(
++              &init_mm, address, mfn, size, prot, domid);
++}
++EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
++
++/*
++ * apply_to_page_range() callback: report the machine address of the PTE
++ * slot @pte (located in page @pmd_page) through @data, unless @data is
++ * NULL.  Always returns 0.
++ */
++static int lookup_pte_fn(
++      pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
++{
++      uint64_t *result = (uint64_t *)data;
++
++      if (result == NULL)
++              return 0;
++
++      *result = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << PAGE_SHIFT) |
++                ((unsigned long)pte & ~PAGE_MASK);
++      return 0;
++}
++
++/*
++ * Run apply_to_page_range() over the single page at @address in @mm and,
++ * if @ptep is not NULL, store the machine address of its PTE slot there.
++ * Returns the result of apply_to_page_range().
++ */
++int create_lookup_pte_addr(struct mm_struct *mm,
++                         unsigned long address,
++                         uint64_t *ptep)
++{
++      return apply_to_page_range(mm, address, PAGE_SIZE,
++                                 lookup_pte_fn, ptep);
++}
++
++EXPORT_SYMBOL(create_lookup_pte_addr);
++
++#ifdef CONFIG_MODULES
++/*
++ * Force the implementation of ioremap_page_range() to be pulled in from
++ * lib/lib.a even if there is no other reference from the core kernel to it
++ * (native uses it in __ioremap_caller()), so that it gets exported.
++ *
++ * The reference itself is a __used pointer placed in the
++ * .discard.ioremap section; nothing in this file reads it.
++ */
++static void *const __section(.discard.ioremap) __used
++_ioremap_page_range = ioremap_page_range;
++#endif
++
++/*
++ * Fix up the linear direct mapping of the kernel to avoid cache attribute
++ * conflicts: switch @size bytes starting at @vaddr to the cache type
++ * selected by @prot_val.  Returns the result of the _set_memory_*() call.
++ */
++static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
++                             unsigned long prot_val)
++{
++      const unsigned long npages = size >> PAGE_SHIFT;
++
++      switch (prot_val) {
++      case _PAGE_CACHE_WB:
++              return _set_memory_wb(vaddr, npages);
++      case _PAGE_CACHE_WC:
++              return _set_memory_wc(vaddr, npages);
++      default:
++              /* _PAGE_CACHE_UC and every value not handled above */
++              return _set_memory_uc(vaddr, npages);
++      }
++}
++
++/*
++ * Apply the cache type @prot_val to the kernel mapping of each page in
++ * the @size bytes of machine frames starting at @mfn, skipping pages whose
++ * local pfn is outside the mapped ranges checked below.
++ *
++ * Returns 0 on success or the first error from ioremap_change_attr(), at
++ * which point processing stops.
++ */
++int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
++                            unsigned long prot_val)
++{
++      unsigned long done;
++
++      for (done = 0; done < size; done += PAGE_SIZE, ++mfn) {
++              unsigned long pfn = mfn_to_local_pfn(mfn);
++              int err;
++
++              /*
++               * Only pfns below max_low_pfn_mapped, or from 4GB up to
++               * max_pfn_mapped, are adjusted.
++               */
++              if (pfn < max_low_pfn_mapped ||
++                  (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) {
++                      err = ioremap_change_attr(
++                              (unsigned long)__va(pfn << PAGE_SHIFT),
++                              PAGE_SIZE, prot_val);
++                      if (err)
++                              return err;
++              }
++      }
++
++      return 0;
++}
++
++/*
++ * Remap an arbitrary physical address space into the kernel virtual
++ * address space. Needed when the kernel wants to access high addresses
++ * directly.
++ *
++ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
++ * have to convert them into an offset in a page-aligned mapping, but the
++ * caller shouldn't need to know that small detail.
++ *
++ * @phys_addr: start of the range to map (need not be page-aligned)
++ * @size:      length of the range in bytes
++ * @prot_val:  requested cache type (_PAGE_CACHE_*)
++ * @caller:    passed on to get_vm_area_caller()
++ *
++ * Returns the virtual address corresponding to @phys_addr, or NULL on any
++ * failure.
++ */
++static void __iomem *__ioremap_caller(resource_size_t phys_addr,
++              unsigned long size, unsigned long prot_val, void *caller)
++{
++      unsigned long offset, vaddr;
++      phys_addr_t mfn, last_mfn, last_addr;
++      /* Untouched copies of the arguments, for mmiotrace_ioremap(). */
++      const resource_size_t unaligned_phys_addr = phys_addr;
++      const unsigned long unaligned_size = size;
++      struct vm_struct *area;
++      unsigned long new_prot_val;
++      pgprot_t prot;
++      int retval;
++      domid_t domid = DOMID_IO;
++      void __iomem *ret_addr;
++
++      /* Don't allow wraparound or zero size */
++      last_addr = phys_addr + size - 1;
++      if (!size || last_addr < phys_addr)
++              return NULL;
++
++      if (!phys_addr_valid(phys_addr)) {
++              printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
++                     (unsigned long long)phys_addr);
++              WARN_ON_ONCE(1);
++              return NULL;
++      }
++
++      /*
++       * Don't remap the low PCI/ISA area, it's always mapped..
++       */
++      if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
++              return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
++
++      /*
++       * Check if the request spans more than any BAR in the iomem resource
++       * tree.
++       */
++      WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
++                KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
++
++      /*
++       * Don't allow anybody to remap normal RAM that we're using..
++       *
++       * Frames with a valid local pfn must be reserved pages; seeing one
++       * switches the mapping domain to DOMID_SELF, which is warned about
++       * once below.
++       */
++      last_mfn = PFN_DOWN(last_addr);
++      for (mfn = PFN_DOWN(phys_addr); mfn <= last_mfn; mfn++) {
++              unsigned long pfn = mfn_to_local_pfn(mfn);
++
++              if (pfn_valid(pfn)) {
++                      if (!PageReserved(pfn_to_page(pfn)))
++                              return NULL;
++                      domid = DOMID_SELF;
++              }
++      }
++      WARN_ON_ONCE(domid == DOMID_SELF);
++
++      /*
++       * Mappings have to be page-aligned
++       */
++      offset = phys_addr & ~PAGE_MASK;
++      phys_addr &= PHYSICAL_PAGE_MASK;
++      size = PAGE_ALIGN(last_addr+1) - phys_addr;
++
++      /* From here on, failures must release the memtype reservation. */
++      retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
++                                              prot_val, &new_prot_val);
++      if (retval) {
++              printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
++              return NULL;
++      }
++
++      /* Accept a different cache type than requested only if allowed. */
++      if (prot_val != new_prot_val) {
++              if (!is_new_memtype_allowed(phys_addr, size,
++                                          prot_val, new_prot_val)) {
++                      printk(KERN_ERR
++              "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
++                              (unsigned long long)phys_addr,
++                              (unsigned long long)(phys_addr + size),
++                              prot_val, new_prot_val);
++                      goto err_free_memtype;
++              }
++              prot_val = new_prot_val;
++      }
++
++      /* Translate the cache type into the page protection to map with. */
++      switch (prot_val) {
++      case _PAGE_CACHE_UC:
++      default:
++              prot = PAGE_KERNEL_IO_NOCACHE;
++              break;
++      case _PAGE_CACHE_UC_MINUS:
++              prot = PAGE_KERNEL_IO_UC_MINUS;
++              break;
++      case _PAGE_CACHE_WC:
++              prot = PAGE_KERNEL_IO_WC;
++              break;
++      case _PAGE_CACHE_WB:
++              prot = PAGE_KERNEL_IO;
++              break;
++      }
++
++      /*
++       * Ok, go for it..
++       */
++      area = get_vm_area_caller(size, VM_IOREMAP, caller);
++      if (!area)
++              goto err_free_memtype;
++      area->phys_addr = phys_addr;
++      vaddr = (unsigned long) area->addr;
++
++      if (kernel_map_sync_memtype(phys_addr, size, prot_val))
++              goto err_free_area;
++
++      if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
++                                   size, prot, domid))
++              goto err_free_area;
++
++      /* Hand back the address including the sub-page offset. */
++      ret_addr = (void __iomem *) (vaddr + offset);
++      mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
++
++      return ret_addr;
++err_free_area:
++      free_vm_area(area);
++err_free_memtype:
++      free_memtype(phys_addr, phys_addr + size);
++      return NULL;
++}
++
++/**
++ * ioremap_nocache     -   map bus memory into CPU space
++ * @phys_addr: bus address of the memory
++ * @size:      size of the resource to map
++ *
++ * ioremap_nocache performs a platform specific sequence of operations to
++ * make bus memory CPU accessible via the readb/readw/readl/writeb/
++ * writew/writel functions and the other mmio helpers. The returned
++ * address is not guaranteed to be usable directly as a virtual
++ * address.
++ *
++ * This version of ioremap ensures that the memory is marked uncacheable
++ * on the CPU as well as honouring existing caching rules from things like
++ * the PCI bus. Note that there are other caches and buffers on many
++ * busses. In particular driver authors should read up on PCI writes.
++ *
++ * It's useful if some control registers are in such an area and
++ * write combining or read caching is not desirable.
++ *
++ * Must be freed with iounmap.
++ */
++void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
++{
++      /*
++       * Ideally, this should be:
++       *      pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
++       *
++       * Till we fix all X drivers to use ioremap_wc(), we will use
++       * UC MINUS.
++       */
++      unsigned long val = _PAGE_CACHE_UC_MINUS;
++
++      return __ioremap_caller(phys_addr, size, val,
++                              __builtin_return_address(0));
++}
++EXPORT_SYMBOL(ioremap_nocache);
++
++/**
++ * ioremap_wc -       map memory into CPU space write combined
++ * @phys_addr:        bus address of the memory
++ * @size:     size of the resource to map
++ *
++ * This version of ioremap requests a write-combining mapping (faster writes
++ * to some hardware devices); if pat_enabled is clear it uses ioremap_nocache().
++ *
++ * Must be freed with iounmap.
++ */
++void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
++{
++      if (pat_enabled)
++              return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
++                                      __builtin_return_address(0));
++      else
++              return ioremap_nocache(phys_addr, size);
++}
++EXPORT_SYMBOL(ioremap_wc);
++
++void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
++{     /* Map @size bytes at @phys_addr, requesting _PAGE_CACHE_WB. */
++      return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
++                              __builtin_return_address(0));
++}
++EXPORT_SYMBOL(ioremap_cache);
++
++void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
++                              unsigned long prot_val)
++{     /* Only the _PAGE_CACHE_MASK bits of @prot_val are passed on. */
++      return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
++                              __builtin_return_address(0));
++}
++EXPORT_SYMBOL(ioremap_prot);
++
++/**
++ * iounmap - Free an I/O remapping
++ * @addr: virtual address from ioremap_*
++ *
++ * Caller must ensure there is only one unmapping for the same pointer.
++ */
++void iounmap(volatile void __iomem *addr)
++{
++      struct vm_struct *p, *o;
++
++      if ((void __force *)addr <= high_memory)        /* nothing to undo */
++              return;
++
++      /*
++       * __ioremap special-cases the PCI/ISA range by not instantiating a
++       * vm_area and by simply returning an address into the kernel mapping
++       * of ISA space.   So handle that here.
++       */
++      if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
++              return;
++
++      addr = (volatile void __iomem *)
++              (PAGE_MASK & (unsigned long __force)addr);      /* page start */
++
++      mmiotrace_iounmap(addr);
++
++      /* The vm area is used after vmlist_lock is dropped, assuming the
++         caller ensures there isn't another iounmap for the same address
++         in parallel. Reuse of the virtual address is prevented by
++         leaving it in the global lists until we're done with it.
++         cpa takes care of the direct mappings. */
++      read_lock(&vmlist_lock);
++      for (p = vmlist; p; p = p->next) {
++              if (p->addr == (void __force *)addr)
++                      break;
++      }
++      read_unlock(&vmlist_lock);
++
++      if (!p) {       /* no vm area starts at this address */
++              printk(KERN_ERR "iounmap: bad address %p\n", addr);
++              dump_stack();
++              return;
++      }
++
++      free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
++
++      /* Finally remove it */
++      o = remove_vm_area((void __force *)addr);
++      BUG_ON(p != o || o == NULL);    /* must be the area found above */
++      kfree(p);
++}
++EXPORT_SYMBOL(iounmap);
++
++#ifndef CONFIG_XEN
++/*
++ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
++ * access. Non-RAM gets a one-page ioremap_cache(); NULL if that fails.
++ */
++void *xlate_dev_mem_ptr(unsigned long phys)
++{
++      void *addr;
++      unsigned long start = phys & PAGE_MASK; /* start of the page */
++
++      /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
++      if (page_is_ram(start >> PAGE_SHIFT))
++              return __va(phys);
++
++      addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
++      if (addr)       /* re-apply the offset within the page */
++              addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
++
++      return addr;
++}
++
++void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
++{     /* Undo xlate_dev_mem_ptr(): only non-RAM pages were ioremapped. */
++      if (page_is_ram(phys >> PAGE_SHIFT))
++              return;
++
++      iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
++      return;
++}
++#endif
++
++static int __initdata early_ioremap_debug;    /* enables the debug printks */
++
++static int __init early_ioremap_debug_setup(char *str)
++{     /* "early_ioremap_debug" boot parameter handler; @str is not used. */
++      early_ioremap_debug = 1;
++
++      return 0;
++}
++early_param("early_ioremap_debug", early_ioremap_debug_setup);
++
++static __initdata int after_paging_init;      /* set by early_ioremap_reset() */
++static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; /* one page of ptes */
++
++#ifdef CONFIG_X86_32
++static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
++{     /* Walk from the page directory currently in CR3 down to addr's pmd. */
++      /* Don't assume we're using swapper_pg_dir at this point */
++      pgd_t *base = __va(read_cr3());
++      pgd_t *pgd = &base[pgd_index(addr)];
++      pud_t *pud = pud_offset(pgd, addr);
++      pmd_t *pmd = pmd_offset(pud, addr);
++
++      return pmd;
++}
++#else /* !CONFIG_X86_32 */
++#define early_ioremap_pmd early_get_pmd
++#undef make_lowmem_page_readonly
++#define make_lowmem_page_readonly early_make_page_readonly
++#endif /* CONFIG_X86_32 */
++
++static inline pte_t * __init early_ioremap_pte(unsigned long addr)
++{     /* The pte for @addr inside the static bm_pte table. */
++      return &bm_pte[pte_index(addr)];
++}
++
++bool __init is_early_ioremap_ptep(pte_t *ptep)
++{     /* True iff @ptep points at an entry of the bm_pte table. */
++      return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
++}
++
++static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; /* slot base addresses */
++
++void __init early_ioremap_init(void)
++{     /* Compute the slot addresses and hook bm_pte under the FIX_BTMAP pmd. */
++      pmd_t *pmd;
++      int i;
++
++      if (early_ioremap_debug)
++              printk(KERN_INFO "early_ioremap_init()\n");
++
++      for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
++              slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
++
++      pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
++      memset(bm_pte, 0, sizeof(bm_pte));
++      make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
++      pmd_populate_kernel(&init_mm, pmd, bm_pte);
++
++      /*
++       * Check that the boot-ioremap range does not span multiple pmds,
++       * a situation for which we are not prepared:
++       */
++#define __FIXADDR_TOP (-PAGE_SIZE) /* temporary, for the build-time check only */
++      BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
++                   != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
++#undef __FIXADDR_TOP
++      if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
++              WARN_ON(1);     /* run-time variant of the check above */
++              printk(KERN_WARNING "pmd %p != %p\n",
++                     pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
++              printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
++                      fix_to_virt(FIX_BTMAP_BEGIN));
++              printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END):   %08lx\n",
++                      fix_to_virt(FIX_BTMAP_END));
++
++              printk(KERN_WARNING "FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
++              printk(KERN_WARNING "FIX_BTMAP_BEGIN:     %d\n",
++                     FIX_BTMAP_BEGIN);
++      }
++}
++
++void __init early_ioremap_reset(void)
++{     /* From now on early_{set,clear}_fixmap() use the regular fixmap calls. */
++      after_paging_init = 1;
++}
++
++static void __init __early_set_fixmap(enum fixed_addresses idx,
++                                    phys_addr_t phys, pgprot_t flags)
++{     /* Set or clear the bm_pte entry for fixmap index @idx, then flush it. */
++      unsigned long addr = __fix_to_virt(idx);
++      pte_t *pte;
++
++      if (idx >= __end_of_fixed_addresses) {
++              BUG();
++              return;
++      }
++      pte = early_ioremap_pte(addr);
++
++      if (pgprot_val(flags))  /* non-zero protection: install the mapping */
++              set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
++      else                    /* zero protection: remove the mapping */
++              pte_clear(&init_mm, addr, pte);
++      __flush_tlb_one(addr);
++}
++
++static inline void __init early_set_fixmap(enum fixed_addresses idx,
++                                         phys_addr_t phys, pgprot_t prot)
++{     /* Pick the fixmap setter according to after_paging_init. */
++      if (after_paging_init)
++              __set_fixmap(idx, phys, prot);
++      else
++              __early_set_fixmap(idx, phys, prot);
++}
++
++static inline void __init early_clear_fixmap(enum fixed_addresses idx)
++{     /* Counterpart of early_set_fixmap(): drop the mapping at @idx. */
++      if (after_paging_init)
++              clear_fixmap(idx);
++      else    /* a zero pgprot makes __early_set_fixmap() clear the pte */
++              __early_set_fixmap(idx, 0, __pgprot(0));
++}
++
++static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; /* per slot: mapped address, NULL if free */
++static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; /* per slot: size as requested */
++
++#ifndef CONFIG_XEN
++void __init fixup_early_ioremap(void)
++{     /* Warn once if any slot is still in use, then redo early_ioremap_init(). */
++      int i;
++
++      for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
++              if (prev_map[i]) {
++                      WARN_ON(1);
++                      break;
++              }
++      }
++
++      early_ioremap_init();
++}
++#endif /* !CONFIG_XEN */
++
++static int __init check_early_ioremap_leak(void)
++{     /* late_initcall: warn about slots never released; returns 1 if any. */
++      int count = 0;
++      int i;
++
++      for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
++              if (prev_map[i])        /* slot is still mapped */
++                      count++;
++
++      if (!count)
++              return 0;
++      WARN(1, KERN_WARNING
++             "Debug warning: early ioremap leak of %d areas detected.\n",
++              count);
++      printk(KERN_WARNING
++              "please boot with early_ioremap_debug and report the dmesg.\n");
++
++      return 1;
++}
++late_initcall(check_early_ioremap_leak);
++
++static void __init __iomem *
++__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
++{     /* Map the range into a free FIX_BTMAP slot; returns NULL on failure. */
++      unsigned long offset;
++      resource_size_t last_addr;
++      unsigned int nrpages;
++      enum fixed_addresses idx0, idx;
++      int i, slot;
++
++      WARN_ON(system_state != SYSTEM_BOOTING);
++
++      slot = -1;      /* find the first slot with no live mapping */
++      for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
++              if (!prev_map[i]) {
++                      slot = i;
++                      break;
++              }
++      }
++
++      if (slot < 0) {
++              printk(KERN_INFO "early_ioremap(%08llx, %08lx) not found slot\n",
++                       (u64)phys_addr, size);
++              WARN_ON(1);
++              return NULL;
++      }
++
++      if (early_ioremap_debug) {
++              printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ",
++                     (u64)phys_addr, size, slot);
++              dump_stack();
++      }
++
++      /* Don't allow wraparound or zero size */
++      last_addr = phys_addr + size - 1;
++      if (!size || last_addr < phys_addr) {
++              WARN_ON(1);
++              return NULL;
++      }
++
++      prev_size[slot] = size; /* early_iounmap() checks its size against this */
++      /*
++       * Mappings have to be page-aligned
++       */
++      offset = phys_addr & ~PAGE_MASK;
++      phys_addr &= PAGE_MASK;
++      size = PAGE_ALIGN(last_addr + 1) - phys_addr;
++
++      /*
++       * Mappings have to fit in the FIX_BTMAP area.
++       */
++      nrpages = size >> PAGE_SHIFT;
++      if (nrpages > NR_FIX_BTMAPS) {
++              WARN_ON(1);
++              return NULL;
++      }
++
++      /*
++       * Ok, go for it..
++       */
++      idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
++      idx = idx0;
++      while (nrpages > 0) {   /* one fixmap entry per page, index descending */
++              early_set_fixmap(idx, phys_addr, prot);
++              phys_addr += PAGE_SIZE;
++              --idx;
++              --nrpages;
++      }
++      if (early_ioremap_debug)
++              printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
++
++      prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
++      return prev_map[slot];
++}
++
++/* Remap an IO device (mapped with PAGE_KERNEL_IO) */
++void __init __iomem *
++early_ioremap(resource_size_t phys_addr, unsigned long size)
++{
++      /*
++       * Don't remap the low PCI/ISA area, it's always mapped.
++       */
++      if (is_initial_xendomain() && is_ISA_range(phys_addr, phys_addr + size - 1))
++              return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
++
++      return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
++}
++
++/* Remap memory: @phys_addr goes through phys_to_machine(), PAGE_KERNEL prot */
++void __init __iomem *
++early_memremap(resource_size_t phys_addr, unsigned long size)
++{
++      return __early_ioremap(phys_to_machine(phys_addr), size, PAGE_KERNEL);
++}
++
++void __init __iomem *
++early_memremap_ro(resource_size_t phys_addr, unsigned long size)
++{     /* Like early_memremap(), but mapped with PAGE_KERNEL_RO. */
++      return __early_ioremap(phys_to_machine(phys_addr), size, PAGE_KERNEL_RO);
++}
++
++void __init early_iounmap(void __iomem *addr, unsigned long size)
++{     /* Release a boot-time mapping; @addr and @size must match the map call. */
++      unsigned long virt_addr;
++      unsigned long offset;
++      unsigned int nrpages;
++      enum fixed_addresses idx;
++      int i, slot;
++
++      /*
++       * early_ioremap special-cases the PCI/ISA range by not instantiating a
++       * vm_area and by simply returning an address into the kernel mapping
++       * of ISA space.   So handle that here.
++       */
++      if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN)
++          && (unsigned long)addr < fix_to_virt(FIX_ISAMAP_END - 1))
++              return;
++
++      slot = -1;      /* find the slot whose recorded address equals @addr */
++      for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
++              if (prev_map[i] == addr) {
++                      slot = i;
++                      break;
++              }
++      }
++
++      if (slot < 0) {
++              printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
++                       addr, size);
++              WARN_ON(1);
++              return;
++      }
++
++      if (prev_size[slot] != size) {  /* must equal the size given when mapping */
++              printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
++                       addr, size, slot, prev_size[slot]);
++              WARN_ON(1);
++              return;
++      }
++
++      if (early_ioremap_debug) {
++              printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
++                     size, slot);
++              dump_stack();
++      }
++
++      virt_addr = (unsigned long)addr;
++      if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
++              WARN_ON(1);
++              return;
++      }
++      offset = virt_addr & ~PAGE_MASK;
++      nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
++
++      idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
++      while (nrpages > 0) {   /* clear one fixmap entry per mapped page */
++              early_clear_fixmap(idx);
++              --idx;
++              --nrpages;
++      }
++      prev_map[slot] = NULL;  /* slot is free again */
++}
diff --cc arch/x86/mm/memblock.c

index aa11693,aa11693..0493bcf
--- 1/arch/x86/mm/memblock.c
--- 2/arch/x86/mm/memblock.c
+++ b/arch/x86/mm/memblock.c
@@@ -293,6 -293,6 +293,11 @@@ static int __init memblock_x86_find_act
   {
         u64 align = PAGE_SIZE;
   
++#ifdef CONFIG_XEN
++      if (last_pfn > xen_start_info->nr_pages)
++              last_pfn = xen_start_info->nr_pages;
++#endif
++
         *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
         *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
   
@@@ -325,6 -325,6 +330,11 @@@ void __init memblock_x86_register_activ
                 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
                                            &ei_startpfn, &ei_endpfn))
                         add_active_range(nid, ei_startpfn, ei_endpfn);
++
++#ifdef CONFIG_XEN
++      BUG_ON(nid);
++      add_active_range(nid, last_pfn, last_pfn);
++#endif
   }
   
   /*
diff --cc arch/x86/mm/pageattr-xen.c

index 0000000,0000000..5c10179

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/pageattr-xen.c
@@@ -1,0 -1,0 +1,1551 @@@
++/*
++ * Copyright 2002 Andi Kleen, SuSE Labs.
++ * Thanks to Ben LaHaise for precious feedback.
++ */
++#include <linux/highmem.h>
++#include <linux/bootmem.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/interrupt.h>
++#include <linux/seq_file.h>
++#include <linux/debugfs.h>
++#include <linux/pfn.h>
++#include <linux/percpu.h>
++#include <linux/gfp.h>
++#include <linux/pci.h>
++
++#include <asm/e820.h>
++#include <asm/processor.h>
++#include <asm/tlbflush.h>
++#include <asm/sections.h>
++#include <asm/setup.h>
++#include <asm/uaccess.h>
++#include <asm/pgalloc.h>
++#include <asm/proto.h>
++#include <asm/pat.h>
++
++/*
++ * The current flushing context - we pass it instead of 5 arguments:
++ */
++struct cpa_data {
++      unsigned long   *vaddr;         /* presumably the address(es) to change - confirm */
++      pgprot_t        mask_set;       /* protection bits to set */
++      pgprot_t        mask_clr;       /* protection bits to clear */
++      int             numpages;       /* page count; may be clamped to a large page */
++      int             flags;          /* CPA_* flags */
++      unsigned long   pfn;            /* pfn of the address being handled */
++      unsigned        force_split : 1; /* always split large pages */
++      int             curpage;
++      struct page     **pages;        /* presumably used with CPA_PAGES_ARRAY - confirm */
++};
++
++/*
++ * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
++ * using cpa_lock, so that no other cpu with stale large tlb entries can
++ * change the page attribute in parallel to some other cpu
++ * splitting a large page entry along with changing the attribute.
++ */
++static DEFINE_SPINLOCK(cpa_lock);
++
++#define CPA_FLUSHTLB 1 /* a large-page entry was rewritten */
++#define CPA_ARRAY 2 /* presumably: addresses passed as an array - confirm */
++#define CPA_PAGES_ARRAY 4 /* addresses are taken from a struct page array */
++
++#ifdef CONFIG_PROC_FS
++static unsigned long direct_pages_count[PG_LEVEL_NUM]; /* indexed by PG_LEVEL_* */
++
++void update_page_count(int level, unsigned long pages)
++{     /* Add @pages to the counter for mapping level @level. */
++      /* Protect against CPA */
++      spin_lock(&pgd_lock);
++      direct_pages_count[level] += pages;
++      spin_unlock(&pgd_lock);
++}
++
++static void split_page_count(int level)
++{     /* One page at @level became PTRS_PER_PTE pages one level below. */
++      direct_pages_count[level]--;
++      direct_pages_count[level - 1] += PTRS_PER_PTE;
++}
++
++void arch_report_meminfo(struct seq_file *m)
++{     /* Print the per-level page counters to @m, converted to kB. */
++      seq_printf(m, "DirectMap4k:    %8lu kB\n",
++                      direct_pages_count[PG_LEVEL_4K] << 2);  /* x 4 kB */
++#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
++      seq_printf(m, "DirectMap2M:    %8lu kB\n",
++                      direct_pages_count[PG_LEVEL_2M] << 11); /* x 2048 kB */
++#else
++      seq_printf(m, "DirectMap4M:    %8lu kB\n",
++                      direct_pages_count[PG_LEVEL_2M] << 12); /* x 4096 kB */
++#endif
++#ifdef CONFIG_X86_64
++      if (direct_gbpages)
++              seq_printf(m, "DirectMap1G:    %8lu kB\n",
++                      direct_pages_count[PG_LEVEL_1G] << 20); /* x 1048576 kB */
++#endif
++}
++#else
++static inline void split_page_count(int level) { } /* no-op without CONFIG_PROC_FS */
++#endif
++
++#ifdef CONFIG_X86_64
++
++static inline unsigned long highmap_start_pfn(void)
++{     /* Page frame number of the physical address of _text. */
++      return __pa(_text) >> PAGE_SHIFT;
++}
++
++static inline unsigned long highmap_end_pfn(void)
++{     /* Page frame number of _brk_end rounded up to a PMD_SIZE boundary. */
++      return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
++}
++
++#endif
++
++#ifdef CONFIG_DEBUG_PAGEALLOC
++# define debug_pagealloc 1 /* compile-time constant mirroring the config option */
++#else
++# define debug_pagealloc 0
++#endif /* CONFIG_DEBUG_PAGEALLOC */
++
++static inline int
++within(unsigned long addr, unsigned long start, unsigned long end)
++{     /* Non-zero iff @addr lies in the half-open range [start, end). */
++      return addr >= start && addr < end;
++}
++
++/*
++ * Flushing functions
++ */
++
++/**
++ * clflush_cache_range - flush a cache range with clflush
++ * @vaddr:    virtual start address
++ * @size:     number of bytes to flush
++ *
++ * clflush is an unordered instruction which needs fencing with mfence
++ * to avoid ordering issues.
++ */
++void clflush_cache_range(void *vaddr, unsigned int size)
++{
++      void *vend = vaddr + size - 1;  /* last byte of the range */
++
++      mb();
++
++      for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
++              clflush(vaddr);
++      /*
++       * Flush any possible final partial cacheline:
++       */
++      clflush(vend);  /* NOTE(review): for size == 0 this is vaddr - 1 */
++
++      mb();
++}
++EXPORT_SYMBOL_GPL(clflush_cache_range);
++
++static void __cpa_flush_all(void *arg)
++{     /* Per-cpu callback: @arg is a flag, non-zero to also write back caches. */
++      unsigned long cache = (unsigned long)arg;
++
++      /*
++       * Flush all to work around Errata in early athlons regarding
++       * large page flushing.
++       */
++      __flush_tlb_all();
++
++      if (cache && boot_cpu_data.x86 >= 4)    /* only for cpu family >= 4 */
++              wbinvd();
++}
++
++static void cpa_flush_all(unsigned long cache)
++{     /* Run __cpa_flush_all() on every cpu; must not be called with irqs off. */
++      BUG_ON(irqs_disabled());
++
++      on_each_cpu(__cpa_flush_all, (void *) cache, 1);
++}
++
++static void __cpa_flush_range(void *arg)
++{     /* Per-cpu callback; @arg is not used. */
++      /*
++       * We could optimize that further and do individual per page
++       * tlb invalidates for a low number of pages. Caveat: we must
++       * flush the high aliases on 64bit as well.
++       */
++      __flush_tlb_all();
++}
++
++static void cpa_flush_range(unsigned long start, int numpages, int cache)
++{     /* Flush TLBs on all cpus and, if @cache, clflush the present pages. */
++      unsigned int i, level;
++      unsigned long addr;
++
++      BUG_ON(irqs_disabled());
++      WARN_ON(PAGE_ALIGN(start) != start);    /* @start should be page aligned */
++
++      on_each_cpu(__cpa_flush_range, NULL, 1);
++
++      if (!cache)
++              return;
++
++      /*
++       * We only need to flush on one CPU,
++       * clflush is a MESI-coherent instruction that
++       * will cause all other CPUs to flush the same
++       * cachelines:
++       */
++      for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
++              pte_t *pte = lookup_address(addr, &level);
++
++              /*
++               * Only flush present addresses:
++               */
++              if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
++                      clflush_cache_range((void *) addr, PAGE_SIZE);
++      }
++}
++
++static void cpa_flush_array(unsigned long *start, int numpages, int cache,
++                          int in_flags, struct page **pages)
++{     /* Array variant: addresses come from @pages or from @start[]. */
++      unsigned int i, level;
++      unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
++
++      BUG_ON(irqs_disabled());
++
++      on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
++
++      if (!cache || do_wbinvd)        /* nothing left for clflush to do */
++              return;
++
++      /*
++       * We only need to flush on one CPU,
++       * clflush is a MESI-coherent instruction that
++       * will cause all other CPUs to flush the same
++       * cachelines:
++       */
++      for (i = 0; i < numpages; i++) {
++              unsigned long addr;
++              pte_t *pte;
++
++              if (in_flags & CPA_PAGES_ARRAY)
++                      addr = (unsigned long)page_address(pages[i]);
++              else
++                      addr = start[i];
++
++              pte = lookup_address(addr, &level);
++
++              /*
++               * Only flush present addresses:
++               */
++              if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
++                      clflush_cache_range((void *)addr, PAGE_SIZE);
++      }
++}
++
++/*
++ * Certain areas of memory on x86 require very specific protection flags,
++ * for example the BIOS area or kernel text. Callers don't always get this
++ * right (again, ioremap() on BIOS memory is not uncommon) so this function
++ * checks and fixes these known static required protection bits.
++ */
++static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
++                                 unsigned long pfn)
++{
++      pgprot_t forbidden = __pgprot(0);       /* bits to strip from @prot */
++
++      /*
++       * The BIOS area between 640k and 1Mb needs to be executable for
++       * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
++       */
++#ifdef CONFIG_PCI_BIOS
++      if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
++              pgprot_val(forbidden) |= _PAGE_NX;
++#endif
++
++      /*
++       * The kernel text needs to be executable for obvious reasons.
++       * Does not cover __inittext since that is gone later on. On
++       * 64bit we do not enforce !NX on the low mapping
++       */
++      if (within(address, (unsigned long)_text, (unsigned long)_etext))
++              pgprot_val(forbidden) |= _PAGE_NX;
++
++      /*
++       * The .rodata section needs to be read-only. Using the pfn
++       * catches all aliases.
++       */
++      if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
++                 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
++              pgprot_val(forbidden) |= _PAGE_RW;
++
++#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && !defined(CONFIG_XEN)
++      /*
++       * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
++       * kernel text mappings for the large page aligned text, rodata sections
++       * will be always read-only. For the kernel identity mappings covering
++       * the holes caused by this alignment can be anything that user asks.
++       *
++       * This will preserve the large page mappings for kernel text/data
++       * at no extra cost.
++       */
++      if (kernel_set_to_readonly &&
++          within(address, (unsigned long)_text,
++                 (unsigned long)__end_rodata_hpage_align)) {
++              unsigned int level;
++
++              /*
++               * Don't enforce the !RW mapping for the kernel text mapping,
++               * if the current mapping is already using small page mapping.
++               * No need to work hard to preserve large page mappings in this
++               * case.
++               *
++               * This also fixes the Linux Xen paravirt guest boot failure
++               * (because of unexpected read-only mappings for kernel identity
++               * mappings). In this paravirt guest case, the kernel text
++               * mapping and the kernel identity mapping share the same
++               * page-table pages. Thus we can't really use different
++               * protections for the kernel text and identity mappings. Also,
++               * these shared mappings are made of small page mappings.
++               * Thus this logic of not enforcing the !RW mapping for small
++               * page kernel text mappings will help the Linux Xen paravirt
++               * guest boot as well.
++               */
++              if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
++                      pgprot_val(forbidden) |= _PAGE_RW;
++      }
++#endif
++
++      prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
++
++      return prot;
++}
++
++/*
++ * Look up the page table entry for a kernel virtual address. Return a pointer
++ * to the entry (NULL if a level is empty) and store the level in @level.
++ *
++ * Note: We return pud and pmd either when the entry is marked large
++ * or when the present bit is not set. Otherwise we would return a
++ * pointer to a non-existent mapping.
++ */
++pte_t *lookup_address(unsigned long address, unsigned int *level)
++{
++      pgd_t *pgd = pgd_offset_k(address);
++      pud_t *pud;
++      pmd_t *pmd;
++
++      *level = PG_LEVEL_NONE; /* reported when NULL is returned at pgd/pud */
++
++      if (pgd_none(*pgd))
++              return NULL;
++
++      pud = pud_offset(pgd, address);
++      if (pud_none(*pud))
++              return NULL;
++
++      *level = PG_LEVEL_1G;
++      if (pud_large(*pud) || !pud_present(*pud))
++              return (pte_t *)pud;
++
++      pmd = pmd_offset(pud, address);
++      if (pmd_none(*pmd))     /* note: *level stays PG_LEVEL_1G here */
++              return NULL;
++
++      *level = PG_LEVEL_2M;
++      if (pmd_large(*pmd) || !pmd_present(*pmd))
++              return (pte_t *)pmd;
++
++      *level = PG_LEVEL_4K;
++
++      return pte_offset_kernel(pmd, address);
++}
++EXPORT_SYMBOL_GPL(lookup_address);
++
++/*
++ * Set the new large-page entry (pmd or pud) in all the pgds we know about:
++ */
++static void __set_pmd_pte(pte_t *kpte, unsigned long address,
++                        unsigned int level, pte_t pte)
++{
++      /* change init_mm */
++      switch(level) {
++      case PG_LEVEL_2M:
++              xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
++              break;
++#ifdef CONFIG_X86_64
++      case PG_LEVEL_1G:
++              xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
++              break;
++#endif
++      default:        /* any other level is a caller bug */
++              BUG();
++      }
++#ifdef CONFIG_X86_32
++      if (!SHARED_KERNEL_PMD) {       /* update every pgd on pgd_list as well */
++              struct page *page;
++
++              list_for_each_entry(page, &pgd_list, lru) {
++                      pgd_t *pgd;
++                      pud_t *pud;
++                      pmd_t *pmd;
++
++                      pgd = (pgd_t *)page_address(page) + pgd_index(address);
++                      pud = pud_offset(pgd, address);
++                      pmd = pmd_offset(pud, address);
++                      xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
++              }
++      }
++#endif
++}
++
++static int
++try_preserve_large_page(pte_t *kpte, unsigned long address,
++                      struct cpa_data *cpa)
++{
++      unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
++      pte_t new_pte, old_pte, *tmp;
++      pgprot_t old_prot, new_prot, req_prot;
++      int i, do_split = 1;
++      unsigned int level;
++
++      if (cpa->force_split)
++              return 1;
++
++      spin_lock(&pgd_lock);
++      /*
++       * Check for races, another CPU might have split this page
++       * up already. In that case we leave with do_split still set
++       * to 1, so the caller goes on to split_large_page(), which
++       * repeats this check under pgd_lock.
++       *
++       * From here on the function returns do_split: 1 asks the
++       * caller to split the large page, 0 means no split is needed
++       * (nothing to change, or the large page entry was updated in
++       * place below), and -EINVAL reports an unexpected mapping
++       * level.
++       */
++      tmp = lookup_address(address, &level);
++      if (tmp != kpte)
++              goto out_unlock;
++
++      switch (level) {
++      case PG_LEVEL_2M:
++              psize = PMD_PAGE_SIZE;
++              pmask = PMD_PAGE_MASK;
++              break;
++#ifdef CONFIG_X86_64
++      case PG_LEVEL_1G:
++              psize = PUD_PAGE_SIZE;
++              pmask = PUD_PAGE_MASK;
++              break;
++#endif
++      default:
++              do_split = -EINVAL;
++              goto out_unlock;
++      }
++
++      /*
++       * Calculate the number of pages, which fit into this large
++       * page starting at address, and clamp cpa->numpages to it:
++       */
++      nextpage_addr = (address + psize) & pmask;
++      numpages = (nextpage_addr - address) >> PAGE_SHIFT;
++      if (numpages < cpa->numpages)
++              cpa->numpages = numpages;
++
++      /*
++       * We are safe now. Check whether the new pgprot is the same:
++       */
++      old_pte = *kpte;
++      old_prot = new_prot = req_prot = pte_pgprot(old_pte);
++
++      pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
++      pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
++
++      /*
++       * old_pte points to the large page base address. So we need
++       * to add the offset of the virtual address:
++       */
++      pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
++      cpa->pfn = pfn;
++
++      new_prot = static_protections(req_prot, address, pfn);
++
++      /*
++       * We need to check the full range, whether
++       * static_protections() requires a different pgprot for one of
++       * the pages in the range we try to preserve. If it does, we
++       * bail out with do_split == 1:
++       */
++      addr = address & pmask;
++      pfn = pte_pfn(old_pte);
++      for (i = 0; i < (psize >> PAGE_SHIFT) && pfn < max_mapnr;
++           i++, addr += PAGE_SIZE, pfn++) {
++              pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
++
++              if (pgprot_val(chk_prot) != pgprot_val(new_prot))
++                      goto out_unlock;
++      }
++
++      /*
++       * If there are no changes, return. cpa->numpages has been
++       * updated above:
++       */
++      if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
++              do_split = 0;
++              goto out_unlock;
++      }
++
++      /*
++       * We need to change the attributes. Check, whether we can
++       * change the large page in one go. We request a split, when
++       * the address is not aligned or the number of pages is
++       * smaller than the number of pages in the large page. Note
++       * that we limited the number of possible pages already to
++       * the number of pages in the large page.
++       */
++      if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
++              /*
++               * The address is aligned and the number of pages
++               * covers the full page. Rewrite the large page entry
++               * with the new protections, keeping the machine frame
++               * of the old entry.
++               */
++              new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
++              __set_pmd_pte(kpte, address, level, new_pte);
++              cpa->flags |= CPA_FLUSHTLB;
++              do_split = 0;
++      }
++
++out_unlock:
++      spin_unlock(&pgd_lock);
++
++      return do_split;
++}
++
++static int split_large_page(pte_t *kpte, unsigned long address)
++{
++      unsigned long mfn, mfninc = 1;
++      unsigned int i, level;
++      pte_t *pbase, *tmp;
++      pgprot_t ref_prot;
++      struct page *base;
++
++      if (!debug_pagealloc)
++              spin_unlock(&cpa_lock);
++      base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
++      if (!debug_pagealloc)
++              spin_lock(&cpa_lock);
++      if (!base)
++              return -ENOMEM;
++
++      spin_lock(&pgd_lock);
++      /*
++       * Check for races, another CPU might have split this page
++       * up for us already. If so, the page allocated above is
++       * freed again at out_unlock and 0 is returned:
++       */
++      tmp = lookup_address(address, &level);
++      if (tmp != kpte)
++              goto out_unlock;
++
++      pbase = (pte_t *)page_address(base);
++      paravirt_alloc_pte(&init_mm, page_to_pfn(base));
++      ref_prot = pte_pgprot(pte_clrhuge(*kpte));
++      /*
++       * If we ever want to utilize the PAT bit, we need to
++       * update this function to make sure it's converted from
++       * bit 12 to bit 7 when we cross from the 2MB level to
++       * the 4K level:
++       */
++      WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
++
++#ifdef CONFIG_X86_64
++      if (level == PG_LEVEL_1G) {
++              mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
++              pgprot_val(ref_prot) |= _PAGE_PSE;
++      }
++#endif
++
++      if (address >= (unsigned long)__va(0) &&
++              address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
++              split_page_count(level);
++
++#ifdef CONFIG_X86_64
++      if (address >= (unsigned long)__va(1UL<<32) &&
++              address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
++              split_page_count(level);
++#endif
++
++      /*
++       * Get the target mfn from the original entry and fill all
++       * PTRS_PER_PTE slots of the new table from it. Consecutive
++       * entries are mfninc frames apart: 1 by default,
++       * PMD_PAGE_SIZE >> PAGE_SHIFT when level is PG_LEVEL_1G.
++       */
++      mfn = __pte_mfn(*kpte);
++      for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
++              set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
++
++      /*
++       * Install the new, split up pagetable.
++       *
++       * We use the standard kernel pagetable protections for the new
++       * pagetable protections, the actual ptes set above control the
++       * primary protection behavior.
++       *
++       * Without XENFEAT_writable_page_tables the page holding the
++       * new table is first remapped PAGE_KERNEL_RO through the
++       * hypervisor; a failure of that hypercall is fatal.
++       */
++      if (!xen_feature(XENFEAT_writable_page_tables) &&
++          HYPERVISOR_update_va_mapping((unsigned long)pbase,
++                                       mk_pte(base, PAGE_KERNEL_RO), 0))
++              BUG();
++      __set_pmd_pte(kpte, address, level, mk_pte(base, __pgprot(_KERNPG_TABLE)));
++
++      /*
++       * Intel Atom errata AAH41 workaround.
++       *
++       * The real fix should be in hw or in a microcode update, but
++       * we also probabilistically try to reduce the window of having
++       * a large TLB mixed with 4K TLBs while instruction fetches are
++       * going on.
++       */
++      __flush_tlb_all();
++
++      base = NULL;
++
++out_unlock:
++      /*
++       * If we dropped out via the lookup_address check under
++       * pgd_lock then stick the page back into the pool. On the
++       * success path base has been set to NULL above, so the page
++       * that is now in use as a page table is kept:
++       */
++      if (base)
++              __free_page(base);
++      spin_unlock(&pgd_lock);
++
++      return 0;
++}
++
++/*
++ * Handle a missing or zero PTE found while processing a CPA request.
++ *
++ * Non-primary callers are silently ignored. For the primary path a hole
++ * inside the kernel identity mapping is accepted and accounted as one
++ * processed page; anywhere else it is reported and -EFAULT is returned.
++ */
++static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
++                             int primary)
++{
++      unsigned long ident_end;
++
++      /* Only the primary path cares about holes. */
++      if (!primary)
++              return 0;
++
++      ident_end = PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT);
++      if (!within(vaddr, PAGE_OFFSET, ident_end)) {
++              WARN(1, KERN_WARNING "CPA: called for zero pte. "
++                      "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
++                      *cpa->vaddr);
++
++              return -EFAULT;
++      }
++
++      /*
++       * The kernel identity mapping is expected to have holes, so a
++       * NULL PTE there is not an error. Report one virtual page and
++       * its pfn as processed. TBD: numpages could be derived from the
++       * initial value and the level returned by lookup_address().
++       */
++      cpa->numpages = 1;
++      cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
++
++      return 0;
++}
++
++static int __change_page_attr(struct cpa_data *cpa, int primary)
++{
++      unsigned long address;
++      int do_split, err;
++      unsigned int level;
++      pte_t *kpte, old_pte;
++
++      if (cpa->flags & CPA_PAGES_ARRAY) {
++              struct page *page = cpa->pages[cpa->curpage];
++              if (unlikely(PageHighMem(page)))
++                      return 0;
++              address = (unsigned long)page_address(page);
++      } else if (cpa->flags & CPA_ARRAY)
++              address = cpa->vaddr[cpa->curpage];
++      else
++              address = *cpa->vaddr;
++repeat:
++      kpte = lookup_address(address, &level);
++      if (!kpte)
++              return __cpa_process_fault(cpa, address, primary);
++
++      old_pte = *kpte;
++      if (!__pte_val(old_pte))
++              return __cpa_process_fault(cpa, address, primary);
++
++      if (level == PG_LEVEL_4K) {
++              pte_t new_pte;
++              pgprot_t new_prot = pte_pgprot(old_pte);
++              unsigned long mfn = __pte_mfn(old_pte);
++
++              pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
++              pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
++
++              new_prot = static_protections(new_prot, address,
++                                            mfn_to_local_pfn(mfn));
++
++              /*
++               * We need to keep the mfn from the existing PTE,
++               * after all we're only going to change its attributes
++               * not the memory it points to
++               */
++              new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
++              cpa->pfn = mfn_to_local_pfn(mfn);
++              /*
++               * Do we really change anything? If so, hand the new
++               * PTE to HYPERVISOR_mmu_update(). On -ENOMEM the
++               * update is retried for as long as hypervisor_oom()
++               * returns non-zero; otherwise, and for any other
++               * error, the error code is returned to the caller.
++               */
++              if (__pte_val(old_pte) != __pte_val(new_pte)) {
++                      mmu_update_t u;
++
++                      u.ptr = virt_to_machine(kpte);
++                      u.val = __pte_val(new_pte);
++                      WARN_ON_ONCE(arch_use_lazy_mmu_mode());
++                      do {
++                              err = HYPERVISOR_mmu_update(&u, 1, NULL,
++                                                          DOMID_SELF);
++                              switch (err) {
++                              case 0:
++                                      break;
++                              case -ENOMEM:
++                                      BUG_ON(!primary);
++                                      BUG_ON(!((pgprot_val(cpa->mask_set) |
++                                                pgprot_val(cpa->mask_clr)) &
++                                               _PAGE_CACHE_MASK));
++                                      if (hypervisor_oom())
++                                              continue;
++                                      /* fall through */
++                              default:
++                                      return err;
++                              }
++                      } while (err);
++                      cpa->flags |= CPA_FLUSHTLB;
++              }
++              cpa->numpages = 1;
++              return 0;
++      }
++
++      /*
++       * Check, whether we can keep the large page intact
++       * and just change the pte:
++       */
++      do_split = try_preserve_large_page(kpte, address, cpa);
++      /*
++       * When the range fits into the existing large page, or an
++       * error was reported, return. cpa->numpages and cpa->flags
++       * have been updated in try_preserve_large_page():
++       */
++      if (do_split <= 0)
++              return do_split;
++
++      /*
++       * We have to split the large page:
++       */
++      err = split_large_page(kpte, address);
++      if (!err) {
++              /*
++               * Do a global flush tlb after splitting the large page
++               * and before we do the actual change page attribute in the PTE.
++               *
++               * Without this, we violate the TLB application note, that says
++               * "The TLBs may contain both ordinary and large-page
++               *  translations for a 4-KByte range of linear addresses. This
++               *  may occur if software modifies the paging structures so that
++               *  the page size used for the address range changes. If the two
++               *  translations differ with respect to page frame or attributes
++               *  (e.g., permissions), processor behavior is undefined and may
++               *  be implementation-specific."
++               *
++               * We do this global tlb flush inside the cpa_lock, so that we
++               * don't allow any other cpu, with stale tlb entries change the
++               * page attribute in parallel, that also falls into the
++               * just split large page entry.
++               */
++              flush_tlb_all();
++              goto repeat;
++      }
++
++      return err;
++}
++
++static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
++
++static int cpa_process_alias(struct cpa_data *cpa)
++{
++      struct cpa_data alias_cpa;
++      unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
++      unsigned long vaddr;
++      int ret;
++
++      if (cpa->pfn >= max_pfn_mapped)
++              return 0;
++
++#ifdef CONFIG_X86_64
++      if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
++              return 0;
++#endif
++      /*
++       * Work out the virtual address the primary call operated on.
++       * No need to redo, when the primary call touched the direct
++       * mapping already. Otherwise the request is replayed on the
++       * direct mapping alias (laddr) as a plain, non-array request
++       * with alias checking turned off:
++       */
++      if (cpa->flags & CPA_PAGES_ARRAY) {
++              struct page *page = cpa->pages[cpa->curpage];
++              if (unlikely(PageHighMem(page)))
++                      return 0;
++              vaddr = (unsigned long)page_address(page);
++      } else if (cpa->flags & CPA_ARRAY)
++              vaddr = cpa->vaddr[cpa->curpage];
++      else
++              vaddr = *cpa->vaddr;
++
++      if (!(within(vaddr, PAGE_OFFSET,
++                  PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
++
++              alias_cpa = *cpa;
++              alias_cpa.vaddr = &laddr;
++              alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
++
++              ret = __change_page_attr_set_clr(&alias_cpa, 0);
++              if (ret)
++                      return ret;
++      }
++
++#ifdef CONFIG_X86_64
++      /*
++       * If the primary call didn't touch the high mapping already
++       * and the physical address is inside the kernel map, we need
++       * to touch the high mapped kernel as well. The address used
++       * is the pfn's offset from __START_KERNEL_map:
++       */
++      if (!within(vaddr, (unsigned long)_text, _brk_end) &&
++          within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
++              unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
++                                             __START_KERNEL_map;
++              alias_cpa = *cpa;
++              alias_cpa.vaddr = &temp_cpa_vaddr;
++              alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
++
++              /*
++               * The high mapping range is imprecise, so ignore the
++               * return value.
++               */
++              __change_page_attr_set_clr(&alias_cpa, 0);
++      }
++#endif
++
++      return 0;
++}
++
++static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
++{
++      int ret, numpages = cpa->numpages;
++
++      while (numpages) {
++              /*
++               * Store the remaining nr of pages for the large page
++               * preservation check. __change_page_attr() writes back
++               * into cpa->numpages how many pages it actually handled.
++               */
++              cpa->numpages = numpages;
++              /* array requests are handled one entry at a time, so we can't use large page */
++              if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
++                      cpa->numpages = 1;
++
++              if (!debug_pagealloc)
++                      spin_lock(&cpa_lock);
++              ret = __change_page_attr(cpa, checkalias);
++              if (!debug_pagealloc)
++                      spin_unlock(&cpa_lock);
++              if (ret)
++                      return ret;
++
++              if (checkalias) {
++                      ret = cpa_process_alias(cpa);
++                      if (ret)
++                              return ret;
++              }
++
++              /*
++               * Adjust the number of pages with the result of the
++               * CPA operation. Either a large page has been
++               * preserved or a single page update happened. Array
++               * requests then move on to the next entry, all others
++               * advance *cpa->vaddr past the pages just handled.
++               */
++              BUG_ON(cpa->numpages > numpages);
++              numpages -= cpa->numpages;
++              if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
++                      cpa->curpage++;
++              else
++                      *cpa->vaddr += cpa->numpages * PAGE_SIZE;
++
++      }
++      return 0;
++}
++
++/* Return only the caching bits (PAT, PAT_LARGE, PWT, PCD) of @attr. */
++static inline int cache_attr(pgprot_t attr)
++{
++      return (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT_LARGE | _PAGE_PAT) &
++              pgprot_val(attr);
++}
++
++static int change_page_attr_set_clr(unsigned long *addr, int numpages,
++                                  pgprot_t mask_set, pgprot_t mask_clr,
++                                  int force_split, int in_flag,
++                                  struct page **pages)
++{
++      struct cpa_data cpa;
++      int ret, cache, checkalias;
++      unsigned long baddr = 0;
++
++      /*
++       * Check whether we are requested to change an unsupported
++       * feature: both masks are passed through canon_pgprot(), and
++       * if they are both empty afterwards and no split is forced
++       * there is nothing to do.
++       */
++      mask_set = canon_pgprot(mask_set);
++      mask_clr = canon_pgprot(mask_clr);
++      if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
++              return 0;
++
++      /* Ensure we are PAGE_SIZE aligned; unaligned input is rounded down with a one-time warning */
++      if (in_flag & CPA_ARRAY) {
++              int i;
++              for (i = 0; i < numpages; i++) {
++                      if (addr[i] & ~PAGE_MASK) {
++                              addr[i] &= PAGE_MASK;
++                              WARN_ON_ONCE(1);
++                      }
++              }
++      } else if (!(in_flag & CPA_PAGES_ARRAY)) {
++              /*
++               * in_flag of CPA_PAGES_ARRAY implies it is aligned.
++               * No need to check in that case
++               */
++              if (*addr & ~PAGE_MASK) {
++                      *addr &= PAGE_MASK;
++                      /*
++                       * People should not be passing in unaligned addresses:
++                       */
++                      WARN_ON_ONCE(1);
++              }
++              /*
++               * Save address for cache flush. *addr is modified in the call
++               * to __change_page_attr_set_clr() below.
++               */
++              baddr = *addr;
++      }
++
++      /* Must avoid aliasing mappings in the highmem code */
++      kmap_flush_unused();
++
++      vm_unmap_aliases();
++
++      cpa.vaddr = addr;
++      cpa.pages = pages;
++      cpa.numpages = numpages;
++      cpa.mask_set = mask_set;
++      cpa.mask_clr = mask_clr;
++      cpa.flags = 0;
++      cpa.curpage = 0;
++      cpa.force_split = force_split;
++
++      if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
++              cpa.flags |= in_flag;
++
++      /* No alias checking for _NX bit modifications */
++      checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
++
++      ret = __change_page_attr_set_clr(&cpa, checkalias);
++
++      /*
++       * Check whether we really changed something. CPA_FLUSHTLB is
++       * only set when an entry was actually rewritten:
++       */
++      if (!(cpa.flags & CPA_FLUSHTLB))
++              goto out;
++
++      /*
++       * No need to flush caches, when we did not set any of the
++       * caching attributes. cache is non-zero only if mask_set holds
++       * caching bits, and is handed to the flush helpers below:
++       */
++      cache = cache_attr(mask_set);
++
++      /*
++       * On success we use clflush, when the CPU supports it to
++       * avoid the wbinvd. If the CPU does not support it and in the
++       * error case we fall back to cpa_flush_all (which uses
++       * wbinvd):
++       */
++      if (!ret && cpu_has_clflush) {
++              if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
++                      cpa_flush_array(addr, numpages, cache,
++                                      cpa.flags, pages);
++              } else
++                      cpa_flush_range(baddr, numpages, cache);
++      } else
++              cpa_flush_all(cache);
++
++out:
++      return ret;
++}
++
++/*
++ * Set the bits of @mask on @numpages pages. A non-zero @array selects
++ * CPA_ARRAY mode, in which @addr is treated as an array of addresses.
++ */
++static inline int change_page_attr_set(unsigned long *addr, int numpages,
++                                     pgprot_t mask, int array)
++{
++      int in_flag = 0;
++
++      if (array)
++              in_flag = CPA_ARRAY;
++
++      return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
++                                      in_flag, NULL);
++}
++
++/*
++ * Clear the bits of @mask on @numpages pages. A non-zero @array selects
++ * CPA_ARRAY mode, in which @addr is treated as an array of addresses.
++ */
++static inline int change_page_attr_clear(unsigned long *addr, int numpages,
++                                       pgprot_t mask, int array)
++{
++      int in_flag = 0;
++
++      if (array)
++              in_flag = CPA_ARRAY;
++
++      return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
++                                      in_flag, NULL);
++}
++
++/* Set the bits of @mask on every page of the @pages array (CPA_PAGES_ARRAY mode). */
++static inline int cpa_set_pages_array(struct page **pages, int numpages,
++                                     pgprot_t mask)
++{
++      pgprot_t clr_none = __pgprot(0);
++
++      return change_page_attr_set_clr(NULL, numpages, mask, clr_none, 0,
++                                      CPA_PAGES_ARRAY, pages);
++}
++
++/* Clear the bits of @mask on every page of the @pages array (CPA_PAGES_ARRAY mode). */
++static inline int cpa_clear_pages_array(struct page **pages, int numpages,
++                                       pgprot_t mask)
++{
++      pgprot_t set_none = __pgprot(0);
++
++      return change_page_attr_set_clr(NULL, numpages, set_none, mask, 0,
++                                      CPA_PAGES_ARRAY, pages);
++}
++
++#ifdef CONFIG_XEN
++/*
++ * Release the memory type of the pseudo-physical range [pstart, pend).
++ *
++ * The range is walked one page at a time and cut into runs whose
++ * machine addresses, as given by phys_to_machine(), are contiguous;
++ * free_memtype() is called once per run on the machine range.
++ */
++static void _free_memtype(u64 pstart, u64 pend)
++{
++      u64 cur, run_ma;
++
++      pstart &= __PHYSICAL_MASK;
++      run_ma = phys_to_machine(pstart);
++
++      for (cur = pstart + PAGE_SIZE; cur < pend; cur += PAGE_SIZE) {
++              /* Still machine-contiguous with the current run? */
++              if (phys_to_machine(cur) == run_ma + (cur - pstart))
++                      continue;
++
++              /* Run ends here: free it and start a new one at cur. */
++              free_memtype(run_ma, run_ma + (cur - pstart));
++              pstart = cur;
++              run_ma = phys_to_machine(cur);
++      }
++
++      /* Free the last (possibly only) run. */
++      free_memtype(run_ma, run_ma + (pend - pstart));
++}
++#define free_memtype _free_memtype
++
++/*
++ * Reserve memory type @req_type for the pseudo-physical range
++ * [pstart, pend).
++ *
++ * The range is walked one page at a time and cut into runs whose
++ * machine addresses, as given by phys_to_machine(), are contiguous;
++ * reserve_memtype() is called once per run on the machine range.
++ *
++ * Returns 0 on success, or the first non-zero reserve_memtype() result.
++ * On failure the runs reserved so far, i.e. [pstart, pcur), are
++ * released again so that a failed call leaves no reservation behind.
++ */
++static int _reserve_memtype(u64 pstart, u64 pend, unsigned long req_type)
++{
++      u64 pcur = pstart &= __PHYSICAL_MASK, pa = pcur;
++      u64 ma = phys_to_machine(pa);
++      int rc = 0;
++
++      while ((pa += PAGE_SIZE) < pend) {
++              if (phys_to_machine(pa) != ma + (pa - pcur)) {
++                      /* Machine contiguity breaks here: reserve the run. */
++                      rc = reserve_memtype(ma, ma + (pa - pcur),
++                                           req_type, NULL);
++                      if (rc)
++                              break;
++                      pcur = pa;
++                      ma = phys_to_machine(pa);
++              }
++      }
++      /* Reserve the last (possibly only) run. */
++      if (likely(!rc))
++              rc = reserve_memtype(ma, ma + (pend - pcur), req_type, NULL);
++
++      /*
++       * Roll back on error only. Testing !rc here would drop the
++       * earlier runs of a successful reservation and would keep them
++       * reserved when a later run failed.
++       */
++      if (unlikely(rc) && pstart < pcur)
++              _free_memtype(pstart, pcur);
++
++      return rc;
++}
++#define reserve_memtype(s, e, r, n) \
++      _reserve_memtype(s, e, BUILD_BUG_ON_ZERO(n) ?: (r))
++#endif
++
++/*
++ * Mark @numpages pages starting at @addr uncached, without touching the
++ * memtype reservation.
++ */
++int _set_memory_uc(unsigned long addr, int numpages)
++{
++      /* UC MINUS for now; see the comments in ioremap_nocache(). */
++      pgprot_t uc_minus = __pgprot(_PAGE_CACHE_UC_MINUS);
++
++      return change_page_attr_set(&addr, numpages, uc_minus, 0);
++}
++
++/*
++ * Reserve the UC MINUS memtype for the range and then switch its
++ * mapping to uncached. The reservation is dropped again if changing the
++ * mapping fails. Returns 0 on success or the error of the failing step.
++ */
++int set_memory_uc(unsigned long addr, int numpages)
++{
++      int rc;
++
++      /* UC MINUS for now; see the comments in ioremap_nocache(). */
++      rc = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
++                           _PAGE_CACHE_UC_MINUS, NULL);
++      if (rc)
++              return rc;
++
++      rc = _set_memory_uc(addr, numpages);
++      if (rc)
++              free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
++
++      return rc;
++}
++EXPORT_SYMBOL(set_memory_uc);
++
++/*
++ * Switch every address in @addr (@addrinarray entries, one page each)
++ * to @new_type caching.
++ *
++ * A memtype reservation is taken per page first. The mappings are then
++ * set to UC MINUS and, when @new_type is _PAGE_CACHE_WC, changed to WC
++ * in a second pass. On any failure the reservations taken so far are
++ * released and the error is returned.
++ */
++int _set_memory_array(unsigned long *addr, int addrinarray,
++              unsigned long new_type)
++{
++      int nr_reserved = 0;
++      int idx;
++      int rc;
++
++      while (nr_reserved < addrinarray) {
++              rc = reserve_memtype(__pa(addr[nr_reserved]),
++                                   __pa(addr[nr_reserved]) + PAGE_SIZE,
++                                   new_type, NULL);
++              if (rc)
++                      goto undo;
++              nr_reserved++;
++      }
++
++      /* UC MINUS for now; see the comments in ioremap_nocache(). */
++      rc = change_page_attr_set(addr, addrinarray,
++                                __pgprot(_PAGE_CACHE_UC_MINUS), 1);
++
++      if (rc == 0 && new_type == _PAGE_CACHE_WC)
++              rc = change_page_attr_set_clr(addr, addrinarray,
++                                            __pgprot(_PAGE_CACHE_WC),
++                                            __pgprot(_PAGE_CACHE_MASK),
++                                            0, CPA_ARRAY, NULL);
++      if (rc == 0)
++              return 0;
++
++undo:
++      for (idx = 0; idx < nr_reserved; idx++)
++              free_memtype(__pa(addr[idx]), __pa(addr[idx]) + PAGE_SIZE);
++
++      return rc;
++}
++
++/* Switch an array of page addresses to UC MINUS caching. */
++int set_memory_array_uc(unsigned long *addr, int addrinarray)
++{
++      unsigned long type = _PAGE_CACHE_UC_MINUS;
++
++      return _set_memory_array(addr, addrinarray, type);
++}
++EXPORT_SYMBOL(set_memory_array_uc);
++
++/* Switch an array of page addresses to WC caching. */
++int set_memory_array_wc(unsigned long *addr, int addrinarray)
++{
++      unsigned long type = _PAGE_CACHE_WC;
++
++      return _set_memory_array(addr, addrinarray, type);
++}
++EXPORT_SYMBOL(set_memory_array_wc);
++
++/*
++ * Switch @numpages pages starting at @addr to WC, without touching the
++ * memtype reservation. The range is first set to UC MINUS and only then
++ * changed to WC.
++ */
++int _set_memory_wc(unsigned long addr, int numpages)
++{
++      /* The first pass modifies addr, so keep the start for the second. */
++      unsigned long start = addr;
++      int rc;
++
++      rc = change_page_attr_set(&addr, numpages,
++                                __pgprot(_PAGE_CACHE_UC_MINUS), 0);
++      if (rc)
++              return rc;
++
++      return change_page_attr_set_clr(&start, numpages,
++                                      __pgprot(_PAGE_CACHE_WC),
++                                      __pgprot(_PAGE_CACHE_MASK),
++                                      0, 0, NULL);
++}
++
++/*
++ * Reserve the WC memtype for the range and then switch its mapping to
++ * WC. When PAT is not enabled this is handled by set_memory_uc()
++ * instead. The reservation is dropped again if changing the mapping
++ * fails. Returns 0 on success or the error of the failing step.
++ */
++int set_memory_wc(unsigned long addr, int numpages)
++{
++      int rc;
++
++      if (!pat_enabled)
++              return set_memory_uc(addr, numpages);
++
++      rc = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
++                           _PAGE_CACHE_WC, NULL);
++      if (rc)
++              return rc;
++
++      rc = _set_memory_wc(addr, numpages);
++      if (rc)
++              free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
++
++      return rc;
++}
++EXPORT_SYMBOL(set_memory_wc);
++
++/*
++ * Clear all _PAGE_CACHE_MASK bits on @numpages pages starting at @addr,
++ * without touching the memtype reservation.
++ */
++int _set_memory_wb(unsigned long addr, int numpages)
++{
++      pgprot_t cache_bits = __pgprot(_PAGE_CACHE_MASK);
++
++      return change_page_attr_clear(&addr, numpages, cache_bits, 0);
++}
++
++/*
++ * Clear the caching bits on the range and, if that worked, release its
++ * memtype reservation. Returns 0 on success or the error from changing
++ * the mapping.
++ */
++int set_memory_wb(unsigned long addr, int numpages)
++{
++      int rc = _set_memory_wb(addr, numpages);
++
++      if (!rc)
++              free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
++
++      return rc;
++}
++EXPORT_SYMBOL(set_memory_wb);
++
++/*
++ * Clear the caching bits on every address in @addr and, if that
++ * worked, release the per-page memtype reservations.
++ */
++int set_memory_array_wb(unsigned long *addr, int addrinarray)
++{
++      int idx = 0;
++      int rc;
++
++      rc = change_page_attr_clear(addr, addrinarray,
++                                  __pgprot(_PAGE_CACHE_MASK), 1);
++      if (rc)
++              return rc;
++
++      while (idx < addrinarray) {
++              free_memtype(__pa(addr[idx]), __pa(addr[idx]) + PAGE_SIZE);
++              idx++;
++      }
++
++      return 0;
++}
++EXPORT_SYMBOL(set_memory_array_wb);
++
++/*
++ * Clear _PAGE_NX on @numpages pages starting at @addr. Does nothing and
++ * returns 0 when _PAGE_NX is not part of __supported_pte_mask.
++ */
++int set_memory_x(unsigned long addr, int numpages)
++{
++      if (__supported_pte_mask & _PAGE_NX)
++              return change_page_attr_clear(&addr, numpages,
++                                            __pgprot(_PAGE_NX), 0);
++
++      return 0;
++}
++EXPORT_SYMBOL(set_memory_x);
++
++/*
++ * Set _PAGE_NX on @numpages pages starting at @addr. Does nothing and
++ * returns 0 when _PAGE_NX is not part of __supported_pte_mask.
++ */
++int set_memory_nx(unsigned long addr, int numpages)
++{
++      if (__supported_pte_mask & _PAGE_NX)
++              return change_page_attr_set(&addr, numpages,
++                                          __pgprot(_PAGE_NX), 0);
++
++      return 0;
++}
++EXPORT_SYMBOL(set_memory_nx);
++
++/* Clear _PAGE_RW on @numpages pages starting at @addr. */
++int set_memory_ro(unsigned long addr, int numpages)
++{
++      pgprot_t rw_bit = __pgprot(_PAGE_RW);
++
++      return change_page_attr_clear(&addr, numpages, rw_bit, 0);
++}
++EXPORT_SYMBOL_GPL(set_memory_ro);
++
++/* Set _PAGE_RW on @numpages pages starting at @addr. */
++int set_memory_rw(unsigned long addr, int numpages)
++{
++      pgprot_t rw_bit = __pgprot(_PAGE_RW);
++
++      return change_page_attr_set(&addr, numpages, rw_bit, 0);
++}
++EXPORT_SYMBOL_GPL(set_memory_rw);
++
++/* Clear _PAGE_PRESENT on @numpages pages starting at @addr. */
++int set_memory_np(unsigned long addr, int numpages)
++{
++      pgprot_t present_bit = __pgprot(_PAGE_PRESENT);
++
++      return change_page_attr_clear(&addr, numpages, present_bit, 0);
++}
++
++/*
++ * Run a CPA request over @numpages pages starting at @addr with empty
++ * set and clear masks and force_split set to 1.
++ */
++int set_memory_4k(unsigned long addr, int numpages)
++{
++      pgprot_t no_bits = __pgprot(0);
++
++      return change_page_attr_set_clr(&addr, numpages, no_bits, no_bits,
++                                      1, 0, NULL);
++}
++
++/* struct page flavour of set_memory_uc(): operates on page_address(@page). */
++int set_pages_uc(struct page *page, int numpages)
++{
++      return set_memory_uc((unsigned long)page_address(page), numpages);
++}
++EXPORT_SYMBOL(set_pages_uc);
++
++/*
++ * Switch every page in @pages (@addrinarray entries) to @new_type
++ * caching. Highmem pages are skipped when reserving and freeing
++ * memtypes.
++ *
++ * A memtype reservation is taken per page first. The mappings are then
++ * set to UC MINUS and, when @new_type is _PAGE_CACHE_WC, changed to WC
++ * in a second pass. On any failure the reservations taken so far are
++ * released and -EINVAL is returned.
++ */
++static int _set_pages_array(struct page **pages, int addrinarray,
++              unsigned long new_type)
++{
++      unsigned long phys;
++      int nr_done;
++      int idx;
++      int rc;
++
++      for (nr_done = 0; nr_done < addrinarray; nr_done++) {
++              if (PageHighMem(pages[nr_done]))
++                      continue;
++              phys = page_to_pfn(pages[nr_done]) << PAGE_SHIFT;
++              if (reserve_memtype(phys, phys + PAGE_SIZE, new_type, NULL))
++                      goto undo;
++      }
++
++      rc = cpa_set_pages_array(pages, addrinarray,
++                               __pgprot(_PAGE_CACHE_UC_MINUS));
++      if (rc == 0 && new_type == _PAGE_CACHE_WC)
++              rc = change_page_attr_set_clr(NULL, addrinarray,
++                                            __pgprot(_PAGE_CACHE_WC),
++                                            __pgprot(_PAGE_CACHE_MASK),
++                                            0, CPA_PAGES_ARRAY, pages);
++      if (rc == 0)
++              return 0;
++
++undo:
++      /* Release only what was reserved before the failure. */
++      for (idx = 0; idx < nr_done; idx++) {
++              if (PageHighMem(pages[idx]))
++                      continue;
++              phys = page_to_pfn(pages[idx]) << PAGE_SHIFT;
++              free_memtype(phys, phys + PAGE_SIZE);
++      }
++      return -EINVAL;
++}
++
++/* Switch an array of pages to UC MINUS caching. */
++int set_pages_array_uc(struct page **pages, int addrinarray)
++{
++      unsigned long type = _PAGE_CACHE_UC_MINUS;
++
++      return _set_pages_array(pages, addrinarray, type);
++}
++EXPORT_SYMBOL(set_pages_array_uc);
++
++/* Switch an array of pages to WC caching. */
++int set_pages_array_wc(struct page **pages, int addrinarray)
++{
++      unsigned long type = _PAGE_CACHE_WC;
++
++      return _set_pages_array(pages, addrinarray, type);
++}
++EXPORT_SYMBOL(set_pages_array_wc);
++
++/* struct page flavour of set_memory_wb(): operates on page_address(@page). */
++int set_pages_wb(struct page *page, int numpages)
++{
++      return set_memory_wb((unsigned long)page_address(page), numpages);
++}
++EXPORT_SYMBOL(set_pages_wb);
++
++/*
++ * Clear the caching bits on every page in @pages and, if that worked,
++ * release the memtype reservation of each non-highmem page.
++ */
++int set_pages_array_wb(struct page **pages, int addrinarray)
++{
++      unsigned long phys;
++      int idx;
++      int rc;
++
++      rc = cpa_clear_pages_array(pages, addrinarray,
++                                 __pgprot(_PAGE_CACHE_MASK));
++      if (rc)
++              return rc;
++
++      for (idx = 0; idx < addrinarray; idx++) {
++              if (!PageHighMem(pages[idx])) {
++                      phys = page_to_pfn(pages[idx]) << PAGE_SHIFT;
++                      free_memtype(phys, phys + PAGE_SIZE);
++              }
++      }
++
++      return 0;
++}
++EXPORT_SYMBOL(set_pages_array_wb);
++
++/* struct page flavour of set_memory_x(): operates on page_address(@page). */
++int set_pages_x(struct page *page, int numpages)
++{
++      return set_memory_x((unsigned long)page_address(page), numpages);
++}
++EXPORT_SYMBOL(set_pages_x);
++
++/* struct page flavour of set_memory_nx(): operates on page_address(@page). */
++int set_pages_nx(struct page *page, int numpages)
++{
++      return set_memory_nx((unsigned long)page_address(page), numpages);
++}
++EXPORT_SYMBOL(set_pages_nx);
++
++/* struct page flavour of set_memory_ro(): operates on page_address(@page). */
++int set_pages_ro(struct page *page, int numpages)
++{
++      return set_memory_ro((unsigned long)page_address(page), numpages);
++}
++
++/* struct page flavour of set_memory_rw(): operates on page_address(@page). */
++int set_pages_rw(struct page *page, int numpages)
++{
++      return set_memory_rw((unsigned long)page_address(page), numpages);
++}
++
++#ifdef CONFIG_DEBUG_PAGEALLOC
++
++/*
++ * Set _PAGE_PRESENT and _PAGE_RW on @numpages pages starting at
++ * page_address(@page), going straight to the CPA core with alias
++ * checking disabled.
++ */
++static int __set_pages_p(struct page *page, int numpages)
++{
++      unsigned long start = (unsigned long) page_address(page);
++      struct cpa_data request = {
++              .vaddr = &start,
++              .numpages = numpages,
++              .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
++              .mask_clr = __pgprot(0),
++              .flags = 0,
++      };
++
++      /*
++       * Setting the present flag needs no alias checking. Doing it
++       * anyway could require breaking large pages for the 64-bit
++       * kernel text mappings, which is awkward from atomic context
++       * in particular. Let's keep it simple!
++       */
++      return __change_page_attr_set_clr(&request, 0);
++}
++
++/*
++ * Clear _PAGE_PRESENT and _PAGE_RW on @numpages pages starting at
++ * page_address(@page), going straight to the CPA core with alias
++ * checking disabled.
++ */
++static int __set_pages_np(struct page *page, int numpages)
++{
++      unsigned long start = (unsigned long) page_address(page);
++      struct cpa_data request = {
++              .vaddr = &start,
++              .numpages = numpages,
++              .mask_set = __pgprot(0),
++              .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
++              .flags = 0,
++      };
++
++      /*
++       * Clearing the present flag needs no alias checking. Doing it
++       * anyway could require breaking large pages for the 64-bit
++       * kernel text mappings, which is awkward from atomic context
++       * in particular. Let's keep it simple!
++       */
++      return __change_page_attr_set_clr(&request, 0);
++}
++
++/*
++ * Map (@enable != 0) or unmap (@enable == 0) @numpages pages starting
++ * at @page. Highmem pages are ignored, and nothing is remapped while
++ * debug_pagealloc_enabled is still clear.
++ */
++void kernel_map_pages(struct page *page, int numpages, int enable)
++{
++      if (PageHighMem(page))
++              return;
++
++      if (!enable)
++              debug_check_no_locks_freed(page_address(page),
++                                         numpages * PAGE_SIZE);
++
++      /* Do not call c_p_a() before the page allocator is up. */
++      if (!debug_pagealloc_enabled)
++              return;
++
++      /*
++       * The calls cannot fail, so their return values are dropped:
++       * large pages are not used for the identity mappings at boot,
++       * hence no large page split and no memory allocation here.
++       */
++      if (!enable)
++              __set_pages_np(page, numpages);
++      else
++              __set_pages_p(page, numpages);
++
++      /*
++       * An IPI flushing every CPU's TLB would be the right thing, but
++       * that can deadlock, so only the current CPU is flushed.
++       */
++      __flush_tlb_all();
++}
++
++#ifdef CONFIG_HIBERNATION
++
++/*
++ * Report whether the kernel mapping of @page has _PAGE_PRESENT set.
++ * Highmem pages are always reported as not present.
++ */
++bool kernel_page_present(struct page *page)
++{
++      unsigned long va;
++      unsigned int level;
++      pte_t *ptep;
++
++      if (PageHighMem(page))
++              return false;
++
++      va = (unsigned long)page_address(page);
++      ptep = lookup_address(va, &level);
++      return (__pte_val(*ptep) & _PAGE_PRESENT);
++}
++
++#endif /* CONFIG_HIBERNATION */
++
++#endif /* CONFIG_DEBUG_PAGEALLOC */
++
++/*
++ * Non-zero when @va lies in the range the callers treat as a secondary
++ * mapping: [VMALLOC_START, VMALLOC_END) on 64-bit, at or above
++ * high_memory otherwise.
++ */
++static inline int in_secondary_range(unsigned long va)
++{
++#ifndef CONFIG_X86_64
++      return va >= (unsigned long)high_memory;
++#else
++      return va >= VMALLOC_START && va < VMALLOC_END;
++#endif
++}
++
++/*
++ * Write-protect the 4K mapping of @va through the hypervisor.
++ *
++ * BUGs if @va has no PTE, if the mapping is not at PG_LEVEL_4K, or if
++ * the hypervisor rejects the update. When @va is in the secondary range
++ * the frame behind it is handled as well (see below).
++ */
++static void __make_page_readonly(unsigned long va)
++{
++      pte_t *pte;
++      unsigned int level;
++
++      pte = lookup_address(va, &level);
++      BUG_ON(!pte || level != PG_LEVEL_4K);
++      if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
++              BUG();
++      if (in_secondary_range(va)) {
++              unsigned long pfn = pte_pfn(*pte);
++
++              /*
++               * With HIGHMEM, frames at or above highstart_pfn only get
++               * their unused kmaps flushed; every other frame is
++               * write-protected again via its __va() address.
++               */
++#ifdef CONFIG_HIGHMEM
++              if (pfn >= highstart_pfn)
++                      kmap_flush_unused(); /* flush stale writable kmaps */
++              else
++#endif
++                      __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
++      }
++}
++
++/*
++ * Make the 4K mapping of @va writable through the hypervisor, passing
++ * UVMF_INVLPG with the update.
++ *
++ * BUGs if @va has no PTE, if the mapping is not at PG_LEVEL_4K, or if
++ * the hypervisor rejects the update. When @va is in the secondary range
++ * the frame's __va() mapping is made writable too, except (with
++ * HIGHMEM) for frames at or above highstart_pfn.
++ */
++static void __make_page_writable(unsigned long va)
++{
++      pte_t *pte;
++      unsigned int level;
++
++      pte = lookup_address(va, &level);
++      BUG_ON(!pte || level != PG_LEVEL_4K);
++      if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
++              BUG();
++      if (in_secondary_range(va)) {
++              unsigned long pfn = pte_pfn(*pte);
++
++#ifdef CONFIG_HIGHMEM
++              if (pfn < highstart_pfn)
++#endif
++                      __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
++      }
++}
++
++/*
++ * Write-protect the page at @va, unless the Xen feature @feature is
++ * present, in which case nothing is done.
++ */
++void make_page_readonly(void *va, unsigned int feature)
++{
++      if (xen_feature(feature))
++              return;
++
++      __make_page_readonly((unsigned long)va);
++}
++
++/*
++ * Make the page at @va writable, unless the Xen feature @feature is
++ * present, in which case nothing is done.
++ */
++void make_page_writable(void *va, unsigned int feature)
++{
++      if (xen_feature(feature))
++              return;
++
++      __make_page_writable((unsigned long)va);
++}
++
++/*
++ * Write-protect @nr consecutive pages starting at @va, one PAGE_SIZE
++ * step at a time. Nothing is done when the Xen feature @feature is
++ * present.
++ */
++void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
++{
++      unsigned long addr = (unsigned long)va;
++
++      if (xen_feature(feature))
++              return;
++
++      while (nr--) {
++              __make_page_readonly(addr);
++              addr += PAGE_SIZE;
++      }
++}
++
++/*
++ * Make @nr consecutive pages starting at @va writable, one PAGE_SIZE
++ * step at a time. Nothing is done when the Xen feature @feature is
++ * present.
++ */
++void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
++{
++      unsigned long addr = (unsigned long)va;
++
++      if (xen_feature(feature))
++              return;
++
++      while (nr--) {
++              __make_page_writable(addr);
++              addr += PAGE_SIZE;
++      }
++}
++
++/*
++ * The testcases use internal knowledge of the implementation that shouldn't
++ * be exposed to the rest of the kernel. Include these directly here.
++ */
++#ifdef CONFIG_CPA_DEBUG
++#include "pageattr-test.c"
++#endif
diff --cc arch/x86/mm/pat-xen.c

index 0000000,0000000..6207e52

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/pat-xen.c
@@@ -1,0 -1,0 +1,840 @@@
++/*
++ * Handle caching attributes in page tables (PAT)
++ *
++ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
++ *          Suresh B Siddha <suresh.b.siddha@intel.com>
++ *
++ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
++ */
++
++#include <linux/seq_file.h>
++#include <linux/bootmem.h>
++#include <linux/debugfs.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/mm.h>
++#include <linux/fs.h>
++#include <linux/rbtree.h>
++
++#include <asm/cacheflush.h>
++#include <asm/processor.h>
++#include <asm/tlbflush.h>
++#include <asm/x86_init.h>
++#include <asm/pgtable.h>
++#include <asm/fcntl.h>
++#include <asm/e820.h>
++#include <asm/mtrr.h>
++#include <asm/page.h>
++#include <asm/msr.h>
++#include <asm/pat.h>
++#include <asm/io.h>
++
++#include "pat_internal.h"
++
++#ifdef CONFIG_X86_PAT
++int __read_mostly pat_enabled = 1;
++
++/* Clear pat_enabled and log @reason at KERN_INFO level. */
++static inline void pat_disable(const char *reason)
++{
++      pat_enabled = 0;
++      printk(KERN_INFO "%s\n", reason);
++}
++
++/* "nopat" early parameter handler: disables PAT; @str is not used. */
++static int __init nopat(char *str)
++{
++      pat_disable("PAT support disabled.");
++      return 0;
++}
++early_param("nopat", nopat);
++#else
++/* Stub used without CONFIG_X86_PAT: does nothing with @reason. */
++static inline void pat_disable(const char *reason)
++{
++      (void)reason;
++}
++#endif
++
++
++int pat_debug_enable;
++
++/* "debugpat" boot option handler: sets pat_debug_enable; @str is not used. */
++static int __init pat_debug_setup(char *str)
++{
++      pat_debug_enable = 1;
++      return 0;
++}
++__setup("debugpat", pat_debug_setup);
++
++static u64 __read_mostly boot_pat_state;
++
++/* Memory type codes that PAT() places into the individual PAT entries. */
++enum {
++      PAT_UC = 0,             /* uncached */
++      PAT_WC = 1,             /* Write combining */
++      PAT_WT = 4,             /* Write Through */
++      PAT_WP = 5,             /* Write Protected */
++      PAT_WB = 6,             /* Write Back (default) */
++      PAT_UC_MINUS = 7,       /* UC, but can be overridden by MTRR */
++};
++
++/* PAT(x, y): type PAT_<y> shifted into byte @x of a 64-bit PAT value. */
++#define PAT(x, y)     ((u64)PAT_ ## y << ((x)*8))
++
++/*
++ * pat_init - set up (native) or read back (Xen) the PAT MSR on this CPU.
++ *
++ * Does nothing when pat_enabled is clear. A zero boot_pat_state is taken
++ * to mean "first call" (boot CPU): lack of PAT support then disables PAT,
++ * whereas on any later call it is fatal. The first call also records the
++ * value in boot_pat_state and prints the "x86 PAT enabled" message.
++ */
++void pat_init(void)
++{
++      u64 pat;
++      bool boot_cpu = !boot_pat_state;
++
++      if (!pat_enabled)
++              return;
++
++      if (!cpu_has_pat) {
++              if (!boot_pat_state) {
++                      pat_disable("PAT not supported by CPU.");
++                      return;
++              } else {
++                      /*
++                       * If this happens we are on a secondary CPU, but
++                       * switched to PAT on the boot CPU. We have no way to
++                       * undo PAT.
++                       */
++                      printk(KERN_ERR "PAT enabled, "
++                             "but not supported by secondary CPU\n");
++                      BUG();
++              }
++      }
++
++#ifndef CONFIG_XEN
++      /* Set PWT to Write-Combining. All other bits stay the same */
++      /*
++       * PTE encoding used in Linux:
++       *      PAT
++       *      |PCD
++       *      ||PWT
++       *      |||
++       *      000 WB          _PAGE_CACHE_WB
++       *      001 WC          _PAGE_CACHE_WC
++       *      010 UC-         _PAGE_CACHE_UC_MINUS
++       *      011 UC          _PAGE_CACHE_UC
++       * PAT bit unused
++       */
++      pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
++            PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
++
++      /* Boot CPU check */
++      if (!boot_pat_state)
++              rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
++
++      wrmsrl(MSR_IA32_CR_PAT, pat);
++#else
++      /*
++       * PAT settings are part of the hypervisor interface, and their
++       * assignment cannot be changed.
++       */
++      rdmsrl(MSR_IA32_CR_PAT, pat);
++      if (!boot_pat_state)
++              boot_pat_state = pat;
++#endif
++
++      if (boot_cpu)
++              printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
++                     smp_processor_id(), boot_pat_state, pat);
++}
++
++#undef PAT
++
++static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
++
++static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end);
++/*
++ * Xen stand-in for mtrr_type_lookup() over [@start, @end).
++ *
++ * The initial domain uses the real MTRR lookup. Any other domain gets
++ * a synthesized answer: ranges that pat_pagerange_is_ram() reports as
++ * entirely RAM are write-back, everything else (non-RAM or mixed) is
++ * uncachable.
++ *
++ * RAM must be reported as MTRR_TYPE_WRBACK: pat_x_mtrr_type() keeps a
++ * WB request only when the lookup yields WRBACK, so any other answer
++ * downgrades every WB request on RAM to UC_MINUS.
++ */
++static inline u8 _mtrr_type_lookup(u64 start, u64 end)
++{
++      if (is_initial_xendomain())
++              return mtrr_type_lookup(start, end);
++      return pat_pagerange_is_ram(start, end) > 0
++             ? MTRR_TYPE_WRBACK : MTRR_TYPE_UNCACHABLE;
++}
++#define mtrr_type_lookup _mtrr_type_lookup
++
++/*
++ * Intersect the requested PAT memory type with the MTRR memory type and
++ * return the result as a PAT type (PAT and MTRR use different values).
++ * Based on the "Effective Memory Type" tables in IA-32 SDM vol 3a.
++ *
++ * Only a _PAGE_CACHE_WB request consults the MTRR: it stays WB when the
++ * MTRR type is write-back and becomes UC_MINUS otherwise. Every other
++ * request is returned as is.
++ */
++static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
++{
++      if (req_type != _PAGE_CACHE_WB)
++              return req_type;
++
++      /* WB request: use the MTRR hint to get the effective type. */
++      if (mtrr_type_lookup(start, end) == MTRR_TYPE_WRBACK)
++              return _PAGE_CACHE_WB;
++
++      return _PAGE_CACHE_UC_MINUS;
++}
++
++/*
++ * Classify the pages of [@start, @end).
++ *
++ * Returns 1 if every page is RAM, 0 if none is (also for an empty
++ * range), and -1 as soon as both kinds have been seen.
++ */
++static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
++{
++      int seen_ram = 0, seen_other = 0;
++      unsigned long nr;
++
++      for (nr = (start >> PAGE_SHIFT); nr < (end >> PAGE_SHIFT); ++nr) {
++              /*
++               * For legacy reasons the legacy ISA region is tracked as
++               * non-RAM, whatever e820 says (RAM/reserved/not listed at
++               * all), so that /dev/mem users can map parts of it.
++               */
++              if (nr < (ISA_END_ADDRESS >> PAGE_SHIFT) ||
++                  !page_is_ram(mfn_to_local_pfn(nr)))
++                      seen_other = 1;
++              else
++                      seen_ram = 1;
++
++              if (seen_ram && seen_other)
++                      return -1;
++      }
++
++      return seen_ram;
++}
++
++/*
++ * For RAM pages the memory type is kept in the page flags. Two passes:
++ * - look at the memtype of every page in the range; any page that is
++ *   already tracked is a conflict (-EBUSY, tracked type in *@new_type)
++ * - with no conflict, store the new memtype on every page
++ *
++ * A request for _PAGE_CACHE_UC warns once and is handled as UC_MINUS.
++ * On success *@new_type (if non-NULL) receives the type actually set.
++ */
++static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
++                                unsigned long *new_type)
++{
++      unsigned long mfn;
++
++      if (req_type == _PAGE_CACHE_UC) {
++              /* We do not support strong UC */
++              WARN_ON_ONCE(1);
++              req_type = _PAGE_CACHE_UC_MINUS;
++      }
++
++      /* Pass 1: conflict detection. */
++      for (mfn = (start >> PAGE_SHIFT); mfn < (end >> PAGE_SHIFT); ++mfn) {
++              unsigned long pfn = mfn_to_local_pfn(mfn);
++              unsigned long tracked;
++
++              BUG_ON(!pfn_valid(pfn));
++              tracked = get_page_memtype(pfn_to_page(pfn));
++              if (tracked == -1)
++                      continue;
++
++              printk(KERN_INFO "reserve_ram_pages_type failed "
++                      "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
++                      start, end, tracked, req_type);
++              if (new_type)
++                      *new_type = tracked;
++
++              return -EBUSY;
++      }
++
++      if (new_type)
++              *new_type = req_type;
++
++      /* Pass 2: commit the type. */
++      for (mfn = (start >> PAGE_SHIFT); mfn < (end >> PAGE_SHIFT); ++mfn)
++              set_page_memtype(pfn_to_page(mfn_to_local_pfn(mfn)), req_type);
++
++      return 0;
++}
++
++/*
++ * Reset the tracked memtype of every page in [@start, @end) to -1.
++ * BUGs on a frame whose local pfn is not valid. Always returns 0.
++ */
++static int free_ram_pages_type(u64 start, u64 end)
++{
++      unsigned long mfn;
++
++      for (mfn = (start >> PAGE_SHIFT); mfn < (end >> PAGE_SHIFT); ++mfn) {
++              unsigned long pfn = mfn_to_local_pfn(mfn);
++
++              BUG_ON(!pfn_valid(pfn));
++              set_page_memtype(pfn_to_page(pfn), -1);
++      }
++
++      return 0;
++}
++
++/*
++ * reserve_memtype - reserve the range [@start, @end) with a memory type.
++ *
++ * req_type typically has one of the:
++ * - _PAGE_CACHE_WB
++ * - _PAGE_CACHE_WC
++ * - _PAGE_CACHE_UC_MINUS
++ * - _PAGE_CACHE_UC
++ *
++ * If new_type is NULL, function will return an error if it cannot reserve the
++ * region with req_type. If new_type is non-NULL, function will return
++ * available type in new_type in case of no error. In case of any error
++ * it will return a negative return value.
++ *
++ * Errors produced here: -EINVAL for a range mixing RAM and non-RAM
++ * pages, -ENOMEM when the tracking entry cannot be allocated; failures
++ * of reserve_ram_pages_type() and rbt_memtype_check_insert() are passed
++ * through.
++ */
++int reserve_memtype(u64 start, u64 end, unsigned long req_type,
++                  unsigned long *new_type)
++{
++      struct memtype *new;
++      unsigned long actual_type;
++      int is_range_ram;
++      int err = 0;
++
++      BUG_ON(start >= end); /* end is exclusive */
++
++      if (!pat_enabled) {
++              /* This is identical to page table setting without PAT */
++              if (new_type) {
++                      if (req_type == _PAGE_CACHE_WC)
++                              *new_type = _PAGE_CACHE_UC_MINUS;
++                      else
++                              *new_type = req_type & _PAGE_CACHE_MASK;
++              }
++              return 0;
++      }
++
++      /* Low ISA region is always mapped WB in page table. No need to track */
++      if (x86_platform.is_untracked_pat_range(start, end)) {
++              if (new_type)
++                      *new_type = _PAGE_CACHE_WB;
++              return 0;
++      }
++
++      /*
++       * Call mtrr_lookup to get the type hint. This is an
++       * optimization for /dev/mem mmap'ers into WB memory (BIOS
++       * tools and ACPI tools). Use WB request for WB memory and use
++       * UC_MINUS otherwise.
++       */
++      actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);
++
++      if (new_type)
++              *new_type = actual_type;
++
++      /* All-RAM ranges are tracked per page, not in the memtype tree. */
++      is_range_ram = pat_pagerange_is_ram(start, end);
++      if (is_range_ram == 1) {
++
++              err = reserve_ram_pages_type(start, end, req_type, new_type);
++
++              return err;
++      } else if (is_range_ram < 0) {
++              return -EINVAL;
++      }
++
++      new  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
++      if (!new)
++              return -ENOMEM;
++
++      new->start      = start;
++      new->end        = end;
++      new->type       = actual_type;
++
++      spin_lock(&memtype_lock);
++
++      err = rbt_memtype_check_insert(new, new_type);
++      if (err) {
++              /* new->type is read for the message before new is freed. */
++              printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
++                     "track %s, req %s\n",
++                     start, end, cattr_name(new->type), cattr_name(req_type));
++              kfree(new);
++              spin_unlock(&memtype_lock);
++
++              return err;
++      }
++
++      spin_unlock(&memtype_lock);
++
++      dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
++              start, end, cattr_name(new->type), cattr_name(req_type),
++              new_type ? cattr_name(*new_type) : "-");
++
++      return err;
++}
++
++/*
++ * free_memtype - drop the memory type reservation for [@start, @end).
++ *
++ * Returns 0 when PAT is disabled, when the range is untracked, or when
++ * the reservation was released; -EINVAL for a range mixing RAM and
++ * non-RAM pages or when no tracking entry is found. All-RAM ranges
++ * return the result of free_ram_pages_type().
++ */
++int free_memtype(u64 start, u64 end)
++{
++      struct memtype *entry;
++      int is_range_ram;
++
++      if (!pat_enabled)
++              return 0;
++
++      /* Low ISA region is always mapped WB. No need to track */
++      if (x86_platform.is_untracked_pat_range(start, end))
++              return 0;
++
++      is_range_ram = pat_pagerange_is_ram(start, end);
++      if (is_range_ram == 1)
++              return free_ram_pages_type(start, end);
++      if (is_range_ram < 0)
++              return -EINVAL;
++
++      spin_lock(&memtype_lock);
++      entry = rbt_memtype_erase(start, end);
++      spin_unlock(&memtype_lock);
++
++      if (entry == NULL) {
++              printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
++                      current->comm, current->pid, start, end);
++              return -EINVAL;
++      }
++
++      kfree(entry);
++
++      dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
++
++      return 0;
++}
++
++
++#ifndef CONFIG_XEN
++/**
++ * lookup_memtype - Looks up the memory type for a physical address
++ * @paddr: physical address of which memory type needs to be looked up
++ *
++ * Only to be called when PAT is enabled
++ *
++ * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
++ * _PAGE_CACHE_UC
++ */
++static unsigned long lookup_memtype(u64 paddr)
++{
++      struct memtype *entry;
++      int rettype = _PAGE_CACHE_WB;
++
++      /* Untracked ranges are reported as WB. */
++      if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
++              return rettype;
++
++      if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
++              rettype = get_page_memtype(pfn_to_page(paddr >> PAGE_SHIFT));
++              /*
++               * -1 from get_page_memtype() implies RAM page is in its
++               * default state and not reserved, and hence of type WB
++               */
++              if (rettype == -1)
++                      rettype = _PAGE_CACHE_WB;
++
++              return rettype;
++      }
++
++      /* Non-RAM: use the tracked entry, or UC_MINUS when there is none. */
++      spin_lock(&memtype_lock);
++      entry = rbt_memtype_lookup(paddr);
++      if (entry == NULL)
++              rettype = _PAGE_CACHE_UC_MINUS;
++      else
++              rettype = entry->type;
++      spin_unlock(&memtype_lock);
++
++      return rettype;
++}
++#endif
++
++/**
++ * io_reserve_memtype - Request a memory type mapping for a region of memory
++ * @start: start (physical address) of the region
++ * @end: end (physical address) of the region
++ * @type: A pointer to memtype, with requested type. On success, requested
++ * or any other compatible type that was available for the region is returned
++ *
++ * On success, returns 0
++ * On failure, returns non-zero: the reserve_memtype() error, or -EBUSY
++ * when the type obtained is not allowed or the kernel mapping cannot be
++ * synced (the reservation is released again in both cases)
++ */
++int io_reserve_memtype(resource_size_t start, resource_size_t end,
++                      unsigned long *type)
++{
++      resource_size_t size = end - start;
++      unsigned long req_type = *type;
++      unsigned long new_type;
++      int ret;
++
++      WARN_ON_ONCE(iomem_map_sanity_check(start, size));
++
++      ret = reserve_memtype(start, end, req_type, &new_type);
++      if (ret)
++              return ret;
++
++      if (!is_new_memtype_allowed(start, size, req_type, new_type) ||
++          kernel_map_sync_memtype(start, size, new_type) < 0) {
++              free_memtype(start, end);
++              return -EBUSY;
++      }
++
++      *type = new_type;
++      return 0;
++}
++
++/**
++ * io_free_memtype - Release a memory type mapping for a region of memory
++ * @start: start (physical address) of the region
++ * @end: end (physical address) of the region
++ *
++ * Calls free_memtype() and discards its return value.
++ */
++void io_free_memtype(resource_size_t start, resource_size_t end)
++{
++      free_memtype(start, end);
++}
++
++/* Returns @vma_prot unchanged; @file, @mfn and @size are not used. */
++pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
++                              unsigned long size, pgprot_t vma_prot)
++{
++      return vma_prot;
++}
++
++#ifdef CONFIG_STRICT_DEVMEM
++/*
++ * This check is done in drivers/char/mem.c in case of STRICT_DEVMEM,
++ * so every range is accepted here (always returns 1).
++ */
++static inline int range_is_allowed(unsigned long mfn, unsigned long size)
++{
++      return 1;
++}
++#else
++/*
++ * This check is needed to avoid cache aliasing when PAT is enabled.
++ *
++ * Walks @size bytes starting at frame @mfn one page at a time and
++ * returns 0 (after logging) on the first frame devmem_is_allowed()
++ * rejects, 1 otherwise. Without PAT everything is allowed.
++ */
++static inline int range_is_allowed(unsigned long mfn, unsigned long size)
++{
++      u64 from = ((u64)mfn) << PAGE_SHIFT;
++      u64 to = from + size;
++      u64 cursor;
++
++      if (!pat_enabled)
++              return 1;
++
++      for (cursor = from; cursor < to; cursor += PAGE_SIZE, mfn++) {
++              if (devmem_is_allowed(mfn))
++                      continue;
++
++              printk(KERN_INFO
++              "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
++                      current->comm, from, to);
++              return 0;
++      }
++
++      return 1;
++}
++#endif /* CONFIG_STRICT_DEVMEM */
++
++/*
++ * phys_mem_access_prot_allowed - vet a mapping request and pick its
++ * cache attribute.
++ * @file: file being mapped; O_DSYNC in f_flags selects UC_MINUS
++ * @mfn: first frame of the range
++ * @size: length of the range in bytes
++ * @vma_prot: protection to update; its cache bits are replaced
++ *
++ * Returns 0 if range_is_allowed() rejects the range (@vma_prot is left
++ * untouched), 1 otherwise.
++ */
++int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
++                              unsigned long size, pgprot_t *vma_prot)
++{
++      unsigned long flags = _PAGE_CACHE_WB;
++
++      if (!range_is_allowed(mfn, size))
++              return 0;
++
++      if (file->f_flags & O_DSYNC)
++              flags = _PAGE_CACHE_UC_MINUS;
++
++      /*
++       * The fallback below only concerns 32-bit CPUs without any MTRR
++       * style facility, hence the CONFIG_X86_32 guard.
++       */
++#ifdef CONFIG_X86_32
++#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */
++      /*
++       * On the PPro and successors, the MTRRs are used to set
++       * memory types for physical addresses outside main memory,
++       * so blindly setting UC or PWT on those pages is wrong.
++       * For Pentiums and earlier, the surround logic should disable
++       * caching for the high addresses through the KEN pin, but
++       * we maintain the tradition of paranoia in this code.
++       */
++      if (!pat_enabled &&
++          !(boot_cpu_has(X86_FEATURE_MTRR) ||
++            boot_cpu_has(X86_FEATURE_K6_MTRR) ||
++            boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
++            boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
++          (mfn << PAGE_SHIFT) >= __pa(high_memory)) {
++              flags = _PAGE_CACHE_UC;
++      }
++#endif
++#endif
++
++      *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
++                           flags);
++      return 1;
++}
++
++/*
++ * Change the memory type for the physical address range in kernel identity
++ * mapping space if that range is a part of identity map.
++ *
++ * Forwards the frame number of @ma, @size and @flags to
++ * ioremap_check_change_attr() and returns its result.
++ */
++int kernel_map_sync_memtype(u64 ma, unsigned long size, unsigned long flags)
++{
++      return ioremap_check_change_attr(ma >> PAGE_SHIFT, size, flags);
++}
++
++#ifndef CONFIG_XEN
++/*
++ * Internal interface to reserve a range of physical memory with prot.
++ * Reserved non RAM regions only and after successful reserve_memtype,
++ * this func also keeps identity mapping (if any) in sync with this new prot.
++ *
++ * @vma_prot is in/out: its cache bits are rewritten when a type other
++ * than the requested one ends up being used. With @strict_prot set, a
++ * differing type for a non-RAM range is an error (-EINVAL) instead.
++ */
++static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
++                              int strict_prot)
++{
++      int is_ram = 0;
++      int ret;
++      unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
++      unsigned long flags = want_flags;
++
++      is_ram = pat_pagerange_is_ram(paddr, paddr + size);
++
++      /*
++       * reserve_pfn_range() for RAM pages. We do not refcount to keep
++       * track of number of mappings of RAM pages. We can assert that
++       * the type requested matches the type of first page in the range.
++       */
++      /* Any non-zero result is taken here, i.e. also -1 (mixed range). */
++      if (is_ram) {
++              if (!pat_enabled)
++                      return 0;
++
++              flags = lookup_memtype(paddr);
++              if (want_flags != flags) {
++                      printk(KERN_WARNING
++                      "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
++                              current->comm, current->pid,
++                              cattr_name(want_flags),
++                              (unsigned long long)paddr,
++                              (unsigned long long)(paddr + size),
++                              cattr_name(flags));
++                      *vma_prot = __pgprot((pgprot_val(*vma_prot) &
++                                            (~_PAGE_CACHE_MASK)) |
++                                           flags);
++              }
++              return 0;
++      }
++
++      ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
++      if (ret)
++              return ret;
++
++      if (flags != want_flags) {
++              if (strict_prot ||
++                  !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
++                      free_memtype(paddr, paddr + size);
++                      printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
++                              " for %Lx-%Lx, got %s\n",
++                              current->comm, current->pid,
++                              cattr_name(want_flags),
++                              (unsigned long long)paddr,
++                              (unsigned long long)(paddr + size),
++                              cattr_name(flags));
++                      return -EINVAL;
++              }
++              /*
++               * We allow returning different type than the one requested in
++               * non strict case.
++               */
++              *vma_prot = __pgprot((pgprot_val(*vma_prot) &
++                                    (~_PAGE_CACHE_MASK)) |
++                                   flags);
++      }
++
++      /* Undo the reservation if the kernel mapping cannot follow. */
++      if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
++              free_memtype(paddr, paddr + size);
++              return -EINVAL;
++      }
++      return 0;
++}
++
++/*
++ * Internal interface to free a range of physical memory.
++ * Frees non RAM regions only: nothing happens unless
++ * pat_pagerange_is_ram() returns 0 for the range.
++ */
++static void free_pfn_range(u64 paddr, unsigned long size)
++{
++      if (pat_pagerange_is_ram(paddr, paddr + size) != 0)
++              return;
++
++      free_memtype(paddr, paddr + size);
++}
++
++/*
++ * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
++ * copied through copy_page_range().
++ *
++ * If the vma has a linear pfn mapping for the entire range, we get the prot
++ * from pte and reserve the entire vma range with single reserve_pfn_range
++ * call (strict). Any other vma is left alone and 0 is returned.
++ */
++int track_pfn_vma_copy(struct vm_area_struct *vma)
++{
++      resource_size_t paddr;
++      unsigned long prot;
++      pgprot_t pgprot;
++
++      if (!is_linear_pfn_mapping(vma))
++              return 0;
++
++      /*
++       * reserve the whole chunk covered by vma. We need the
++       * starting address and protection from pte.
++       */
++      if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
++              WARN_ON_ONCE(1);
++              return -EINVAL;
++      }
++
++      pgprot = __pgprot(prot);
++      return reserve_pfn_range(paddr, vma->vm_end - vma->vm_start,
++                               &pgprot, 1);
++}
++
++/*
++ * track_pfn_vma_new is called when a _new_ pfn mapping is being established
++ * for physical range indicated by pfn and size.
++ *
++ * prot is passed in as a parameter for the new mapping. If the vma has a
++ * linear pfn mapping for the entire range reserve the entire vma range with
++ * single reserve_pfn_range call (non-strict). Otherwise, with PAT enabled,
++ * the cache bits of *prot are taken from lookup_memtype() for @pfn.
++ */
++int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
++                      unsigned long pfn, unsigned long size)
++{
++      unsigned long flags;
++
++      if (is_linear_pfn_mapping(vma)) {
++              /* reserve the whole chunk starting from vm_pgoff */
++              resource_size_t paddr =
++                      (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
++
++              return reserve_pfn_range(paddr, vma->vm_end - vma->vm_start,
++                                       prot, 0);
++      }
++
++      if (!pat_enabled)
++              return 0;
++
++      /* for vm_insert_pfn and friends, we set prot based on lookup */
++      flags = lookup_memtype(pfn << PAGE_SHIFT);
++      *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
++                       flags);
++
++      return 0;
++}
++
++/*
++ * untrack_pfn_vma is called while unmapping a pfnmap for a region.
++ * untrack can be called for a specific region indicated by pfn and size or
++ * can be for the entire vma (in which case size can be zero).
++ *
++ * Only linear pfn mappings are acted upon, and for those the whole vma
++ * range is freed; @pfn and @size are not consulted.
++ */
++void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
++                      unsigned long size)
++{
++      resource_size_t paddr;
++
++      if (!is_linear_pfn_mapping(vma))
++              return;
++
++      /* free the whole chunk starting from vm_pgoff */
++      paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
++      free_pfn_range(paddr, vma->vm_end - vma->vm_start);
++}
++#endif /* CONFIG_XEN */
++
++/*
++ * Return @prot with _PAGE_CACHE_WC or'ed in when PAT is enabled, and
++ * pgprot_noncached(@prot) otherwise.
++ */
++pgprot_t pgprot_writecombine(pgprot_t prot)
++{
++      if (!pat_enabled)
++              return pgprot_noncached(prot);
++
++      return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
++}
++EXPORT_SYMBOL_GPL(pgprot_writecombine);
++
++#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
++
++/*
++ * Return a freshly allocated copy of the @pos-th tracked memtype, made
++ * under memtype_lock. Returns NULL if the allocation fails or
++ * rbt_memtype_copy_nth_element() reports an error. The caller is
++ * expected to kfree() the copy.
++ */
++static struct memtype *memtype_get_idx(loff_t pos)
++{
++      struct memtype *copy;
++      int ret;
++
++      copy = kzalloc(sizeof(struct memtype), GFP_KERNEL);
++      if (!copy)
++              return NULL;
++
++      spin_lock(&memtype_lock);
++      ret = rbt_memtype_copy_nth_element(copy, pos);
++      spin_unlock(&memtype_lock);
++
++      if (ret) {
++              kfree(copy);
++              return NULL;
++      }
++
++      return copy;
++}
++
++/*
++ * seq_file ->start: at position 0 the list header is printed and the
++ * position advanced to 1; the entry for *@pos is then fetched.
++ */
++static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
++{
++      if (*pos != 0)
++              return memtype_get_idx(*pos);
++
++      ++*pos;
++      seq_printf(seq, "PAT memtype list:\n");
++      return memtype_get_idx(*pos);
++}
++
++/* seq_file ->next: advance *@pos by one and fetch that entry. */
++static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++{
++      return memtype_get_idx(++*pos);
++}
++
++/* seq_file ->stop: intentionally empty, nothing is done here. */
++static void memtype_seq_stop(struct seq_file *seq, void *v)
++{
++}
++
++/*
++ * seq_file ->show: print one entry as "<type> @ 0x<start>-0x<end>" and
++ * free it. @v is the copy handed out by memtype_get_idx().
++ */
++static int memtype_seq_show(struct seq_file *seq, void *v)
++{
++      struct memtype *entry = v;
++
++      seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(entry->type),
++                      entry->start, entry->end);
++      kfree(entry);
++
++      return 0;
++}
++
++/* seq_file iterator callbacks for the memtype list. */
++static const struct seq_operations memtype_seq_ops = {
++      .start = memtype_seq_start,
++      .next  = memtype_seq_next,
++      .stop  = memtype_seq_stop,
++      .show  = memtype_seq_show,
++};
++
++/* ->open: attach memtype_seq_ops to @file via seq_open(); @inode is unused. */
++static int memtype_seq_open(struct inode *inode, struct file *file)
++{
++      return seq_open(file, &memtype_seq_ops);
++}
++
++/* File operations of the "pat_memtype_list" debugfs file. */
++static const struct file_operations memtype_fops = {
++      .open    = memtype_seq_open,
++      .read    = seq_read,
++      .llseek  = seq_lseek,
++      .release = seq_release,
++};
++
++/*
++ * Late initcall: create the owner-readable "pat_memtype_list" file in
++ * arch_debugfs_dir when PAT is enabled. Always returns 0.
++ */
++static int __init pat_memtype_list_init(void)
++{
++      if (!pat_enabled)
++              return 0;
++
++      debugfs_create_file("pat_memtype_list", S_IRUSR,
++                          arch_debugfs_dir, NULL, &memtype_fops);
++      return 0;
++}
++
++late_initcall(pat_memtype_list_init);
++
++#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --cc arch/x86/mm/pat_internal.h

index 77e5ba1,77e5ba1..dbed48d
--- 1/arch/x86/mm/pat_internal.h
--- 2/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@@ -21,6 -21,6 +21,10 @@@ static inline char *cattr_name(unsigne
         case _PAGE_CACHE_UC_MINUS:      return "uncached-minus";
         case _PAGE_CACHE_WB:            return "write-back";
         case _PAGE_CACHE_WC:            return "write-combining";
++#ifdef CONFIG_XEN
++      case _PAGE_CACHE_WP:            return "write-protected";
++      case _PAGE_CACHE_WT:            return "write-through";
++#endif
         default:                        return "broken";
         }
   }
diff --cc arch/x86/mm/pgtable-xen.c

index 0000000,0000000..ab7cae5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/pgtable-xen.c
@@@ -1,0 -1,0 +1,969 @@@
++#include <linux/mm.h>
++#include <linux/gfp.h>
++#include <linux/module.h>
++#include <linux/smp.h>
++#include <xen/features.h>
++#include <asm/pgalloc.h>
++#include <asm/pgtable.h>
++#include <asm/tlb.h>
++#include <asm/fixmap.h>
++#include <asm/hypervisor.h>
++#include <asm/mmu_context.h>
++
++/*
++ * GFP flags used for page-table pages in this file.  Parenthesised so the
++ * flag set stays a single operand when combined with ~, & or other operators.
++ */
++#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
++
++#ifdef CONFIG_HIGHPTE
++#define PGALLOC_USER_GFP __GFP_HIGHMEM
++#else
++#define PGALLOC_USER_GFP 0
++#endif
++
++gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
++
++/*
++ * Allocate one page with PGALLOC_GFP for a kernel pte table and, on success,
++ * hand it to make_lowmem_page_readonly() together with
++ * XENFEAT_writable_page_tables.  Returns the page's address, or NULL when
++ * the allocation fails.  @mm and @address are not used.
++ */
++pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
++{
++      pte_t *table = (pte_t *)__get_free_page(PGALLOC_GFP);
++
++      if (!table)
++              return NULL;
++
++      make_lowmem_page_readonly(table, XENFEAT_writable_page_tables);
++      return table;
++}
++
++static void _pte_free(struct page *page, unsigned int order)
++{
++      BUG_ON(order);
++      __pte_free(page);
++}
++
++/*
++ * Allocate an order-0 page for a user pte table using __userpte_alloc_gfp.
++ * On success the page goes through pgtable_page_ctor(), is marked Foreign
++ * with _pte_free() as its release hook, and has its page count
++ * re-initialised.  Returns the page, or NULL if the allocation fails.
++ * @mm and @address are not used.
++ */
++pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
++{
++      struct page *page = alloc_pages(__userpte_alloc_gfp, 0);
++
++      if (!page)
++              return NULL;
++
++      pgtable_page_ctor(page);
++      SetPageForeign(page, _pte_free);
++      init_page_count(page);
++
++      return page;
++}
++
++/*
++ * Parse the "userpte=" early parameter.  Returns 0 when the value is
++ * accepted and -EINVAL for a missing or unrecognised value.
++ */
++static int __init setup_userpte(char *arg)
++{
++      /*
++       * "userpte=nohigh" disables allocation of user pagetables in
++       * high memory.  It is the only value understood here.
++       */
++      if (!arg || strcmp(arg, "nohigh") != 0)
++              return -EINVAL;
++
++      __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
++      return 0;
++}
++early_param("userpte", setup_userpte);
++
++/*
++ * Release a user pte page.
++ *
++ * A lowmem page that is still flagged Pinned is first remapped with
++ * PAGE_KERNEL through HYPERVISOR_update_va_mapping() (BUG on failure) and
++ * then has the Pinned flag cleared.  A highmem page only has the flag
++ * cleared, and only when CONFIG_HIGHPTE is set; without CONFIG_HIGHPTE a
++ * highmem pte page hits BUG().  Afterwards the Foreign marker is cleared,
++ * the page count is re-initialised, pgtable_page_dtor() runs and the page
++ * is freed.
++ */
++void __pte_free(pgtable_t pte)
++{
++      if (!PageHighMem(pte)) {
++              if (PagePinned(pte)) {
++                      unsigned long pfn = page_to_pfn(pte);
++
++                      if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
++                                                       pfn_pte(pfn,
++                                                               PAGE_KERNEL),
++                                                       0))
++                              BUG();
++                      ClearPagePinned(pte);
++              }
++      } else
++#ifdef CONFIG_HIGHPTE
++              ClearPagePinned(pte);
++#else
++              BUG();
++#endif
++
++      ClearPageForeign(pte);
++      init_page_count(pte);
++      pgtable_page_dtor(pte);
++      __free_page(pte);
++}
++
++void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
++{
++      pgtable_page_dtor(pte);
++      paravirt_release_pte(page_to_pfn(pte));
++      tlb_remove_page(tlb, pte);
++}
++
++#if PAGETABLE_LEVELS > 2
++static void _pmd_free(struct page *page, unsigned int order)
++{
++      BUG_ON(order);
++      __pmd_free(page);
++}
++
++/*
++ * Allocate an order-0 page with PGALLOC_GFP for a pmd table.  On success the
++ * page is marked Foreign with _pmd_free() as its release hook and its page
++ * count is re-initialised.  Returns the page's kernel address, or NULL if the
++ * allocation fails.  @mm and @address are not used.
++ */
++pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
++{
++      struct page *page = alloc_pages(PGALLOC_GFP, 0);
++
++      if (page) {
++              SetPageForeign(page, _pmd_free);
++              init_page_count(page);
++              return page_address(page);
++      }
++
++      return NULL;
++}
++
++/*
++ * Release a pmd page.  If the page is still flagged Pinned it is remapped
++ * with PAGE_KERNEL through HYPERVISOR_update_va_mapping() (BUG on failure)
++ * and the flag is cleared.  Then the Foreign marker is cleared, the page
++ * count is re-initialised and the page is freed.
++ */
++void __pmd_free(pgtable_t pmd)
++{
++      if (PagePinned(pmd)) {
++              unsigned long pfn = page_to_pfn(pmd);
++
++              if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
++                                               pfn_pte(pfn, PAGE_KERNEL),
++                                               0))
++                      BUG();
++              ClearPagePinned(pmd);
++      }
++
++      ClearPageForeign(pmd);
++      init_page_count(pmd);
++      __free_page(pmd);
++}
++
++void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
++{
++      paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
++      tlb_remove_page(tlb, virt_to_page(pmd));
++}
++
++#if PAGETABLE_LEVELS > 3
++void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
++{
++      paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
++      tlb_remove_page(tlb, virt_to_page(pud));
++}
++#endif        /* PAGETABLE_LEVELS > 3 */
++#endif        /* PAGETABLE_LEVELS > 2 */
++
++static void _pin_lock(struct mm_struct *mm, int lock) {
++      if (lock)
++              spin_lock(&mm->page_table_lock);
++#if USE_SPLIT_PTLOCKS
++      /* While mm->page_table_lock protects us against insertions and
++       * removals of higher level page table pages, it doesn't protect
++       * against updates of pte-s. Such updates, however, require the
++       * pte pages to be in consistent state (unpinned+writable or
++       * pinned+readonly). The pinning and attribute changes, however
++       * cannot be done atomically, which is why such updates must be
++       * prevented from happening concurrently.
++       * Note that no pte lock can ever elsewhere be acquired nesting
++       * with an already acquired one in the same mm, or with the mm's
++       * page_table_lock already acquired, as that would break in the
++       * non-split case (where all these are actually resolving to the
++       * one page_table_lock). Thus acquiring all of them here is not
++       * going to result in deadlocks, and the order of acquires
++       * doesn't matter.
++       */
++      {
++              pgd_t *pgd = mm->pgd;
++              unsigned g;
++
++              for (g = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
++                      pud_t *pud;
++                      unsigned u;
++
++                      if (pgd_none(*pgd))
++                              continue;
++                      pud = pud_offset(pgd, 0);
++                      for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
++                              pmd_t *pmd;
++                              unsigned m;
++
++                              if (pud_none(*pud))
++                                      continue;
++                              pmd = pmd_offset(pud, 0);
++                              for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
++                                      spinlock_t *ptl;
++
++                                      if (pmd_none(*pmd))
++                                              continue;
++                                      ptl = pte_lockptr(0, pmd);
++                                      if (lock)
++                                              spin_lock(ptl);
++                                      else
++                                              spin_unlock(ptl);
++                              }
++                      }
++              }
++      }
++#endif
++      if (!lock)
++              spin_unlock(&mm->page_table_lock);
++}
++#define pin_lock(mm) _pin_lock(mm, 1)
++#define pin_unlock(mm) _pin_lock(mm, 0)
++
++#define PIN_BATCH sizeof(void *)
++static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
++
++/*
++ * Helper for pgd_walk(): apply @flags to one page-table page.
++ *
++ * The page's Pinned flag is cleared when @flags contains _PAGE_RW and set
++ * otherwise.  Highmem pages get nothing further.  For any other page an
++ * update_va_mapping entry remapping it with @flags is stored in slot @seq
++ * of @cpu's pb_mcl array; once PIN_BATCH entries have been queued the whole
++ * batch is issued with HYPERVISOR_multicall_check() (BUG on failure) and
++ * the slot counter restarts at 0.
++ *
++ * Returns the next free slot index in the per-CPU batch.
++ */
++static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
++                                           unsigned int cpu, unsigned int seq)
++{
++      unsigned long pfn = page_to_pfn(page);
++
++      if (pgprot_val(flags) & _PAGE_RW)
++              ClearPagePinned(page);
++      else
++              SetPagePinned(page);
++      if (PageHighMem(page))
++              return seq;
++      MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
++                              (unsigned long)__va(pfn << PAGE_SHIFT),
++                              pfn_pte(pfn, flags), 0);
++      if (unlikely(++seq == PIN_BATCH)) {
++              if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
++                                                      PIN_BATCH, NULL)))
++                      BUG();
++              seq = 0;
++      }
++
++      return seq;
++}
++
++static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
++{
++      pgd_t       *pgd = pgd_base;
++      pud_t       *pud;
++      pmd_t       *pmd;
++      int          g,u,m;
++      unsigned int cpu, seq;
++      multicall_entry_t *mcl;
++
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              return;
++
++      cpu = get_cpu();
++
++      /*
++       * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
++       * may not be the 'current' task's pagetables (e.g., current may be
++       * 32-bit, but the pagetables may be for a 64-bit task).
++       * Subtracting 1 from TASK_SIZE_MAX means the loop limit is correct
++       * regardless of whether TASK_SIZE_MAX is a multiple of PGDIR_SIZE.
++       */
++      for (g = 0, seq = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
++              if (pgd_none(*pgd))
++                      continue;
++              pud = pud_offset(pgd, 0);
++              if (PTRS_PER_PUD > 1) /* not folded */
++                      seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
++              for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
++                      if (pud_none(*pud))
++                              continue;
++                      pmd = pmd_offset(pud, 0);
++                      if (PTRS_PER_PMD > 1) /* not folded */
++                              seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
++                      for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
++                              if (pmd_none(*pmd))
++                                      continue;
++                              seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
++                      }
++              }
++      }
++
++#ifdef CONFIG_X86_PAE
++      for (; g < PTRS_PER_PGD; g++, pgd++) {
++              BUG_ON(pgd_none(*pgd));
++              pud = pud_offset(pgd, 0);
++              BUG_ON(pud_none(*pud));
++              pmd = pmd_offset(pud, 0);
++              seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
++      }
++#endif
++
++      mcl = per_cpu(pb_mcl, cpu);
++#ifdef CONFIG_X86_64
++      if (unlikely(seq > PIN_BATCH - 2)) {
++              if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
++                      BUG();
++              seq = 0;
++      }
++      pgd = __user_pgd(pgd_base);
++      BUG_ON(!pgd);
++      MULTI_update_va_mapping(mcl + seq,
++             (unsigned long)pgd,
++             pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, flags),
++             0);
++      MULTI_update_va_mapping(mcl + seq + 1,
++             (unsigned long)pgd_base,
++             pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
++             UVMF_TLB_FLUSH);
++      if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
++              BUG();
++#else
++      if (likely(seq != 0)) {
++              MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
++                      (unsigned long)pgd_base,
++                      pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
++                      UVMF_TLB_FLUSH);
++              if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
++                                                      seq + 1, NULL)))
++                      BUG();
++      } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
++                      pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
++                      UVMF_TLB_FLUSH))
++              BUG();
++#endif
++
++      put_cpu();
++}
++
++void __init xen_init_pgd_pin(void)
++{
++      pgd_t       *pgd = init_mm.pgd;
++      pud_t       *pud;
++      pmd_t       *pmd;
++      unsigned int g, u, m;
++
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              return;
++
++      SetPagePinned(virt_to_page(pgd));
++      for (g = 0; g < PTRS_PER_PGD; g++, pgd++) {
++#ifndef CONFIG_X86_PAE
++              if (g >= pgd_index(HYPERVISOR_VIRT_START)
++                  && g <= pgd_index(HYPERVISOR_VIRT_END - 1))
++                      continue;
++#endif
++              if (!pgd_present(*pgd))
++                      continue;
++              pud = pud_offset(pgd, 0);
++              if (PTRS_PER_PUD > 1) /* not folded */
++                      SetPagePinned(virt_to_page(pud));
++              for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
++                      if (!pud_present(*pud) || pud_large(*pud))
++                              continue;
++                      pmd = pmd_offset(pud, 0);
++                      if (PTRS_PER_PMD > 1) /* not folded */
++                              SetPagePinned(virt_to_page(pmd));
++                      for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
++#ifdef CONFIG_X86_PAE
++                              if (g == pgd_index(HYPERVISOR_VIRT_START)
++                                  && m >= pmd_index(HYPERVISOR_VIRT_START))
++                                      continue;
++#endif
++                              if (!pmd_present(*pmd) || pmd_large(*pmd))
++                                      continue;
++                              SetPagePinned(pmd_page(*pmd));
++                      }
++              }
++      }
++#ifdef CONFIG_X86_64
++      SetPagePinned(virt_to_page(level3_user_pgt));
++#endif
++}
++
++/*
++ * Pin @pgd.  The order matters: pgd_walk() first remaps the page-table
++ * pages with PAGE_KERNEL_RO, kmap_flush_unused() is called, and only then
++ * is xen_pgd_pin() issued and the pgd's own page flagged Pinned.
++ */
++static void __pgd_pin(pgd_t *pgd)
++{
++      pgd_walk(pgd, PAGE_KERNEL_RO);
++      kmap_flush_unused();
++      xen_pgd_pin(pgd);
++      SetPagePinned(virt_to_page(pgd));
++}
++
++/*
++ * Unpin @pgd, mirroring __pgd_pin() in reverse: xen_pgd_unpin() is issued
++ * first, then pgd_walk() remaps the page-table pages with PAGE_KERNEL, and
++ * finally the Pinned flag on the pgd's own page is cleared.
++ */
++static void __pgd_unpin(pgd_t *pgd)
++{
++      xen_pgd_unpin(pgd);
++      pgd_walk(pgd, PAGE_KERNEL);
++      ClearPagePinned(virt_to_page(pgd));
++}
++
++static void pgd_test_and_unpin(pgd_t *pgd)
++{
++      if (PagePinned(virt_to_page(pgd)))
++              __pgd_unpin(pgd);
++}
++
++void mm_pin(struct mm_struct *mm)
++{
++      if (xen_feature(XENFEAT_writable_page_tables))
++              return;
++
++      pin_lock(mm);
++      __pgd_pin(mm->pgd);
++      pin_unlock(mm);
++}
++
++void mm_unpin(struct mm_struct *mm)
++{
++      if (xen_feature(XENFEAT_writable_page_tables))
++              return;
++
++      pin_lock(mm);
++      __pgd_unpin(mm->pgd);
++      pin_unlock(mm);
++}
++
++/*
++ * Pin every pgd on pgd_list that is not pinned yet.  Does nothing when the
++ * hypervisor offers XENFEAT_writable_page_tables.  Must be called with
++ * interrupts disabled (enforced with BUG_ON).
++ */
++void mm_pin_all(void)
++{
++      struct page *pgd_page;
++
++      if (xen_feature(XENFEAT_writable_page_tables))
++              return;
++
++      /*
++       * Allow uninterrupted access to the pgd_list. Also protects
++       * __pgd_pin() by ensuring preemption is disabled.
++       * All other CPUs must be at a safe point (e.g., in stop_machine
++       * or offlined entirely).
++       */
++      BUG_ON(!irqs_disabled());
++      spin_lock(&pgd_lock);
++      list_for_each_entry(pgd_page, &pgd_list, lru) {
++              if (PagePinned(pgd_page))
++                      continue;
++              __pgd_pin((pgd_t *)page_address(pgd_page));
++      }
++      spin_unlock(&pgd_lock);
++}
++
++void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
++{
++      if (!PagePinned(virt_to_page(mm->pgd)))
++              mm_pin(mm);
++}
++
++/*
++ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() *much*
++ * faster this way, as no hypercalls are needed for the page table updates.
++ *
++ * If @tsk is still running on @mm as its active_mm, move it over to init_mm:
++ * a reference on init_mm is taken, switch_mm() is performed, and the
++ * mm_count reference on @mm is dropped (dropping the last one here is a BUG).
++ *
++ * Always ends with task_unlock(@tsk), so the caller must enter with the task
++ * lock held.
++ */
++static void leave_active_mm(struct task_struct *tsk, struct mm_struct *mm)
++      __releases(tsk->alloc_lock)
++{
++      if (tsk->active_mm == mm) {
++              tsk->active_mm = &init_mm;
++              atomic_inc(&init_mm.mm_count);
++
++              switch_mm(mm, &init_mm, tsk);
++
++              if (atomic_dec_and_test(&mm->mm_count))
++                      BUG();
++      }
++
++      task_unlock(tsk);
++}
++
++static void _leave_active_mm(void *mm)
++{
++      struct task_struct *tsk = current;
++
++      if (spin_trylock(&tsk->alloc_lock))
++              leave_active_mm(tsk, mm);
++}
++
++/*
++ * Detach @mm from the CPUs still using it and unpin its page tables if
++ * nothing else holds on to them.
++ *
++ * The current task leaves @mm first, then _leave_active_mm() is run on the
++ * CPUs in mm_cpumask(@mm), waiting for completion.  The pgd is unpinned only
++ * when it is flagged Pinned, mm_count has dropped to 1 and the context has
++ * no foreign mappings.
++ */
++void arch_exit_mmap(struct mm_struct *mm)
++{
++      struct task_struct *tsk = current;
++
++      task_lock(tsk);
++      leave_active_mm(tsk, mm);       /* releases the task lock taken above */
++
++      preempt_disable();
++      smp_call_function_many(mm_cpumask(mm), _leave_active_mm, mm, 1);
++      preempt_enable();
++
++      if (PagePinned(virt_to_page(mm->pgd))
++          && atomic_read(&mm->mm_count) == 1
++          && !mm->context.has_foreign_mappings)
++              mm_unpin(mm);
++}
++
++static inline void pgd_list_add(pgd_t *pgd)
++{
++      struct page *page = virt_to_page(pgd);
++
++      list_add(&page->lru, &pgd_list);
++}
++
++static inline void pgd_list_del(pgd_t *pgd)
++{
++      struct page *page = virt_to_page(pgd);
++
++      list_del(&page->lru);
++}
++
++#define UNSHARED_PTRS_PER_PGD                         \
++      (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
++
++
++static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
++{
++      BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
++      virt_to_page(pgd)->index = (pgoff_t)mm;
++}
++
++struct mm_struct *pgd_page_get_mm(struct page *page)
++{
++      return (struct mm_struct *)page->index;
++}
++
++/*
++ * Initialise a freshly allocated @pgd for @mm.
++ *
++ * The page is unpinned first if it is still flagged Pinned.  Kernel entries
++ * are copied from swapper_pg_dir when the level below is shared, the x86-64
++ * user pgd gets level3_user_pgt installed in its vsyscall slot, and, when
++ * the kernel pmd is not shared, @mm is recorded in the pgd's page and the
++ * page is added to pgd_list.  pgd_alloc() calls this with pgd_lock held.
++ */
++static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
++{
++      pgd_test_and_unpin(pgd);
++
++      /* If the pgd points to a shared pagetable level (either the
++         ptes in non-PAE, or shared PMD in PAE), then just copy the
++         references from swapper_pg_dir. */
++      if (PAGETABLE_LEVELS == 2 ||
++          (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
++          PAGETABLE_LEVELS == 4) {
++              clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
++                              swapper_pg_dir + KERNEL_PGD_BOUNDARY,
++                              KERNEL_PGD_PTRS);
++      }
++
++#ifdef CONFIG_X86_64
++      /* set level3_user_pgt for vsyscall area */
++      __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
++              __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
++#endif
++
++      /* list required to sync kernel mapping updates */
++      if (!SHARED_KERNEL_PMD) {
++              pgd_set_mm(pgd, mm);
++              pgd_list_add(pgd);
++      }
++}
++
++static void pgd_dtor(pgd_t *pgd)
++{
++      if (!SHARED_KERNEL_PMD) {
++              spin_lock(&pgd_lock);
++              pgd_list_del(pgd);
++              spin_unlock(&pgd_lock);
++      }
++
++      pgd_test_and_unpin(pgd);
++}
++
++/*
++ * List of all pgd's needed for non-PAE so it can invalidate entries
++ * in both cached and uncached pgd's; not needed for PAE since the
++ * kernel pmd is shared. If PAE were not to share the pmd a similar
++ * tactic would be needed. This is essentially codepath-based locking
++ * against pageattr.c; it is the unique case in which a valid change
++ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
++ * vmalloc faults work because attached pagetables are never freed.
++ * -- wli
++ */
++
++#ifdef CONFIG_X86_PAE
++/*
++ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
++ * updating the top-level pagetable entries to guarantee the
++ * processor notices the update.  Since this is expensive, and
++ * all 4 top-level entries are used almost immediately in a
++ * new process's life, we just pre-populate them here.
++ *
++ * Also, if we're in a paravirt environment where the kernel pmd is
++ * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
++ * and initialize the kernel pmds here.
++ */
++#define PREALLOCATED_PMDS     UNSHARED_PTRS_PER_PGD
++
++void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
++{
++      /* Note: almost everything apart from _PAGE_PRESENT is
++         reserved at the pmd (PDPT) level. */
++      pud_t pud = __pud(__pa(pmd) | _PAGE_PRESENT);
++
++      paravirt_alloc_pmd(mm, page_to_pfn(virt_to_page(pmd)));
++
++      if (likely(!PagePinned(virt_to_page(pudp)))) {
++              *pudp = pud;
++              return;
++      }
++
++      set_pud(pudp, pud);
++
++      /*
++       * According to Intel App note "TLBs, Paging-Structure Caches,
++       * and Their Invalidation", April 2007, document 317080-001,
++       * section 8.1: in PAE mode we explicitly have to flush the
++       * TLB via cr3 if the top-level pgd is changed...
++       */
++      flush_tlb_mm(mm);
++}
++#else  /* !CONFIG_X86_PAE */
++
++/* No need to prepopulate any pagetable entries in non-PAE modes. */
++#define PREALLOCATED_PMDS     0
++
++#endif        /* CONFIG_X86_PAE */
++
++/*
++ * Free every non-NULL entry of @pmds (PREALLOCATED_PMDS slots).  On PAE
++ * builds, when @contig is true, xen_destroy_contiguous_region() is first
++ * called on @mm's pgd.
++ */
++static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
++{
++      int idx;
++
++#ifdef CONFIG_X86_PAE
++      if (contig)
++              xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
++#endif
++
++      for (idx = 0; idx < PREALLOCATED_PMDS; idx++) {
++              if (!pmds[idx])
++                      continue;
++              pmd_free(mm, pmds[idx]);
++      }
++}
++
++/*
++ * Allocate PREALLOCATED_PMDS pmd pages for @mm into @pmds.
++ *
++ * The loop keeps going after a failed allocation so that every slot of
++ * @pmds ends up either valid or NULL; free_pmds() relies on that when it
++ * undoes a partial allocation.
++ *
++ * Returns 0 on success, or -ENOMEM after freeing whatever was allocated.
++ */
++static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
++{
++      int i;
++      bool failed = false;
++
++      for (i = 0; i < PREALLOCATED_PMDS; i++) {
++              /*
++               * Widen before shifting: with i an int, "i << PUD_SHIFT" is
++               * evaluated in int and can overflow for the upper slots.
++               */
++              pmd_t *pmd = pmd_alloc_one(mm, (unsigned long)i << PUD_SHIFT);
++
++              if (pmd == NULL)
++                      failed = true;
++              pmds[i] = pmd;
++      }
++
++      if (failed) {
++              free_pmds(pmds, mm, false);
++              return -ENOMEM;
++      }
++
++      return 0;
++}
++
++/*
++ * Mop up any pmd pages which may still be attached to the pgd.
++ * Normally they will be freed by munmap/exit_mmap, but any pmd we
++ * preallocate which never got a corresponding vma will need to be
++ * freed manually.
++ *
++ * Each of the first PREALLOCATED_PMDS entries of @pgdp that is non-zero is
++ * cleared, its pmd is released through paravirt_release_pmd() and freed.
++ * On PAE builds without XENFEAT_pae_pgdir_above_4gb the pgd page is also
++ * passed to xen_destroy_contiguous_region().
++ */
++static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
++{
++      int i;
++
++      for(i = 0; i < PREALLOCATED_PMDS; i++) {
++              pgd_t pgd = pgdp[i];
++
++              if (__pgd_val(pgd) != 0) {
++                      pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
++
++                      pgdp[i] = xen_make_pgd(0);
++
++                      paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
++                      pmd_free(mm, pmd);
++              }
++      }
++
++#ifdef CONFIG_X86_PAE
++      if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
++              xen_destroy_contiguous_region((unsigned long)pgdp, 0);
++#endif
++}
++
++/*
++ * Hook the preallocated @pmds into @pgd, one per pud slot, via
++ * pud_populate().  For slots at or above KERNEL_PGD_BOUNDARY the pmd is
++ * first filled with a copy of the pmd that swapper_pg_dir references in
++ * the same slot.  No-op when PREALLOCATED_PMDS is 0.  pgd_alloc() calls
++ * this with pgd_lock held.
++ */
++static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
++{
++      pud_t *pud;
++      unsigned long addr;
++      int i;
++
++      if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
++              return;
++
++      pud = pud_offset(pgd, 0);
++      for (addr = i = 0; i < PREALLOCATED_PMDS;
++           i++, pud++, addr += PUD_SIZE) {
++              pmd_t *pmd = pmds[i];
++
++              if (i >= KERNEL_PGD_BOUNDARY)
++                      memcpy(pmd,
++                             (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
++                             sizeof(pmd_t) * PTRS_PER_PMD);
++
++              /* It is safe to poke machine addresses of pmds under the pgd_lock. */
++              pud_populate(mm, pud, pmd);
++      }
++}
++
++/*
++ * On x86-64, allocate the companion user pgd page for @pgd and remember it
++ * in the page_private field of @pgd's page.  If that allocation fails, @pgd
++ * itself is freed and NULL is returned.  A NULL @pgd is passed straight
++ * through.  On other configurations @pgd is returned untouched.
++ */
++static inline pgd_t *user_pgd_alloc(pgd_t *pgd)
++{
++#ifdef CONFIG_X86_64
++      pgd_t *upgd;
++
++      if (!pgd)
++              return NULL;
++
++      upgd = (void *)__get_free_page(PGALLOC_GFP);
++      if (!upgd) {
++              free_page((unsigned long)pgd);
++              return NULL;
++      }
++
++      set_page_private(virt_to_page(pgd), (unsigned long)upgd);
++#endif
++      return pgd;
++}
++
++static inline void user_pgd_free(pgd_t *pgd)
++{
++#ifdef CONFIG_X86_64
++      free_page(page_private(virt_to_page(pgd)));
++#endif
++}
++
++/*
++ * Allocate and initialise a pgd for @mm.
++ *
++ * Steps: allocate the pgd page (plus the x86-64 user pgd through
++ * user_pgd_alloc()), store it in mm->pgd, preallocate the pmds, call
++ * paravirt_pgd_alloc(), and then, under pgd_lock, run pgd_ctor() and
++ * pgd_prepopulate_pmd().  On PAE builds without
++ * XENFEAT_pae_pgdir_above_4gb the pgd page is additionally passed to
++ * xen_create_contiguous_region() with a 32-bit address width while
++ * pgd_lock is held.
++ *
++ * Returns the new pgd, or NULL on any failure after undoing the
++ * allocations made so far.
++ */
++pgd_t *pgd_alloc(struct mm_struct *mm)
++{
++      pgd_t *pgd;
++      pmd_t *pmds[PREALLOCATED_PMDS];
++
++      pgd = user_pgd_alloc((void *)__get_free_page(PGALLOC_GFP));
++
++      if (pgd == NULL)
++              goto out;
++
++      mm->pgd = pgd;
++
++      if (preallocate_pmds(pmds, mm) != 0)
++              goto out_free_pgd;
++
++      if (paravirt_pgd_alloc(mm) != 0)
++              goto out_free_pmds;
++
++      /*
++       * Make sure that pre-populating the pmds is atomic with
++       * respect to anything walking the pgd_list, so that they
++       * never see a partially populated pgd.
++       */
++      spin_lock(&pgd_lock);
++
++#ifdef CONFIG_X86_PAE
++      /* Protect against save/restore: move below 4GB under pgd_lock. */
++      if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
++          && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
++              spin_unlock(&pgd_lock);
++              goto out_free_pmds;
++      }
++#endif
++
++      pgd_ctor(mm, pgd);
++      pgd_prepopulate_pmd(mm, pgd, pmds);
++
++      spin_unlock(&pgd_lock);
++
++      return pgd;
++
++out_free_pmds:
++      /* NOTE(review): this path is also reached before/without a successful
++         xen_create_contiguous_region(); confirm the destroy call is safe then. */
++      free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
++out_free_pgd:
++      user_pgd_free(pgd);
++      free_page((unsigned long)pgd);
++out:
++      return NULL;
++}
++
++void pgd_free(struct mm_struct *mm, pgd_t *pgd)
++{
++      /*
++       * After this the pgd should not be pinned for the duration of this
++       * function's execution. We should never sleep and thus never race:
++       *  1. User pmds will not become write-protected under our feet due
++       *     to a concurrent mm_pin_all().
++       *  2. The machine addresses in PGD entries will not become invalid
++       *     due to a concurrent save/restore.
++       */
++      pgd_dtor(pgd);
++
++      pgd_mop_up_pmds(mm, pgd);
++      paravirt_pgd_free(mm, pgd);
++      user_pgd_free(pgd);
++      free_page((unsigned long)pgd);
++}
++
++/* blktap and gntdev need this, as otherwise they would implicitly (and
++ * needlessly, as they never use it) reference init_mm. */
++pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
++                                unsigned long addr, pte_t *ptep, int full)
++{
++      return ptep_get_and_clear_full(vma ? vma->vm_mm : &init_mm,
++                                     addr, ptep, full);
++}
++EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
++
++/*
++ * Install @entry at @ptep when it differs from the current pte and @dirty is
++ * set.  For the current task's own mm the update goes through
++ * HYPERVISOR_update_va_mapping() with an INVLPG flush on the mm's CPUs
++ * (BUG on failure); for any other mm the entry is written with
++ * xen_l1_entry_update() followed by flush_tlb_page().
++ *
++ * Returns non-zero when @entry differed from the existing pte.
++ */
++int ptep_set_access_flags(struct vm_area_struct *vma,
++                        unsigned long address, pte_t *ptep,
++                        pte_t entry, int dirty)
++{
++      int changed = !pte_same(*ptep, entry);
++
++      if (!changed || !dirty)
++              return changed;
++
++      if (unlikely(vma->vm_mm != current->mm)) {
++              xen_l1_entry_update(ptep, entry);
++              flush_tlb_page(vma, address);
++              return changed;
++      }
++
++      if (HYPERVISOR_update_va_mapping(address,
++                      entry,
++                      uvm_multi(mm_cpumask(vma->vm_mm))|UVMF_INVLPG))
++              BUG();
++
++      return changed;
++}
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++int pmdp_set_access_flags(struct vm_area_struct *vma,
++                        unsigned long address, pmd_t *pmdp,
++                        pmd_t entry, int dirty)
++{
++      int changed = !pmd_same(*pmdp, entry);
++
++      VM_BUG_ON(address & ~HPAGE_PMD_MASK);
++
++      if (changed && dirty) {
++              *pmdp = entry;
++              pmd_update_defer(vma->vm_mm, address, pmdp);
++              flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
++      }
++
++      return changed;
++}
++#endif
++
++/*
++ * Clear the accessed bit of *@ptep if it is set.  When the bit was actually
++ * cleared, pte_update() is called for the vma's mm.  Returns the previous
++ * state of the bit (0 if the pte was not young).
++ */
++int ptep_test_and_clear_young(struct vm_area_struct *vma,
++                            unsigned long addr, pte_t *ptep)
++{
++      int was_young;
++
++      if (!pte_young(*ptep))
++              return 0;
++
++      was_young = test_and_clear_bit(_PAGE_BIT_ACCESSED,
++                                     (unsigned long *) &ptep->pte);
++      if (was_young)
++              pte_update(vma->vm_mm, addr, ptep);
++
++      return was_young;
++}
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++int pmdp_test_and_clear_young(struct vm_area_struct *vma,
++                            unsigned long addr, pmd_t *pmdp)
++{
++      int ret = 0;
++
++      if (pmd_young(*pmdp))
++              ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
++                                       (unsigned long *)pmdp);
++
++      if (ret)
++              pmd_update(vma->vm_mm, addr, pmdp);
++
++      return ret;
++}
++#endif
++
++/*
++ * Clear the accessed state of *@ptep and report whether it had been set.
++ *
++ * When the mm's pgd is flagged Pinned the "old" pte is installed through
++ * ptep_set_access_flags(), passing the young state as its dirty argument so
++ * that nothing is written unless the pte was young.  Otherwise, and only
++ * if the pte was young, the low word of the pte is stored directly.
++ */
++int ptep_clear_flush_young(struct vm_area_struct *vma,
++                         unsigned long address, pte_t *ptep)
++{
++      pte_t pte = *ptep;
++      int young = pte_young(pte);
++
++      pte = pte_mkold(pte);
++      if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
++              ptep_set_access_flags(vma, address, ptep, pte, young);
++      else if (young)
++              ptep->pte_low = pte.pte_low;    /* NOTE(review): relies on pte_t having a pte_low member in every config - confirm */
++
++      return young;
++}
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++int pmdp_clear_flush_young(struct vm_area_struct *vma,
++                         unsigned long address, pmd_t *pmdp)
++{
++      int young;
++
++      VM_BUG_ON(address & ~HPAGE_PMD_MASK);
++
++      young = pmdp_test_and_clear_young(vma, address, pmdp);
++      if (young)
++              flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
++
++      return young;
++}
++
++void pmdp_splitting_flush(struct vm_area_struct *vma,
++                        unsigned long address, pmd_t *pmdp)
++{
++      int set;
++      VM_BUG_ON(address & ~HPAGE_PMD_MASK);
++      set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
++                              (unsigned long *)pmdp);
++      if (set) {
++              pmd_update(vma->vm_mm, address, pmdp);
++              /* need tlb flush only to serialize against gup-fast */
++              flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
++      }
++}
++#endif
++
++/**
++ * reserve_top_address - reserves a hole in the top of kernel address space
++ * @reserve: size of hole to reserve
++ *
++ * Can be used to relocate the fixmap area and poke a hole in the top
++ * of kernel address space to make room for a hypervisor.
++ *
++ * Only has an effect on CONFIG_X86_32 builds, where it must be called
++ * before any fixmap has been set (enforced with BUG_ON).
++ */
++void __init reserve_top_address(unsigned long reserve)
++{
++#ifdef CONFIG_X86_32
++      BUG_ON(fixmaps_set > 0);
++      /* Print the unsigned long as such rather than through an int cast. */
++      printk(KERN_INFO "Reserving virtual address space above 0x%08lx\n",
++             -reserve);
++      __FIXADDR_TOP = -reserve - PAGE_SIZE;
++#endif
++}
++
++int fixmaps_set;
++
++void xen_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
++{
++      unsigned long address = __fix_to_virt(idx);
++      pte_t pte;
++
++      if (idx >= __end_of_fixed_addresses) {
++              BUG();
++              return;
++      }
++
++      switch (idx) {
++#ifdef CONFIG_X86_64
++      extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
++
++      case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
++              pte = pfn_pte(phys >> PAGE_SHIFT, flags);
++              set_pte_vaddr_pud(level3_user_pgt, address, pte);
++              break;
++      case FIX_EARLYCON_MEM_BASE:
++      case FIX_SHARED_INFO:
++      case FIX_ISAMAP_END ... FIX_ISAMAP_BEGIN:
++              xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
++                                  pfn_pte_ma(phys >> PAGE_SHIFT, flags));
++              fixmaps_set++;
++              return;
++#else
++      case FIX_WP_TEST:
++      case FIX_VDSO:
++              pte = pfn_pte(phys >> PAGE_SHIFT, flags);
++              break;
++#endif
++      default:
++              pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
++              break;
++      }
++      set_pte_vaddr(address, pte);
++      fixmaps_set++;
++}
diff --cc arch/x86/mm/pgtable_32-xen.c

index 0000000,0000000..1adb777

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/mm/pgtable_32-xen.c
@@@ -1,0 -1,0 +1,179 @@@
++#include <linux/sched.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/nmi.h>
++#include <linux/swap.h>
++#include <linux/smp.h>
++#include <linux/highmem.h>
++#include <linux/pagemap.h>
++#include <linux/spinlock.h>
++#include <linux/module.h>
++
++#include <asm/system.h>
++#include <asm/pgtable.h>
++#include <asm/pgalloc.h>
++#include <asm/fixmap.h>
++#include <asm/e820.h>
++#include <asm/tlb.h>
++#include <asm/tlbflush.h>
++#include <asm/io.h>
++#include <asm/mmu_context.h>
++
++#include <xen/features.h>
++#include <asm/hypervisor.h>
++
++unsigned int __VMALLOC_RESERVE = 128 << 20;
++
++/*
++ * Associate a virtual page frame with a given physical page frame
++ * and protection flags for that frame.
++ */
++void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
++{
++#ifndef CONFIG_XEN
++      pgd_t *pgd;
++      pud_t *pud;
++      pmd_t *pmd;
++      pte_t *pte;
++
++      pgd = swapper_pg_dir + pgd_index(vaddr);
++      if (pgd_none(*pgd)) {
++              BUG();
++              return;
++      }
++      pud = pud_offset(pgd, vaddr);
++      if (pud_none(*pud)) {
++              BUG();
++              return;
++      }
++      pmd = pmd_offset(pud, vaddr);
++      if (pmd_none(*pmd)) {
++              BUG();
++              return;
++      }
++      pte = pte_offset_kernel(pmd, vaddr);
++      if (pte_val(pteval))
++              set_pte_at(&init_mm, vaddr, pte, pteval);
++      else
++              pte_clear(&init_mm, vaddr, pte);
++
++      /*
++       * It's enough to flush this one mapping.
++       * (PGE mappings get flushed as well)
++       */
++      __flush_tlb_one(vaddr);
++#else
++      if (HYPERVISOR_update_va_mapping(vaddr, pteval,
++                                       UVMF_INVLPG|UVMF_ALL))
++              BUG();
++#endif
++}
++
++/*
++ * Associate a large virtual page frame with a given physical page frame 
++ * and protection flags for that frame. pfn is for the base of the page,
++ * vaddr is what the page gets mapped to - both must be properly aligned. 
++ * The pmd must already be instantiated. Assumes PAE mode.
++ */ 
++void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
++{
++      pgd_t *pgd;
++      pud_t *pud;
++      pmd_t *pmd;
++
++      if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
++              printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
++              return; /* BUG(); */
++      }
++      if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
++              printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
++              return; /* BUG(); */
++      }
++      pgd = swapper_pg_dir + pgd_index(vaddr);
++      if (pgd_none(*pgd)) {
++              printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
++              return; /* BUG(); */
++      }
++      pud = pud_offset(pgd, vaddr);
++      pmd = pmd_offset(pud, vaddr);
++      set_pmd(pmd, pfn_pmd(pfn, flags));
++      /*
++       * It's enough to flush this one mapping.
++       * (PGE mappings get flushed as well)
++       */
++      __flush_tlb_one(vaddr);
++}
++
++unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
++unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
++EXPORT_SYMBOL(__FIXADDR_TOP);
++
++/*
++ * vmalloc=size forces the vmalloc area to be exactly 'size'
++ * bytes. This can be used to increase (or decrease) the
++ * vmalloc area - the default is 128m.
++ */
++static int __init parse_vmalloc(char *arg)
++{
++      if (!arg)
++              return -EINVAL;
++
++      /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
++      __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
++      return 0;
++}
++early_param("vmalloc", parse_vmalloc);
++
++#ifndef CONFIG_XEN
++/*
++ * reservetop=size reserves a hole at the top of the kernel address space which
++ * a hypervisor can load into later.  Needed for dynamically loaded hypervisors,
++ * so relocating the fixmap can be done before paging initialization.
++ */
++static int __init parse_reservetop(char *arg)
++{
++      unsigned long address;
++
++      if (!arg)
++              return -EINVAL;
++
++      address = memparse(arg, &arg);
++      reserve_top_address(address);
++      fixup_early_ioremap();
++      return 0;
++}
++early_param("reservetop", parse_reservetop);
++#endif
++
++/*
++ * Write-protect the kernel mapping of @va.
++ *
++ * Nothing is done when xen_feature(@feature) is set.  Otherwise @va must be
++ * mapped by a present 4K PTE (anything else is a BUG); that PTE is rewritten
++ * write-protected through HYPERVISOR_update_va_mapping(), and a failing
++ * hypercall is a BUG as well.
++ */
++void make_lowmem_page_readonly(void *va, unsigned int feature)
++{
++      pte_t *pte;
++      unsigned int level;
++      int rc;
++
++      if (xen_feature(feature))
++              return;
++
++      pte = lookup_address((unsigned long)va, &level);
++      BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
++      /* Flags argument is 0: no UVMF_* flush is requested on this path. */
++      rc = HYPERVISOR_update_va_mapping(
++              (unsigned long)va, pte_wrprotect(*pte), 0);
++      BUG_ON(rc);
++}
++
++/*
++ * Make the kernel mapping of @va writable again.
++ *
++ * Nothing is done when xen_feature(@feature) is set.  Otherwise @va must be
++ * mapped by a present 4K PTE (anything else is a BUG); that PTE is rewritten
++ * with write permission through HYPERVISOR_update_va_mapping(), and a
++ * failing hypercall is a BUG as well.
++ */
++void make_lowmem_page_writable(void *va, unsigned int feature)
++{
++      pte_t *pte;
++      unsigned int level;
++      int rc;
++
++      if (xen_feature(feature))
++              return;
++
++      pte = lookup_address((unsigned long)va, &level);
++      BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte));
++      /* Unlike the read-only variant, this update passes UVMF_INVLPG. */
++      rc = HYPERVISOR_update_va_mapping(
++              (unsigned long)va, pte_mkwrite(*pte), UVMF_INVLPG);
++      BUG_ON(rc);
++}
diff --cc arch/x86/mm/physaddr.c

index d2e2735,d2e2735..fdb369a
--- 1/arch/x86/mm/physaddr.c
--- 2/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@@ -8,6 -8,6 +8,10 @@@
   
   #ifdef CONFIG_X86_64
   
++#ifdef CONFIG_XEN
++#define phys_base 0
++#endif
++
   unsigned long __phys_addr(unsigned long x)
   {
         if (x >= __START_KERNEL_map) {
diff --cc arch/x86/mm/srat_64.c

index 0000000,8e9d339..d981ad8

mode 000000,100644..100644
--- /dev/null
--- 2/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@@ -1,0 -1,260 +1,264 @@@
+ /*
+  * ACPI 3.0 based NUMA setup
+  * Copyright 2004 Andi Kleen, SuSE Labs.
+  *
+  * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+  *
+  * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+  * Assumes all memory regions belonging to a single proximity domain
+  * are in one chunk. Holes between them will be included in the node.
+  */
+ 
+ #include <linux/kernel.h>
+ #include <linux/acpi.h>
+ #include <linux/mmzone.h>
+ #include <linux/bitmap.h>
+ #include <linux/module.h>
+ #include <linux/topology.h>
+ #include <linux/bootmem.h>
+ #include <linux/memblock.h>
+ #include <linux/mm.h>
+ #include <asm/proto.h>
+ #include <asm/numa.h>
+ #include <asm/e820.h>
+ #include <asm/apic.h>
+ #include <asm/uv/uv.h>
+ 
+ int acpi_numa __initdata;
+ 
+ static struct bootnode nodes_add[MAX_NUMNODES];
+ 
+ static __init int setup_node(int pxm)
+ {
+       return acpi_map_pxm_to_node(pxm);
+ }
+ 
+ static __init void bad_srat(void)
+ {
+       printk(KERN_ERR "SRAT: SRAT not used.\n");
+       acpi_numa = -1;
+       memset(nodes_add, 0, sizeof(nodes_add));
+ }
+ 
+ static __init inline int srat_disabled(void)
+ {
+       return acpi_numa < 0;
+ }
+ 
+ /* Callback for SLIT parsing */
+ void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+ {
+       int i, j;
+ 
+       for (i = 0; i < slit->locality_count; i++)
+               for (j = 0; j < slit->locality_count; j++)
+                       numa_set_distance(pxm_to_node(i), pxm_to_node(j),
+                               slit->entry[slit->locality_count * i + j]);
+ }
+ 
+ /* Callback for Proximity Domain -> x2APIC mapping */
+ void __init
+ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
+ {
+       int pxm, node;
+       int apic_id;
+ 
+       if (srat_disabled())
+               return;
+       if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
+               bad_srat();
+               return;
+       }
+       if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+               return;
+       pxm = pa->proximity_domain;
+       node = setup_node(pxm);
+       if (node < 0) {
+               printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+               bad_srat();
+               return;
+       }
+ 
+       apic_id = pa->apic_id;
+       if (apic_id >= MAX_LOCAL_APIC) {
+               printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+               return;
+       }
+       set_apicid_to_node(apic_id, node);
+       node_set(node, numa_nodes_parsed);
+       acpi_numa = 1;
+       printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
+              pxm, apic_id, node);
+ }
+ 
+ /* Callback for Proximity Domain -> LAPIC mapping */
+ void __init
+ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
+ {
+       int pxm, node;
+       int apic_id;
+ 
+       if (srat_disabled())
+               return;
+       if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
+               bad_srat();
+               return;
+       }
+       if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+               return;
+       pxm = pa->proximity_domain_lo;
++      if (acpi_srat_revision >= 2)
++              pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
+       node = setup_node(pxm);
+       if (node < 0) {
+               printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+               bad_srat();
+               return;
+       }
+ 
+       if (get_uv_system_type() >= UV_X2APIC)
+               apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
+       else
+               apic_id = pa->apic_id;
+ 
+       if (apic_id >= MAX_LOCAL_APIC) {
+               printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+               return;
+       }
+ 
+       set_apicid_to_node(apic_id, node);
+       node_set(node, numa_nodes_parsed);
+       acpi_numa = 1;
+       printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
+              pxm, apic_id, node);
+ }
+ 
+ #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+ static inline int save_add_info(void) {return 1;}
+ #else
+ static inline int save_add_info(void) {return 0;}
+ #endif
+ /*
+  * Update nodes_add[]
+  * This code supports one contiguous hot add area per node
+  */
+ static void __init
+ update_nodes_add(int node, unsigned long start, unsigned long end)
+ {
+       unsigned long s_pfn = start >> PAGE_SHIFT;
+       unsigned long e_pfn = end >> PAGE_SHIFT;
+       int changed = 0;
+       struct bootnode *nd = &nodes_add[node];
+ 
+       /* I had some trouble with strange memory hotadd regions breaking
+          the boot. Be very strict here and reject anything unexpected.
+          If you want working memory hotadd write correct SRATs.
+ 
+          The node size check is a basic sanity check to guard against
+          mistakes */
+       if ((signed long)(end - start) < NODE_MIN_SIZE) {
+               printk(KERN_ERR "SRAT: Hotplug area too small\n");
+               return;
+       }
+ 
+       /* This check might be a bit too strict, but I'm keeping it for now. */
+       if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
+               printk(KERN_ERR
+                       "SRAT: Hotplug area %lu -> %lu has existing memory\n",
+                       s_pfn, e_pfn);
+               return;
+       }
+ 
+       /* Looks good */
+ 
+       if (nd->start == nd->end) {
+               nd->start = start;
+               nd->end = end;
+               changed = 1;
+       } else {
+               if (nd->start == end) {
+                       nd->start = start;
+                       changed = 1;
+               }
+               if (nd->end == start) {
+                       nd->end = end;
+                       changed = 1;
+               }
+               if (!changed)
+                       printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
+       }
+ 
+       if (changed) {
+               node_set(node, numa_nodes_parsed);
+               printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
+                                nd->start, nd->end);
+       }
+ }
+ 
+ /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
+ void __init
+ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
+ {
+       unsigned long start, end;
+       int node, pxm;
+ 
+       if (srat_disabled())
+               return;
+       if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
+               bad_srat();
+               return;
+       }
+       if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
+               return;
+ 
+       if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+               return;
+       start = ma->base_address;
+       end = start + ma->length;
+       pxm = ma->proximity_domain;
++      if (acpi_srat_revision <= 1)
++              pxm &= 0xff;
+       node = setup_node(pxm);
+       if (node < 0) {
+               printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+               bad_srat();
+               return;
+       }
+ 
+       if (numa_add_memblk(node, start, end) < 0) {
+               bad_srat();
+               return;
+       }
+ 
+       printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
+              start, end);
+ 
+       if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
+               update_nodes_add(node, start, end);
+ }
+ 
+ void __init acpi_numa_arch_fixup(void) {}
+ 
+ int __init x86_acpi_numa_init(void)
+ {
+       int ret;
+ 
+       ret = acpi_numa_init();
+       if (ret < 0)
+               return ret;
+       return srat_disabled() ? -EINVAL : 0;
+ }
+ 
+ #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
++/*
++ * Return the node whose recorded hot-add range in nodes_add[] contains the
++ * physical address @start (range is start-inclusive, end-exclusive).
++ * If more than one range matches, the last one visited by for_each_node()
++ * wins; if none matches, node 0 is returned.
++ */
+ int memory_add_physaddr_to_nid(u64 start)
+ {
+       int i, ret = 0;
+ 
+       for_each_node(i)
+               if (nodes_add[i].start <= start && nodes_add[i].end > start)
+                       ret = i;
+ 
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+ #endif
diff --cc arch/x86/oprofile/Makefile

index 446902b,446902b..8cb639d
--- 1/arch/x86/oprofile/Makefile
--- 2/arch/x86/oprofile/Makefile
+++ b/arch/x86/oprofile/Makefile
@@@ -6,7 -6,7 +6,14 @@@ DRIVER_OBJS = $(addprefix ../../../driv
                 oprofilefs.o oprofile_stats.o  \
                 timer_int.o )
   
++ifdef CONFIG_XEN
++XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \
++                       xenoprofile.o)
++oprofile-y                            := $(DRIVER_OBJS) \
++                                         $(XENOPROF_COMMON_OBJS) xenoprof.o
++else
   oprofile-y                            := $(DRIVER_OBJS) init.o backtrace.o
   oprofile-$(CONFIG_X86_LOCAL_APIC)     += nmi_int.o op_model_amd.o \
                                            op_model_ppro.o op_model_p4.o
   oprofile-$(CONFIG_X86_IO_APIC)                += nmi_timer_int.o
++endif
diff --cc arch/x86/oprofile/xenoprof.c

index 0000000,0000000..d5ba13f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/oprofile/xenoprof.c
@@@ -1,0 -1,0 +1,179 @@@
++/**
++ * @file xenoprof.c
++ *
++ * @remark Copyright 2002 OProfile authors
++ * @remark Read the file COPYING
++ *
++ * @author John Levon <levon@movementarian.org>
++ *
++ * Modified by Aravind Menon and Jose Renato Santos for Xen
++ * These modifications are:
++ * Copyright (C) 2005 Hewlett-Packard Co.
++ *
++ * x86-specific part
++ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
++ *                    VA Linux Systems Japan K.K.
++ */
++
++#include <linux/init.h>
++#include <linux/oprofile.h>
++#include <linux/sched.h>
++#include <linux/vmalloc.h>
++#include <asm/pgtable.h>
++
++#include <xen/interface/xen.h>
++#include <xen/interface/xenoprof.h>
++#include <xen/xenoprof.h>
++#include "op_counter.h"
++
++static unsigned int num_events = 0;
++
++void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
++{
++      num_events = init->num_events;
++      /* just in case - make sure we do not overflow event list 
++         (i.e. counter_config list) */
++      if (num_events > OP_MAX_COUNTER) {
++              num_events = OP_MAX_COUNTER;
++              init->num_events = num_events;
++      }
++}
++
++/*
++ * Hand every configured counter (indices 0 .. num_events - 1 of
++ * counter_config[]) to the hypervisor with XENOPROF_counter.
++ * A failing hypercall only raises WARN_ON(); the remaining counters are
++ * still submitted.
++ */
++void xenoprof_arch_counter(void)
++{
++      unsigned int i;
++
++      for (i = 0; i < num_events; i++) {
++              /*
++               * Start from an all-zero request so that any member not
++               * assigned below is never passed on as stale stack data.
++               */
++              struct xenoprof_counter counter = { 0 };
++
++              counter.ind       = i;
++              counter.count     = (uint64_t)counter_config[i].count;
++              counter.enabled   = (uint32_t)counter_config[i].enabled;
++              counter.event     = (uint32_t)counter_config[i].event;
++              counter.kernel    = (uint32_t)counter_config[i].kernel;
++              counter.user      = (uint32_t)counter_config[i].user;
++              counter.unit_mask = (uint64_t)counter_config[i].unit_mask;
++              WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_counter,
++                                             &counter));
++      }
++}
++
++void xenoprof_arch_start(void) 
++{
++      /* nothing */
++}
++
++void xenoprof_arch_stop(void)
++{
++      /* nothing */
++}
++
++void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf)
++{
++      if (sbuf->buffer) {
++              vunmap(sbuf->buffer);
++              sbuf->buffer = NULL;
++      }
++}
++
++/*
++ * Ask the hypervisor for the profiling buffer (XENOPROF_get_buffer) and map
++ * it into a freshly allocated kernel VM area.
++ *
++ * On success sbuf->buffer points at the mapping and 0 is returned.  On any
++ * failure sbuf->buffer is left NULL and the hypercall/remap error, or
++ * -ENOMEM when no VM area could be allocated, is returned.
++ */
++int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer,
++                                  struct xenoprof_shared_buffer * sbuf)
++{
++      struct vm_struct *vm;
++      int rc, nr_pages;
++
++      sbuf->buffer = NULL;
++
++      rc = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer);
++      if (rc)
++              return rc;
++
++      /* Total size of all buffers, rounded up to whole pages. */
++      nr_pages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1;
++
++      vm = alloc_vm_area(nr_pages * PAGE_SIZE);
++      if (!vm)
++              return -ENOMEM;
++
++      rc = direct_kernel_remap_pfn_range((unsigned long)vm->addr,
++                                         get_buffer->buf_gmaddr >> PAGE_SHIFT,
++                                         nr_pages * PAGE_SIZE,
++                                         __pgprot(_KERNPG_TABLE),
++                                         DOMID_SELF);
++      if (rc) {
++              /* Remap failed: give the VM area back before bailing out. */
++              vunmap(vm->addr);
++              return rc;
++      }
++
++      sbuf->buffer = vm->addr;
++      return 0;
++}
++
++int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain,
++                            struct xenoprof_shared_buffer * sbuf)
++{
++      int ret;
++      int npages;
++      struct vm_struct *area;
++      pgprot_t prot = __pgprot(_KERNPG_TABLE);
++
++      sbuf->buffer = NULL;
++      ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
++      if (ret)
++              goto out;
++
++      npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1;
++
++      area = alloc_vm_area(npages * PAGE_SIZE);
++      if (area == NULL) {
++              ret = -ENOMEM;
++              goto out;
++      }
++
++      ret = direct_kernel_remap_pfn_range(
++              (unsigned long)area->addr,
++              pdomain->buf_gmaddr >> PAGE_SHIFT,
++              npages * PAGE_SIZE, prot, DOMID_SELF);
++      if (ret) {
++              vunmap(area->addr);
++              goto out;
++      }
++      sbuf->buffer = area->addr;
++
++out:
++      return ret;
++}
++
++struct op_counter_config counter_config[OP_MAX_COUNTER];
++
++/*
++ * Create one oprofilefs directory per event counter under @root, named
++ * after the counter's index, and expose the counter_config[] members for
++ * that index as ulong files inside it.  Always returns 0.
++ */
++int xenoprof_create_files(struct super_block * sb, struct dentry * root)
++{
++      unsigned int i;
++
++      for (i = 0; i < num_events; ++i) {
++              struct dentry * dir;
++              /*
++               * Room for a multi-digit index plus the terminating NUL.
++               * A two-byte buffer truncates "10", "11", ... to "1", so
++               * those counters would collide with directory "1".
++               */
++              char buf[4];
++
++              snprintf(buf, sizeof(buf), "%u", i);
++              dir = oprofilefs_mkdir(sb, root, buf);
++              oprofilefs_create_ulong(sb, dir, "enabled",
++                                      &counter_config[i].enabled);
++              oprofilefs_create_ulong(sb, dir, "event",
++                                      &counter_config[i].event);
++              oprofilefs_create_ulong(sb, dir, "count",
++                                      &counter_config[i].count);
++              oprofilefs_create_ulong(sb, dir, "unit_mask",
++                                      &counter_config[i].unit_mask);
++              oprofilefs_create_ulong(sb, dir, "kernel",
++                                      &counter_config[i].kernel);
++              oprofilefs_create_ulong(sb, dir, "user",
++                                      &counter_config[i].user);
++      }
++
++      return 0;
++}
++
++int __init oprofile_arch_init(struct oprofile_operations * ops)
++{
++      return xenoprofile_init(ops);
++}
++
++void oprofile_arch_exit(void)
++{
++      xenoprofile_exit();
++}
diff --cc arch/x86/pci/Makefile

index 6b8759f,6b8759f..55c8636
--- 1/arch/x86/pci/Makefile
--- 2/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@@ -5,6 -5,6 +5,9 @@@ obj-$(CONFIG_PCI_MMCONFIG)       += mmconfig_
   obj-$(CONFIG_PCI_DIRECT)      += direct.o
   obj-$(CONFIG_PCI_OLPC)                += olpc.o
   obj-$(CONFIG_PCI_XEN)         += xen.o
++# pcifront should be after mmconfig.o and direct.o as it should only
++# take over if direct access to the PCI bus is unavailable
++obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront.o
   
   obj-y                         += fixup.o
   obj-$(CONFIG_X86_INTEL_CE)      += ce4100.o
diff --cc arch/x86/pci/amd_bus.c

index 026e493,026e493..20d567c
--- 1/arch/x86/pci/amd_bus.c
--- 2/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@@ -350,6 -350,6 +350,7 @@@ static int __init early_fill_mp_bus_inf
   
   #define ENABLE_CF8_EXT_CFG      (1ULL << 46)
   
++#ifndef CONFIG_XEN
   static void __cpuinit enable_pci_io_ecs(void *unused)
   {
         u64 reg;
@@@ -378,6 -378,6 +379,7 @@@ static int __cpuinit amd_cpu_notify(str
   static struct notifier_block __cpuinitdata amd_cpu_notifier = {
         .notifier_call  = amd_cpu_notify,
   };
++#endif /* CONFIG_XEN */
   
   static void __init pci_enable_pci_io_ecs(void)
   {
@@@ -419,10 -419,10 +421,19 @@@ static int __init pci_io_ecs_init(void
         if (early_pci_allowed())
                 pci_enable_pci_io_ecs();
   
++#ifndef CONFIG_XEN
         register_cpu_notifier(&amd_cpu_notifier);
         for_each_online_cpu(cpu)
                 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
                                (void *)(long)cpu);
++#else
++      if (cpu = 1, cpu) {
++              u64 reg;
++              rdmsrl(MSR_AMD64_NB_CFG, reg);
++              if (!(reg & ENABLE_CF8_EXT_CFG))
++                      return 0;
++      }
++#endif
         pci_probe |= PCI_HAS_IO_ECS;
   
         return 0;
@@@ -430,6 -430,6 +441,10 @@@
   
   static int __init amd_postcore_init(void)
   {
++#ifdef CONFIG_XEN
++      if (!is_initial_xendomain())
++              return 0;
++#endif
         if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
                 return 0;
   
diff --cc arch/x86/pci/i386.c

index 494f2e7,494f2e7..b988bcc
--- 1/arch/x86/pci/i386.c
--- 2/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@@ -239,12 -239,12 +239,14 @@@ void __init pcibios_resource_survey(voi
         pcibios_allocate_resources(1);
   
         e820_reserve_resources_late();
++#ifndef CONFIG_XEN
         /*
          * Insert the IO APIC resources after PCI initialization has
          * occurred to handle IO APICS that are mapped in on a BAR in
          * PCI space, but before trying to assign unassigned pci res.
          */
         ioapic_insert_resources();
++#endif
   }
   
   /**
diff --cc arch/x86/pci/irq-xen.c

index 0000000,0000000..3109682

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/pci/irq-xen.c
@@@ -1,0 -1,0 +1,1264 @@@
++/*
++ *    Low-Level PCI Support for PC -- Routing of Interrupts
++ *
++ *    (c) 1999--2000 Martin Mares <mj@ucw.cz>
++ */
++
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include <linux/init.h>
++#include <linux/interrupt.h>
++#include <linux/dmi.h>
++#include <linux/io.h>
++#include <linux/smp.h>
++#include <asm/io_apic.h>
++#include <linux/irq.h>
++#include <linux/acpi.h>
++#include <asm/pci_x86.h>
++
++#define PIRQ_SIGNATURE        (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
++#define PIRQ_VERSION 0x0100
++
++static int broken_hp_bios_irq9;
++static int acer_tm360_irqrouting;
++
++static struct irq_routing_table *pirq_table;
++
++static int pirq_enable_irq(struct pci_dev *dev);
++
++/*
++ * Never use: 0, 1, 2 (timer, keyboard, and cascade)
++ * Avoid using: 13, 14 and 15 (FP error and IDE).
++ * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
++ */
++unsigned int pcibios_irq_mask = 0xfff8;
++
++static int pirq_penalty[16] = {
++      1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
++      0, 0, 0, 0, 1000, 100000, 100000, 100000
++};
++
++struct irq_router {
++      char *name;
++      u16 vendor, device;
++      int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
++      int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
++              int new);
++};
++
++struct irq_router_handler {
++      u16 vendor;
++      int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
++};
++
++int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;
++void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
++
++/*
++ *  Check passed address for the PCI IRQ Routing Table signature
++ *  and perform checksum verification.
++ */
++
++static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
++{
++      struct irq_routing_table *rt;
++      int i;
++      u8 sum;
++
++      rt = (struct irq_routing_table *) addr;
++      if (rt->signature != PIRQ_SIGNATURE ||
++          rt->version != PIRQ_VERSION ||
++          rt->size % 16 ||
++          rt->size < sizeof(struct irq_routing_table))
++              return NULL;
++      sum = 0;
++      for (i = 0; i < rt->size; i++)
++              sum += addr[i];
++      if (!sum) {
++              DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
++                      rt);
++              return rt;
++      }
++      return NULL;
++}
++
++
++
++/*
++ *  Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
++ */
++
++static struct irq_routing_table * __init pirq_find_routing_table(void)
++{
++      u8 *addr;
++      struct irq_routing_table *rt;
++
++#ifdef CONFIG_XEN
++      if (!is_initial_xendomain())
++              return NULL;
++#endif
++      if (pirq_table_addr) {
++              rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
++              if (rt)
++                      return rt;
++              printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
++      }
++      for (addr = (u8 *) isa_bus_to_virt(0xf0000);
++           addr < (u8 *) isa_bus_to_virt(0x100000);
++           addr += 16) {
++              rt = pirq_check_routing_table(addr);
++              if (rt)
++                      return rt;
++      }
++      return NULL;
++}
++
++/*
++ *  If we have an IRQ routing table, use it to search for peer host
++ *  bridges.  It's a gross hack, but since there is no other known
++ *  way to get a list of buses, we have to go this way.
++ */
++
++static void __init pirq_peer_trick(void)
++{
++      struct irq_routing_table *rt = pirq_table;
++      u8 busmap[256];
++      int i;
++      struct irq_info *e;
++
++      memset(busmap, 0, sizeof(busmap));
++      for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
++              e = &rt->slots[i];
++#ifdef DEBUG
++              {
++                      int j;
++                      DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
++                      for (j = 0; j < 4; j++)
++                              DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
++                      DBG("\n");
++              }
++#endif
++              busmap[e->bus] = 1;
++      }
++      for (i = 1; i < 256; i++) {
++              int node;
++              if (!busmap[i] || pci_find_bus(0, i))
++                      continue;
++              node = get_mp_bus_to_node(i);
++              if (pci_scan_bus_on_node(i, &pci_root_ops, node))
++                      printk(KERN_INFO "PCI: Discovered primary peer "
++                             "bus %02x [IRQ]\n", i);
++      }
++      pcibios_last_bus = -1;
++}
++
++/*
++ *  Code for querying and setting of IRQ routes on various interrupt routers.
++ */
++
++void eisa_set_level_irq(unsigned int irq)
++{
++      unsigned char mask = 1 << (irq & 7);
++      unsigned int port = 0x4d0 + (irq >> 3);
++      unsigned char val;
++      static u16 eisa_irq_mask;
++
++      if (irq >= 16 || (1 << irq) & eisa_irq_mask)
++              return;
++
++      eisa_irq_mask |= (1 << irq);
++      printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
++      val = inb(port);
++      if (!(val & mask)) {
++              DBG(KERN_DEBUG " -> edge");
++              outb(val | mask, port);
++      }
++}
++
++/*
++ * Common IRQ routing practice: nibbles in config space,
++ * offset by some magic constant.
++ */
++static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
++{
++      u8 x;
++      unsigned reg = offset + (nr >> 1);
++
++      pci_read_config_byte(router, reg, &x);
++      return (nr & 1) ? (x >> 4) : (x & 0xf);
++}
++
++static void write_config_nybble(struct pci_dev *router, unsigned offset,
++      unsigned nr, unsigned int val)
++{
++      u8 x;
++      unsigned reg = offset + (nr >> 1);
++
++      pci_read_config_byte(router, reg, &x);
++      x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
++      pci_write_config_byte(router, reg, x);
++}
++
++/*
++ * ALI pirq entries are damn ugly, and completely undocumented.
++ * This has been figured out from pirq tables, and it's not a pretty
++ * picture.
++ */
++static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
++
++      WARN_ON_ONCE(pirq > 16);
++      return irqmap[read_config_nybble(router, 0x48, pirq-1)];
++}
++
++static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
++      unsigned int val = irqmap[irq];
++
++      WARN_ON_ONCE(pirq > 16);
++      if (val) {
++              write_config_nybble(router, 0x48, pirq-1, val);
++              return 1;
++      }
++      return 0;
++}
++
++/*
++ * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
++ * just a pointer to the config space.
++ */
++static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      u8 x;
++
++      pci_read_config_byte(router, pirq, &x);
++      return (x < 16) ? x : 0;
++}
++
++static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      pci_write_config_byte(router, pirq, irq);
++      return 1;
++}
++
++/*
++ * The VIA pirq rules are nibble-based, like ALI,
++ * but without the ugly irq number munging.
++ * However, PIRQD is in the upper instead of lower 4 bits.
++ */
++static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
++}
++
++static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
++      return 1;
++}
++
++/*
++ * The VIA pirq rules are nibble-based, like ALI,
++ * but without the ugly irq number munging.
++ * However, for 82C586, nibble map is different .
++ */
++static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      /* Nibble number (relative to offset 0x55) for links 1..5 */
++      static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
++      unsigned int nybble;
++
++      WARN_ON_ONCE(pirq > 5);
++      nybble = pirqmap[pirq-1];
++      return read_config_nybble(router, 0x55, nybble);
++}
++
++static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      /* Nibble number (relative to offset 0x55) for links 1..5 */
++      static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
++      unsigned int nybble;
++
++      WARN_ON_ONCE(pirq > 5);
++      nybble = pirqmap[pirq-1];
++      write_config_nybble(router, 0x55, nybble, irq);
++      return 1;
++}
++
++/*
++ * ITE 8330G pirq rules are nibble-based
++ * FIXME: pirqmap may be { 1, 0, 3, 2 },
++ *      2+3 are both mapped to irq 9 on my system
++ */
++static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      /* Nibble number (relative to offset 0x43) for links 1..4 */
++      static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
++      unsigned char nybble;
++
++      WARN_ON_ONCE(pirq > 4);
++      nybble = pirqmap[pirq-1];
++      return read_config_nybble(router, 0x43, nybble);
++}
++
++static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      /* Nibble number (relative to offset 0x43) for links 1..4 */
++      static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
++      unsigned char nybble;
++
++      WARN_ON_ONCE(pirq > 4);
++      nybble = pirqmap[pirq-1];
++      write_config_nybble(router, 0x43, nybble, irq);
++      return 1;
++}
++
++/*
++ * OPTI: high four bits are nibble pointer..
++ * I wonder what the low bits do?
++ */
++static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      /* Only the bits above the low nibble of the link select the nibble */
++      int nybble = pirq >> 4;
++
++      return read_config_nybble(router, 0xb8, nybble);
++}
++
++static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      /* Only the bits above the low nibble of the link select the nibble */
++      int nybble = pirq >> 4;
++
++      write_config_nybble(router, 0xb8, nybble, irq);
++      return 1;
++}
++
++/*
++ * Cyrix: nibble offset 0x5C
++ * 0x5C bits 7:4 is INTB bits 3:0 is INTA
++ * 0x5D bits 7:4 is INTD bits 3:0 is INTC
++ */
++static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      /* Zero-based link with its low bit flipped gives the nibble number */
++      int nybble = (pirq - 1) ^ 1;
++
++      return read_config_nybble(router, 0x5C, nybble);
++}
++
++static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      /* Zero-based link with its low bit flipped gives the nibble number */
++      int nybble = (pirq - 1) ^ 1;
++
++      write_config_nybble(router, 0x5C, nybble, irq);
++      return 1;
++}
++
++/*
++ *    PIRQ routing for SiS 85C503 router used in several SiS chipsets.
++ *    We have to deal with the following issues here:
++ *    - vendors have different ideas about the meaning of link values
++ *    - some onboard devices (integrated in the chipset) have special
++ *      links and are thus routed differently (i.e. not via PCI INTA-INTD)
++ *    - different revisions of the router have a different layout for
++ *      the routing registers, particularly for the onchip devices
++ *
++ *    For all routing registers the common thing is we have one byte
++ *    per routeable link which is defined as:
++ *             bit 7      IRQ mapping enabled (0) or disabled (1)
++ *             bits [6:4] reserved (sometimes used for onchip devices)
++ *             bits [3:0] IRQ to map to
++ *                 allowed: 3-7, 9-12, 14-15
++ *                 reserved: 0, 1, 2, 8, 13
++ *
++ *    The config-space registers located at 0x41/0x42/0x43/0x44 are
++ *    always used to route the normal PCI INT A/B/C/D respectively.
++ *    Apparently there are systems implementing PCI routing table using
++ *    link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
++ *    We try our best to handle both link mappings.
++ *
++ *    Currently (2003-05-21) it appears most SiS chipsets follow the
++ *    definition of routing registers from the SiS-5595 southbridge.
++ *    According to the SiS 5595 datasheets the revision id's of the
++ *    router (ISA-bridge) should be 0x01 or 0xb0.
++ *
++ *    Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
++ *    Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
++ *    They seem to work with the current routing code. However there is
++ *    some concern because of the two USB-OHCI HCs (original SiS 5595
++ *    had only one). YMMV.
++ *
++ *    Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
++ *
++ *    0x61:   IDEIRQ:
++ *            bits [6:5] must be written 01
++ *            bit 4 channel-select primary (0), secondary (1)
++ *
++ *    0x62:   USBIRQ:
++ *            bit 6 OHCI function disabled (0), enabled (1)
++ *
++ *    0x6a:   ACPI/SCI IRQ: bits 4-6 reserved
++ *
++ *    0x7e:   Data Acq. Module IRQ - bits 4-6 reserved
++ *
++ *    We support USBIRQ (in addition to INTA-INTD) and keep the
++ *    IDE, ACPI and DAQ routing untouched as set by the BIOS.
++ *
++ *    Currently the only reported exception is the new SiS 65x chipset
++ *    which includes the SiS 69x southbridge. Here we have the 85C503
++ *    router revision 0x04 and there are changes in the register layout
++ *    mostly related to the different USB HCs with USB 2.0 support.
++ *
++ *    Onchip routing for router rev-id 0x04 (trial-and-error observation)
++ *
++ *    0x60/0x61/0x62/0x63:    1xEHCI and 3xOHCI (companion) USB-HCs
++ *                            bit 6-4 are probably unused, not like 5595
++ */
++
++#define PIRQ_SIS_IRQ_MASK     0x0f
++#define PIRQ_SIS_IRQ_DISABLE  0x80
++#define PIRQ_SIS_USB_ENABLE   0x40
++
++static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      /* Link values 0x01-0x04 are treated as registers 0x41-0x44 */
++      int reg = (pirq >= 0x01 && pirq <= 0x04) ? pirq + 0x40 : pirq;
++      u8 val;
++
++      pci_read_config_byte(router, reg, &val);
++
++      /* A set disable bit means no IRQ is routed */
++      if (val & PIRQ_SIS_IRQ_DISABLE)
++              return 0;
++      return val & PIRQ_SIS_IRQ_MASK;
++}
++
++static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      /* Link values 0x01-0x04 are treated as registers 0x41-0x44 */
++      int reg = (pirq >= 0x01 && pirq <= 0x04) ? pirq + 0x40 : pirq;
++      u8 val;
++
++      /* Read-modify-write so the bits outside IRQ mask/disable are kept */
++      pci_read_config_byte(router, reg, &val);
++      val &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
++      if (irq)
++              val |= irq;
++      else
++              val |= PIRQ_SIS_IRQ_DISABLE;
++      pci_write_config_byte(router, reg, val);
++      return 1;
++}
++
++
++/*
++ * VLSI: nibble offset 0x74 - educated guess due to routing table and
++ *       config space of VLSI 82C534 PCI-bridge/router (1004:0102)
++ *       Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
++ *       devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
++ *       for the busbridge to the docking station.
++ */
++
++static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      WARN_ON_ONCE(pirq >= 9);
++
++      /* Links up to 8 are read from the nibbles starting at offset 0x74 */
++      if (pirq <= 8)
++              return read_config_nybble(router, 0x74, pirq-1);
++
++      dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
++      return 0;
++}
++
++static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      WARN_ON_ONCE(pirq >= 9);
++
++      /* Links up to 8 are written to the nibbles starting at offset 0x74 */
++      if (pirq <= 8) {
++              write_config_nybble(router, 0x74, pirq-1, irq);
++              return 1;
++      }
++
++      dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
++      return 0;
++}
++
++/*
++ * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
++ * and Redirect I/O registers (0x0c00 and 0x0c01).  The Index register
++ * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a.  The Redirect
++ * register is a straight binary coding of desired PIC IRQ (low nibble).
++ *
++ * The 'link' value in the PIRQ table is already in the correct format
++ * for the Index register.  There are some special index values:
++ * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
++ * and 0x03 for SMBus.
++ */
++static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      unsigned char redirect;
++
++      /* Select the link through port 0xc00, then read port 0xc01 */
++      outb(pirq, 0xc00);
++      redirect = inb(0xc01);
++      return redirect & 0xf;
++}
++
++static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      /* Select the link through port 0xc00, then write the IRQ to port 0xc01 */
++      outb(pirq, 0xc00);
++      outb(irq, 0xc01);
++
++      return 1;
++}
++
++/* Support for AMD756 PCI IRQ Routing
++ * Jhon H. Caicedo <jhcaiced@osso.org.co>
++ * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
++ * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
++ * The AMD756 pirq rules are nibble-based
++ * offset 0x56 0-3 PIRQA  4-7  PIRQB
++ * offset 0x57 0-3 PIRQC  4-7  PIRQD
++ */
++static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      /* Links above 4 are not read and report IRQ 0 */
++      u8 irq = 0;
++
++      if (pirq <= 4)
++              irq = read_config_nybble(router, 0x56, pirq - 1);
++
++      dev_info(&dev->dev,
++               "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n",
++               dev->vendor, dev->device, pirq, irq);
++      return irq;
++}
++
++static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      dev_info(&dev->dev,
++               "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n",
++               dev->vendor, dev->device, pirq, irq);
++
++      /* Links above 4 are logged but not written; success is still reported */
++      if (pirq > 4)
++              return 1;
++
++      write_config_nybble(router, 0x56, pirq - 1, irq);
++      return 1;
++}
++
++/*
++ * PicoPower PT86C523
++ */
++static int pirq_pico_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
++{
++      int link = pirq - 1;
++      unsigned char val;
++
++      /* Two links share one register: select it, then pick the nibble */
++      outb(0x10 + (link >> 1), 0x24);
++      val = inb(0x26);
++      if (link & 1)
++              return val >> 4;
++      return val & 0xf;
++}
++
++static int pirq_pico_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
++                      int irq)
++{
++      int link = pirq - 1;
++      unsigned int val;
++
++      /* Two links share one register: select it and update one nibble */
++      outb(0x10 + (link >> 1), 0x24);
++      val = inb(0x26);
++      if (link & 1)
++              val = (val & 0x0f) | (irq << 4);
++      else
++              val = (val & 0xf0) | (irq);
++      outb(val, 0x26);
++      return 1;
++}
++
++#ifdef CONFIG_PCI_BIOS
++
++static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
++{
++      struct pci_dev *bridge;
++      int pin;
++
++      /* The BIOS call is given the bridge and pin found for @dev, pin less one */
++      pin = pci_get_interrupt_pin(dev, &bridge);
++      return pcibios_set_irq_routing(bridge, pin - 1, irq);
++}
++
++#endif
++
++static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
++{
++      static struct pci_device_id __initdata pirq_440gx[] = {
++              { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
++              { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
++              { },
++      };
++      int piix_compatible;
++
++      /* 440GX has a proprietary PIRQ router -- don't use it */
++      if (pci_dev_present(pirq_440gx))
++              return 0;
++
++      /* Devices recognised by their exact device ID */
++      switch (device) {
++      case PCI_DEVICE_ID_INTEL_82371FB_0:
++      case PCI_DEVICE_ID_INTEL_82371SB_0:
++      case PCI_DEVICE_ID_INTEL_82371AB_0:
++      case PCI_DEVICE_ID_INTEL_82371MX:
++      case PCI_DEVICE_ID_INTEL_82443MX_0:
++      case PCI_DEVICE_ID_INTEL_82801AA_0:
++      case PCI_DEVICE_ID_INTEL_82801AB_0:
++      case PCI_DEVICE_ID_INTEL_82801BA_0:
++      case PCI_DEVICE_ID_INTEL_82801BA_10:
++      case PCI_DEVICE_ID_INTEL_82801CA_0:
++      case PCI_DEVICE_ID_INTEL_82801CA_12:
++      case PCI_DEVICE_ID_INTEL_82801DB_0:
++      case PCI_DEVICE_ID_INTEL_82801E_0:
++      case PCI_DEVICE_ID_INTEL_82801EB_0:
++      case PCI_DEVICE_ID_INTEL_ESB_1:
++      case PCI_DEVICE_ID_INTEL_ICH6_0:
++      case PCI_DEVICE_ID_INTEL_ICH6_1:
++      case PCI_DEVICE_ID_INTEL_ICH7_0:
++      case PCI_DEVICE_ID_INTEL_ICH7_1:
++      case PCI_DEVICE_ID_INTEL_ICH7_30:
++      case PCI_DEVICE_ID_INTEL_ICH7_31:
++      case PCI_DEVICE_ID_INTEL_TGP_LPC:
++      case PCI_DEVICE_ID_INTEL_ESB2_0:
++      case PCI_DEVICE_ID_INTEL_ICH8_0:
++      case PCI_DEVICE_ID_INTEL_ICH8_1:
++      case PCI_DEVICE_ID_INTEL_ICH8_2:
++      case PCI_DEVICE_ID_INTEL_ICH8_3:
++      case PCI_DEVICE_ID_INTEL_ICH8_4:
++      case PCI_DEVICE_ID_INTEL_ICH9_0:
++      case PCI_DEVICE_ID_INTEL_ICH9_1:
++      case PCI_DEVICE_ID_INTEL_ICH9_2:
++      case PCI_DEVICE_ID_INTEL_ICH9_3:
++      case PCI_DEVICE_ID_INTEL_ICH9_4:
++      case PCI_DEVICE_ID_INTEL_ICH9_5:
++      case PCI_DEVICE_ID_INTEL_EP80579_0:
++      case PCI_DEVICE_ID_INTEL_ICH10_0:
++      case PCI_DEVICE_ID_INTEL_ICH10_1:
++      case PCI_DEVICE_ID_INTEL_ICH10_2:
++      case PCI_DEVICE_ID_INTEL_ICH10_3:
++      case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0:
++      case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1:
++              piix_compatible = 1;
++              break;
++      default:
++              piix_compatible = 0;
++              break;
++      }
++
++      /* Devices recognised by a device ID range */
++      if (!piix_compatible)
++              piix_compatible =
++                      (device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN &&
++                       device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX) ||
++                      (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
++                       device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX) ||
++                      (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
++                       device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX);
++
++      if (!piix_compatible)
++              return 0;
++
++      r->name = "PIIX/ICH";
++      r->get = pirq_piix_get;
++      r->set = pirq_piix_set;
++      return 1;
++}
++
++static __init int via_router_probe(struct irq_router *r,
++                              struct pci_dev *router, u16 device)
++{
++      /* FIXME: We should move some of the quirk fixup stuff here */
++
++      /*
++       * Workarounds for some buggy BIOSes that wrongly report a newer
++       * router as 586-compatible: trust the router's own device ID for
++       *   82C686 (Asus k7m), 8235 (Asus a7v-x) and 8237 (Asus a7v600).
++       */
++      if (device == PCI_DEVICE_ID_VIA_82C586_0) {
++              switch (router->device) {
++              case PCI_DEVICE_ID_VIA_82C686:
++              case PCI_DEVICE_ID_VIA_8235:
++              case PCI_DEVICE_ID_VIA_8237:
++                      device = router->device;
++                      break;
++              }
++      }
++
++      switch (device) {
++      case PCI_DEVICE_ID_VIA_82C586_0:
++              r->get = pirq_via586_get;
++              r->set = pirq_via586_set;
++              break;
++      case PCI_DEVICE_ID_VIA_82C596:
++      case PCI_DEVICE_ID_VIA_82C686:
++      case PCI_DEVICE_ID_VIA_8231:
++      case PCI_DEVICE_ID_VIA_8233A:
++      case PCI_DEVICE_ID_VIA_8235:
++      case PCI_DEVICE_ID_VIA_8237:
++              /* FIXME: add new ones for 8233/5 */
++              r->get = pirq_via_get;
++              r->set = pirq_via_set;
++              break;
++      default:
++              return 0;
++      }
++
++      r->name = "VIA";
++      return 1;
++}
++
++static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
++{
++      /* Only the 82C534 is recognised */
++      if (device != PCI_DEVICE_ID_VLSI_82C534)
++              return 0;
++
++      r->name = "VLSI 82C534";
++      r->get = pirq_vlsi_get;
++      r->set = pirq_vlsi_set;
++      return 1;
++}
++
++
++static __init int serverworks_router_probe(struct irq_router *r,
++              struct pci_dev *router, u16 device)
++{
++      /* OSB4 and CSB5 share the same accessors */
++      if (device != PCI_DEVICE_ID_SERVERWORKS_OSB4 &&
++          device != PCI_DEVICE_ID_SERVERWORKS_CSB5)
++              return 0;
++
++      r->name = "ServerWorks";
++      r->get = pirq_serverworks_get;
++      r->set = pirq_serverworks_set;
++      return 1;
++}
++
++static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
++{
++      switch (device) {
++      case PCI_DEVICE_ID_SI_503:
++              r->name = "SIS";
++              r->get = pirq_sis_get;
++              r->set = pirq_sis_set;
++              return 1;
++      default:
++              /* Any other device ID is not handled here */
++              return 0;
++      }
++}
++
++static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
++{
++      /* Only the 5520 is recognised */
++      if (device != PCI_DEVICE_ID_CYRIX_5520)
++              return 0;
++
++      r->name = "NatSemi";
++      r->get = pirq_cyrix_get;
++      r->set = pirq_cyrix_set;
++      return 1;
++}
++
++static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
++{
++      /* Only the 82C700 is recognised */
++      if (device != PCI_DEVICE_ID_OPTI_82C700)
++              return 0;
++
++      r->name = "OPTI";
++      r->get = pirq_opti_get;
++      r->set = pirq_opti_set;
++      return 1;
++}
++
++static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
++{
++      /* Only the IT8330G is recognised */
++      if (device != PCI_DEVICE_ID_ITE_IT8330G_0)
++              return 0;
++
++      r->name = "ITE";
++      r->get = pirq_ite_get;
++      r->set = pirq_ite_set;
++      return 1;
++}
++
++static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
++{
++      /* M1533 and M1563 share the same accessors */
++      if (device != PCI_DEVICE_ID_AL_M1533 &&
++          device != PCI_DEVICE_ID_AL_M1563)
++              return 0;
++
++      r->name = "ALI";
++      r->get = pirq_ali_get;
++      r->set = pirq_ali_set;
++      return 1;
++}
++
++static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
++{
++      /* The three supported chips differ only in the reported name */
++      if (device == PCI_DEVICE_ID_AMD_VIPER_740B)
++              r->name = "AMD756";
++      else if (device == PCI_DEVICE_ID_AMD_VIPER_7413)
++              r->name = "AMD766";
++      else if (device == PCI_DEVICE_ID_AMD_VIPER_7443)
++              r->name = "AMD768";
++      else
++              return 0;
++
++      r->get = pirq_amd756_get;
++      r->set = pirq_amd756_set;
++      return 1;
++}
++
++static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
++{
++      /* Both variants use the same accessors and differ only in name */
++      switch (device) {
++      case PCI_DEVICE_ID_PICOPOWER_PT86C523:
++              r->name = "PicoPower PT86C523";
++              break;
++      case PCI_DEVICE_ID_PICOPOWER_PT86C523BBP:
++              r->name = "PicoPower PT86C523 rev. BB+";
++              break;
++      default:
++              return 0;
++      }
++
++      r->get = pirq_pico_get;
++      r->set = pirq_pico_set;
++      return 1;
++}
++
++/* Vendor ID -> probe routine table, walked in order until a zero vendor entry */
++static __initdata struct irq_router_handler pirq_routers[] = {
++      { .vendor = PCI_VENDOR_ID_INTEL,       .probe = intel_router_probe },
++      { .vendor = PCI_VENDOR_ID_AL,          .probe = ali_router_probe },
++      { .vendor = PCI_VENDOR_ID_ITE,         .probe = ite_router_probe },
++      { .vendor = PCI_VENDOR_ID_VIA,         .probe = via_router_probe },
++      { .vendor = PCI_VENDOR_ID_OPTI,        .probe = opti_router_probe },
++      { .vendor = PCI_VENDOR_ID_SI,          .probe = sis_router_probe },
++      { .vendor = PCI_VENDOR_ID_CYRIX,       .probe = cyrix_router_probe },
++      { .vendor = PCI_VENDOR_ID_VLSI,        .probe = vlsi_router_probe },
++      { .vendor = PCI_VENDOR_ID_SERVERWORKS, .probe = serverworks_router_probe },
++      { .vendor = PCI_VENDOR_ID_AMD,         .probe = amd_router_probe },
++      { .vendor = PCI_VENDOR_ID_PICOPOWER,   .probe = pico_router_probe },
++      /* Someone with docs needs to add the ATI Radeon IGP */
++      { .vendor = 0, .probe = NULL }
++};
++/* Router description (name, get, set) filled in by pirq_find_router() */
++static struct irq_router pirq_router;
++/* Router PCI device looked up by pirq_find_router(); passed to the get/set callbacks */
++static struct pci_dev *pirq_router_dev;
++
++
++/*
++ *    FIXME: should we have an option to say "generic for
++ *    chipset" ?
++ */
++
++/*
++ * Choose the IRQ router implementation described by the PIRQ routing table
++ * and fill in @r (its name and get/set callbacks).
++ *
++ * With CONFIG_PCI_BIOS, a table without a signature selects BIOS routing.
++ * Otherwise the router device named by the table is looked up and offered to
++ * each vendor handler in pirq_routers[]; if none accepts it, @r keeps the
++ * "default" name with NULL get/set callbacks.
++ */
++static void __init pirq_find_router(struct irq_router *r)
++{
++      struct irq_routing_table *rt = pirq_table;
++      struct irq_router_handler *h;
++
++#ifdef CONFIG_PCI_BIOS
++      if (!rt->signature) {
++              printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
++              r->set = pirq_bios_set;
++              r->name = "BIOS";
++              return;
++      }
++#endif
++
++      /* Default unless a driver reloads it */
++      r->name = "default";
++      r->get = NULL;
++      r->set = NULL;
++
++      DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n",
++          rt->rtr_vendor, rt->rtr_device);
++
++      pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
++      if (!pirq_router_dev) {
++              DBG(KERN_DEBUG "PCI: Interrupt router not found at "
++                      "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
++              return;
++      }
++
++      for (h = pirq_routers; h->vendor; h++) {
++              /* First look for a router match */
++              if (rt->rtr_vendor == h->vendor &&
++                      h->probe(r, pirq_router_dev, rt->rtr_device))
++                      break;
++              /* Fall back to a device match */
++              if (pirq_router_dev->vendor == h->vendor &&
++                      h->probe(r, pirq_router_dev, pirq_router_dev->device))
++                      break;
++      }
++      /* Report the router that was just filled in, not the global one */
++      dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
++               r->name,
++               pirq_router_dev->vendor, pirq_router_dev->device);
++
++      /* The device remains referenced for the kernel lifetime */
++}
++
++static struct irq_info *pirq_get_info(struct pci_dev *dev)
++{
++      struct irq_routing_table *rt = pirq_table;
++      struct irq_info *info = rt->slots;
++      /* Number of slot entries that follow the table header */
++      int entries = (rt->size - sizeof(struct irq_routing_table)) /
++              sizeof(struct irq_info);
++
++      /* An entry matches on bus number and slot; the function is ignored */
++      while (entries--) {
++              if (info->bus == dev->bus->number &&
++                  PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
++                      return info;
++              info++;
++      }
++      return NULL;
++}
++
++/*
++ * Look up (and, when @assign is non-zero, possibly assign) the IRQ for the
++ * interrupt pin of @dev using the PIRQ routing table and the selected router.
++ *
++ * Returns 1 when an IRQ was determined and propagated to every device that
++ * shares the same link value, 0 otherwise (no pin, I/O APIC routing in use,
++ * no routing table or entry, link not routed, or no usable IRQ).
++ */
++static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
++{
++      u8 pin;
++      struct irq_info *info;
++      int i, pirq, newirq;
++      int irq = 0;
++      u32 mask;
++      struct irq_router *r = &pirq_router;
++      struct pci_dev *dev2 = NULL;
++      char *msg = NULL;
++
++      /* Find IRQ pin */
++      pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
++      if (!pin) {
++              dev_dbg(&dev->dev, "no interrupt pin\n");
++              return 0;
++      }
++
++      if (io_apic_assign_pci_irqs)
++              return 0;
++
++      /* Find IRQ routing entry */
++
++      if (!pirq_table)
++              return 0;
++
++      info = pirq_get_info(dev);
++      if (!info) {
++              dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
++                      'A' + pin - 1);
++              return 0;
++      }
++      pirq = info->irq[pin - 1].link;
++      mask = info->irq[pin - 1].bitmap;
++      if (!pirq) {
++              dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin - 1);
++              return 0;
++      }
++      dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x\n",
++              'A' + pin - 1, pirq, mask, pirq_table->exclusive_irqs);
++      mask &= pcibios_irq_mask;
++
++      /* Work around broken HP Pavilion Notebooks which assign USB to
++         IRQ 9 even though it is actually wired to IRQ 11 */
++
++      if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
++              dev->irq = 11;
++              pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
++              /* The router may have no set callback (default router) */
++              if (r->set)
++                      r->set(pirq_router_dev, dev, pirq, 11);
++      }
++
++      /*
++       * same for Acer Travelmate 360, but with CB and irq 11 -> 10.
++       * The quirk needs the router's get callback, which may be NULL.
++       */
++      if (acer_tm360_irqrouting && dev->irq == 11 &&
++              dev->vendor == PCI_VENDOR_ID_O2 && r->get) {
++              pirq = 0x68;
++              mask = 0x400;
++              dev->irq = r->get(pirq_router_dev, dev, pirq);
++              pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
++      }
++
++      /*
++       * Find the best IRQ to assign: use the one
++       * reported by the device if possible.
++       */
++      newirq = dev->irq;
++      if (newirq && !((1 << newirq) & mask)) {
++              if (pci_probe & PCI_USE_PIRQ_MASK)
++                      newirq = 0;
++              else
++                      dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
++                               "%#x; try pci=usepirqmask\n", newirq, mask);
++      }
++      if (!newirq && assign) {
++              /* Pick the allowed, requestable IRQ with the lowest penalty */
++              for (i = 0; i < 16; i++) {
++                      if (!(mask & (1 << i)))
++                              continue;
++                      if (pirq_penalty[i] < pirq_penalty[newirq] &&
++                              can_request_irq(i, IRQF_SHARED))
++                              newirq = i;
++              }
++      }
++      dev_dbg(&dev->dev, "PCI INT %c -> newirq %d\n", 'A' + pin - 1, newirq);
++
++      /* Check if it is hardcoded */
++      if ((pirq & 0xf0) == 0xf0) {
++              irq = pirq & 0xf;
++              msg = "hardcoded";
++      } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
++      ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
++              msg = "found";
++              eisa_set_level_irq(irq);
++      } else if (newirq && r->set &&
++              (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
++              if (r->set(pirq_router_dev, dev, pirq, newirq)) {
++                      eisa_set_level_irq(newirq);
++                      msg = "assigned";
++                      irq = newirq;
++              }
++      }
++
++      if (!irq) {
++              /* Only guess when the mask allows exactly the candidate IRQ */
++              if (newirq && mask == (1 << newirq)) {
++                      msg = "guessed";
++                      irq = newirq;
++              } else {
++                      dev_dbg(&dev->dev, "can't route interrupt\n");
++                      return 0;
++              }
++      }
++      dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
++
++      /* Update IRQ for all devices with the same pirq value */
++      for_each_pci_dev(dev2) {
++              pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
++              if (!pin)
++                      continue;
++
++              info = pirq_get_info(dev2);
++              if (!info)
++                      continue;
++              if (info->irq[pin - 1].link == pirq) {
++                      /*
++                       * We refuse to override the dev->irq
++                       * information. Give a warning!
++                       */
++                      if (dev2->irq && dev2->irq != irq && \
++                      (!(pci_probe & PCI_USE_PIRQ_MASK) || \
++                      ((1 << dev2->irq) & mask))) {
++#ifndef CONFIG_PCI_MSI
++                              dev_info(&dev2->dev, "IRQ routing conflict: "
++                                       "have IRQ %d, want IRQ %d\n",
++                                       dev2->irq, irq);
++#endif
++                              continue;
++                      }
++                      dev2->irq = irq;
++                      pirq_penalty[irq]++;
++                      if (dev != dev2)
++                              dev_info(&dev->dev, "sharing IRQ %d with %s\n",
++                                       irq, pci_name(dev2));
++              }
++      }
++      return 1;
++}
++
++void __init pcibios_fixup_irqs(void)
++{
++      struct pci_dev *dev = NULL;
++      u8 pin;
++
++      DBG(KERN_DEBUG "PCI: IRQ fixup\n");
++
++      /* Pass 1: drop bogus BIOS IRQs and count the IRQs already in use */
++      for_each_pci_dev(dev) {
++              /* An IRQ number of 16 or more is out of range: ignore it */
++              if (dev->irq >= 16) {
++                      dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
++                      dev->irq = 0;
++              }
++              /*
++               * An IRQ that a PCI device already uses loses its ISA use
++               * penalty before this use is counted.
++               */
++              if (pirq_penalty[dev->irq] >= 100 &&
++                  pirq_penalty[dev->irq] < 100000)
++                      pirq_penalty[dev->irq] = 0;
++              pirq_penalty[dev->irq]++;
++      }
++
++      if (io_apic_assign_pci_irqs)
++              return;
++
++      /* Pass 2: look up an IRQ for devices that have a pin but no IRQ yet */
++      dev = NULL;
++      for_each_pci_dev(dev) {
++              pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
++              if (pin && !dev->irq)
++                      pcibios_lookup_irq(dev, 0);
++      }
++}
++
++/*
++ * Work around broken HP Pavilion Notebooks which assign USB to
++ * IRQ 9 even though it is actually wired to IRQ 11
++ */
++static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d)
++{
++      /* Already enabled: nothing to do and nothing to log */
++      if (broken_hp_bios_irq9)
++              return 0;
++
++      broken_hp_bios_irq9 = 1;
++      printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
++              d->ident);
++      return 0;
++}
++
++/*
++ * Work around broken Acer TravelMate 360 Notebooks which assign
++ * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
++ */
++static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d)
++{
++      /* Already enabled: nothing to do and nothing to log */
++      if (acer_tm360_irqrouting)
++              return 0;
++
++      acer_tm360_irqrouting = 1;
++      printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
++              d->ident);
++      return 0;
++}
++
++/*
++ * DMI matches for machines with known-broken IRQ routing.  Each entry's
++ * callback sets the corresponding quirk flag; the table is passed to
++ * dmi_check_system() from pcibios_irq_init().
++ */
++static struct dmi_system_id __initdata pciirq_dmi_table[] = {
++      {
++              /* Sets broken_hp_bios_irq9 */
++              .callback = fix_broken_hp_bios_irq9,
++              .ident = "HP Pavilion N5400 Series Laptop",
++              .matches = {
++                      DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
++                      DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
++                      DMI_MATCH(DMI_PRODUCT_VERSION,
++                              "HP Pavilion Notebook Model GE"),
++                      DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
++              },
++      },
++      {
++              /* Sets acer_tm360_irqrouting */
++              .callback = fix_acer_tm360_irqrouting,
++              .ident = "Acer TravelMate 36x Laptop",
++              .matches = {
++                      DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
++                      DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
++              },
++      },
++      { }     /* terminating empty entry */
++};
++
++/*
++ * Boot-time setup of PCI IRQ routing: apply the DMI quirks, locate the PIRQ
++ * routing table and its router, adjust the IRQ penalties, run the platform
++ * IRQ fixup hook and, with I/O APIC routing plus pci=routeirq, route every
++ * device up front.  Does nothing when raw_pci_ops is NULL.
++ */
++void __init pcibios_irq_init(void)
++{
++      DBG(KERN_DEBUG "PCI: IRQ init\n");
++
++      if (raw_pci_ops == NULL)
++              return;
++
++      dmi_check_system(pciirq_dmi_table);
++
++      pirq_table = pirq_find_routing_table();
++
++#ifdef CONFIG_PCI_BIOS
++      /* No table found: ask the BIOS for one if BIOS IRQ scanning is enabled */
++      if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
++              pirq_table = pcibios_get_irq_routing_table();
++#endif
++      if (pirq_table) {
++              pirq_peer_trick();
++              pirq_find_router(&pirq_router);
++              if (pirq_table->exclusive_irqs) {
++                      int i;
++                      /* Penalize every IRQ that is not in the exclusive set */
++                      for (i = 0; i < 16; i++)
++                              if (!(pirq_table->exclusive_irqs & (1 << i)))
++                                      pirq_penalty[i] += 100;
++              }
++              /*
++               * If we're using the I/O APIC, avoid using the PCI IRQ
++               * routing table
++               */
++              if (io_apic_assign_pci_irqs)
++                      pirq_table = NULL;
++      }
++
++      x86_init.pci.fixup_irqs();
++
++      if (io_apic_assign_pci_irqs && pci_routeirq) {
++              struct pci_dev *dev = NULL;
++              /*
++               * PCI IRQ routing is set up by pci_enable_device(), but we
++               * also do it here in case there are still broken drivers that
++               * don't use pci_enable_device().
++               */
++              printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
++              for_each_pci_dev(dev)
++                      pirq_enable_irq(dev);
++      }
++}
++
++static void pirq_penalize_isa_irq(int irq, int active)
++{
++      /*
++       *  IRQs that an ISAPnP device lists as possible are made less
++       *  attractive for PCI devices; an active one is penalized more.
++       */
++      if (irq >= 16)
++              return;
++
++      pirq_penalty[irq] += active ? 1000 : 100;
++}
++
++void pcibios_penalize_isa_irq(int irq, int active)
++{
++#ifdef CONFIG_ACPI
++      /* ACPI does the accounting unless ACPI IRQ handling is disabled */
++      if (!acpi_noirq) {
++              acpi_penalize_isa_irq(irq, active);
++              return;
++      }
++#endif
++      pirq_penalize_isa_irq(irq, active);
++}
++
++/*
++ * Make sure @dev has an IRQ for its interrupt pin.
++ *
++ * First tries the PIRQ lookup (with assignment allowed).  If that fails and
++ * I/O APIC routing is in use, the IRQ is looked up by bus/slot/pin, walking
++ * up through parent bridges until one is found.  When nothing works a warning
++ * is printed, except for legacy IDE devices.  Always returns 0.
++ */
++static int pirq_enable_irq(struct pci_dev *dev)
++{
++      u8 pin;
++
++      pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
++      if (pin && !pcibios_lookup_irq(dev, 1)) {
++              char *msg = "";
++
++              /* Lookup failed, but the device already has an IRQ: keep it */
++              if (!io_apic_assign_pci_irqs && dev->irq)
++                      return 0;
++
++              if (io_apic_assign_pci_irqs) {
++#ifdef CONFIG_X86_IO_APIC
++                      struct pci_dev *temp_dev;
++                      int irq;
++                      struct io_apic_irq_attr irq_attr;
++
++                      irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
++                                              PCI_SLOT(dev->devfn),
++                                              pin - 1, &irq_attr);
++                      /*
++                       * Busses behind bridges are typically not listed in the MP-table.
++                       * In this case we have to look up the IRQ based on the parent bus,
++                       * parent slot, and pin number. The SMP code detects such bridged
++                       * busses itself so we should get into this branch reliably.
++                       */
++                      /* dev is reused as the walk cursor; remember the real device */
++                      temp_dev = dev;
++                      while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
++                              struct pci_dev *bridge = dev->bus->self;
++
++                              /* The pin is swizzled at each bridge crossing */
++                              pin = pci_swizzle_interrupt_pin(dev, pin);
++                              irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
++                                              PCI_SLOT(bridge->devfn),
++                                              pin - 1, &irq_attr);
++                              if (irq >= 0)
++                                      dev_warn(&dev->dev, "using bridge %s "
++                                               "INT %c to get IRQ %d\n",
++                                               pci_name(bridge), 'A' + pin - 1,
++                                               irq);
++                              dev = bridge;
++                      }
++                      dev = temp_dev;
++                      if (irq >= 0) {
++                              io_apic_set_pci_routing(&dev->dev, irq,
++                                                       &irq_attr);
++                              dev->irq = irq;
++                              dev_info(&dev->dev, "PCI->APIC IRQ transform: "
++                                       "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
++                              return 0;
++                      } else
++                              msg = "; probably buggy MP table";
++#endif
++              } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
++                      msg = "";
++              else
++                      msg = "; please try using pci=biosirq";
++
++              /*
++               * With IDE legacy devices the IRQ lookup failure is not
++               * a problem..
++               */
++              if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
++                              !(dev->class & 0x5))
++                      return 0;
++
++              dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
++                       'A' + pin - 1, msg);
++      }
++      return 0;
++}
diff --cc arch/x86/pci/pcifront.c

index 0000000,0000000..5d665f8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/pci/pcifront.c
@@@ -1,0 -1,0 +1,60 @@@
++/*
++ * PCI Frontend Stub - puts some "dummy" functions in to the Linux x86 PCI core
++ *                     to support the Xen PCI Frontend's operation
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/irq.h>
++#include <linux/pci.h>
++#include <asm/acpi.h>
++#include <asm/pci_x86.h>
++#include <xen/evtchn.h>
++
++static int pcifront_enable_irq(struct pci_dev *dev)
++{     /* Installed as pcibios_enable_irq by pcifront_x86_stub_init(); returns 0 or -ENOMEM. */
++      u8 irq;
++      pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);    /* NOTE(review): return value is not checked */
++      if (!alloc_irq_and_cfg_at(irq, numa_node_id()))         /* a zero/NULL result is reported as -ENOMEM */
++              return -ENOMEM;
++      evtchn_register_pirq(irq);      /* presumably registers irq as a Xen physical IRQ -- TODO confirm */
++      dev->irq = irq;                 /* record the interrupt line read from config space on the device */
++
++      return 0;
++}
++
++extern u8 pci_cache_line_size;
++
++static int __init pcifront_x86_stub_init(void)
++{     /* arch_initcall: installs the Xen PCI frontend IRQ hooks unless raw_pci_ops is set; always returns 0. */
++      struct cpuinfo_x86 *c = &boot_cpu_data;
++
++      /* Only install our method if we haven't found real hardware already (i.e. raw_pci_ops is still unset) */
++      if (raw_pci_ops)
++              return 0;
++
++      pr_info("PCI: setting up Xen PCI frontend stub\n");
++
++      /* Copied from arch/i386/pci/common.c; every value below is a byte count shifted right by 2 */
++      pci_cache_line_size = 32 >> 2;  /* default when neither vendor test below matches */
++      if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
++              pci_cache_line_size = 64 >> 2;  /* K7 & K8 */
++      else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
++              pci_cache_line_size = 128 >> 2; /* P4 */
++
++      /* On x86, we need to disable the normal IRQ routing table and
++       * just ask the backend
++       */
++      pcibios_enable_irq = pcifront_enable_irq;
++      pcibios_disable_irq = NULL;     /* no disable hook is provided */
++
++#ifdef CONFIG_ACPI
++      /* Keep ACPI out of the picture */
++      acpi_noirq = 1;
++#endif
++
++      return 0;
++}
++
++arch_initcall(pcifront_x86_stub_init);
diff --cc arch/x86/platform/sfi/sfi.c

index 7785b72,7785b72..f37ca26
--- 1/arch/x86/platform/sfi/sfi.c
--- 2/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@@ -32,6 -32,6 +32,7 @@@
   #include <asm/apic.h>
   
   #ifdef CONFIG_X86_LOCAL_APIC
++#ifndef CONFIG_XEN
   static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
   
   /* All CPUs enumerated by SFI must be present and enabled */
@@@ -47,6 -47,6 +48,9 @@@ static void __cpuinit mp_sfi_register_l
   
         generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
   }
++#else
++#define mp_sfi_register_lapic(id)
++#endif
   
   static int __init sfi_parse_cpus(struct sfi_table_header *table)
   {
@@@ -86,9 -86,9 +90,12 @@@ static int __init sfi_parse_ioapic(stru
                 pentry++;
         }
   
++#ifndef CONFIG_XEN
         WARN(pic_mode, KERN_WARNING
                 "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
         pic_mode = 0;
++#endif
++
         return 0;
   }
   #endif /* CONFIG_X86_IO_APIC */
diff --cc arch/x86/power/Makefile

index a6a198c,a6a198c..0832f49
--- 1/arch/x86/power/Makefile
--- 2/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@@ -5,3 -5,3 +5,5 @@@ CFLAGS_cpu.o     := $(nostackp
   
   obj-$(CONFIG_PM_SLEEP)                += cpu.o
   obj-$(CONFIG_HIBERNATION)     += hibernate_$(BITS).o hibernate_asm_$(BITS).o
++
++disabled-obj-$(CONFIG_XEN)    := cpu.o
diff --cc arch/x86/vdso/Makefile

index bef0bc9,b6552b1..dd083ad
--- 1/arch/x86/vdso/Makefile
--- 2/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@@ -78,6 -65,6 +65,7 @@@ obj-$(VDSO32-y)                       += vdso32-syms.ld
   vdso32.so-$(VDSO32-y)         += int80
   vdso32.so-$(CONFIG_COMPAT)    += syscall
   vdso32.so-$(VDSO32-y)         += sysenter
++vdso32.so-$(CONFIG_X86_XEN)   += syscall
   
   vdso32-images                 = $(vdso32.so-y:%=vdso32-%.so)
   
diff --cc arch/x86/vdso/vdso32-setup-xen.c

index 0000000,0000000..c686dc1

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/arch/x86/vdso/vdso32-setup-xen.c
@@@ -1,0 -1,0 +1,482 @@@
++/*
++ * (C) Copyright 2002 Linus Torvalds
++ * Portions based on the vdso-randomization code from exec-shield:
++ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
++ *
++ * This file contains the needed initializations to support sysenter.
++ */
++
++#include <linux/init.h>
++#include <linux/smp.h>
++#include <linux/thread_info.h>
++#include <linux/sched.h>
++#include <linux/gfp.h>
++#include <linux/string.h>
++#include <linux/elf.h>
++#include <linux/mm.h>
++#include <linux/err.h>
++#include <linux/module.h>
++
++#include <asm/cpufeature.h>
++#include <asm/msr.h>
++#include <asm/pgtable.h>
++#include <asm/unistd.h>
++#include <asm/elf.h>
++#include <asm/tlbflush.h>
++#include <asm/vdso.h>
++#include <asm/proto.h>
++
++#include <xen/interface/callback.h>
++
++enum {
++      VDSO_DISABLED = 0,
++      VDSO_ENABLED = 1,
++      VDSO_COMPAT = 2,
++};
++
++#ifdef CONFIG_COMPAT_VDSO
++#define VDSO_DEFAULT  VDSO_COMPAT
++#else
++#define VDSO_DEFAULT  VDSO_ENABLED
++#endif
++
++#ifdef CONFIG_X86_64
++#define vdso_enabled                  sysctl_vsyscall32
++#define arch_setup_additional_pages   syscall32_setup_pages
++#endif
++
++/*
++ * This is the difference between the prelinked addresses in the vDSO images
++ * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
++ * in the user address space.
++ */
++#define VDSO_ADDR_ADJUST      (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
++
++/*
++ * Should the kernel map a VDSO page into processes and pass its
++ * address down to glibc upon exec()?
++ */
++unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
++
++static int __init vdso_setup(char *s)
++{     /* __setup handler for "vdso32=" (and "vdso=" on 32-bit): stores the parsed number in vdso_enabled. */
++      vdso_enabled = simple_strtoul(s, NULL, 0);      /* base 0; the value is not checked against the VDSO_* enum */
++
++      return 1;       /* always 1, whatever the input was */
++}
++
++/*
++ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
++ * behavior on both 64-bit and 32-bit kernels.
++ * On 32-bit kernels, vdso=[012] means the same thing.
++ */
++__setup("vdso32=", vdso_setup);
++
++#ifdef CONFIG_X86_32
++__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
++
++EXPORT_SYMBOL_GPL(vdso_enabled);
++#endif
++
++static __init void reloc_symtab(Elf32_Ehdr *ehdr,
++                              unsigned offset, unsigned size)
++{     /* Adds VDSO_ADDR_ADJUST to st_value of selected symbols in the table at ehdr + offset (size bytes). */
++      Elf32_Sym *sym = (void *)ehdr + offset;
++      unsigned nsym = size / sizeof(*sym);    /* number of whole symbol entries in the table */
++      unsigned i;
++
++      for(i = 0; i < nsym; i++, sym++) {
++              if (sym->st_shndx == SHN_UNDEF ||
++                  sym->st_shndx == SHN_ABS)
++                      continue;  /* skip: undefined and absolute symbols keep their value */
++
++              if (sym->st_shndx > SHN_LORESERVE) {    /* any other index above SHN_LORESERVE: log it and leave the symbol alone */
++                      printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
++                             sym->st_shndx);
++                      continue;
++              }
++
++              switch(ELF_ST_TYPE(sym->st_info)) {     /* only the four types listed are rebased */
++              case STT_OBJECT:
++              case STT_FUNC:
++              case STT_SECTION:
++              case STT_FILE:
++                      sym->st_value += VDSO_ADDR_ADJUST;
++              }       /* no default: every other symbol type is left unchanged */
++      }
++}
++
++static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
++{     /* Walks the dynamic entries at ehdr + offset up to DT_NULL and adds VDSO_ADDR_ADJUST to pointer-valued ones. */
++      Elf32_Dyn *dyn = (void *)ehdr + offset;
++
++      for(; dyn->d_tag != DT_NULL; dyn++)
++              switch(dyn->d_tag) {
++              case DT_PLTGOT:
++              case DT_HASH:
++              case DT_STRTAB:
++              case DT_SYMTAB:
++              case DT_RELA:
++              case DT_INIT:
++              case DT_FINI:
++              case DT_REL:
++              case DT_DEBUG:
++              case DT_JMPREL:
++              case DT_VERSYM:
++              case DT_VERDEF:
++              case DT_VERNEED:
++              case DT_ADDRRNGLO ... DT_ADDRRNGHI:
++                      /* definitely pointers needing relocation */
++                      dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
++                      break;
++
++              case DT_ENCODING ... OLD_DT_LOOS-1:
++              case DT_LOOS ... DT_HIOS-1:
++                      /* Tags above DT_ENCODING are pointers if
++                         they're even */
++                      if (dyn->d_tag >= DT_ENCODING &&        /* NOTE(review): looks redundant given the case ranges -- confirm */
++                          (dyn->d_tag & 1) == 0)
++                              dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
++                      break;
++
++              case DT_VERDEFNUM:
++              case DT_VERNEEDNUM:
++              case DT_FLAGS_1:
++              case DT_RELACOUNT:
++              case DT_RELCOUNT:
++              case DT_VALRNGLO ... DT_VALRNGHI:
++                      /* definitely not pointers */
++                      break;
++
++              case OLD_DT_LOOS ... DT_LOOS-1:
++              case DT_HIOS ... DT_VALRNGLO-1:
++              default:
++                      if (dyn->d_tag > DT_ENCODING)   /* unknown tags are only logged, and only above DT_ENCODING */
++                              printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
++                                     dyn->d_tag);
++                      break;
++              }
++}
++
++static __init void relocate_vdso(Elf32_Ehdr *ehdr)
++{     /* Rebases the ELF image at ehdr by VDSO_ADDR_ADJUST: entry point, program headers, dynamic section, allocated sections, symbols. */
++      Elf32_Phdr *phdr;
++      Elf32_Shdr *shdr;
++      int i;
++
++      BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||   /* image must start with the ELF magic, ... */
++             !elf_check_arch_ia32(ehdr) ||                    /* ... pass the ia32 arch check, ... */
++             ehdr->e_type != ET_DYN);                         /* ... and be of type ET_DYN */
++
++      ehdr->e_entry += VDSO_ADDR_ADJUST;
++
++      /* rebase phdrs */
++      phdr = (void *)ehdr + ehdr->e_phoff;
++      for (i = 0; i < ehdr->e_phnum; i++) {
++              phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
++
++              /* relocate dynamic stuff */
++              if (phdr[i].p_type == PT_DYNAMIC)
++                      reloc_dyn(ehdr, phdr[i].p_offset);
++      }
++
++      /* rebase sections */
++      shdr = (void *)ehdr + ehdr->e_shoff;
++      for(i = 0; i < ehdr->e_shnum; i++) {
++              if (!(shdr[i].sh_flags & SHF_ALLOC))    /* sections without SHF_ALLOC are left untouched */
++                      continue;
++
++              shdr[i].sh_addr += VDSO_ADDR_ADJUST;
++
++              if (shdr[i].sh_type == SHT_SYMTAB ||
++                  shdr[i].sh_type == SHT_DYNSYM)      /* symbol tables also get their entries rebased */
++                      reloc_symtab(ehdr, shdr[i].sh_offset,
++                                   shdr[i].sh_size);
++      }
++}
++
++static struct page *vdso32_pages[1];
++
++#ifdef CONFIG_X86_64
++
++#define       vdso32_sysenter()       (boot_cpu_has(X86_FEATURE_SYSENTER32))
++#define       vdso32_syscall()        (boot_cpu_has(X86_FEATURE_SYSCALL32))
++
++void __cpuinit syscall32_cpu_init(void)
++{     /* 64-bit build: registers the 32-bit sysenter and syscall callbacks with the hypervisor. */
++      static const struct callback_register __cpuinitconst cstar = {
++              .type = CALLBACKTYPE_syscall32,
++              .address = (unsigned long)ia32_cstar_target
++      };
++      static const struct callback_register __cpuinitconst sysenter = {
++              .type = CALLBACKTYPE_sysenter,
++              .address = (unsigned long)ia32_sysenter_target
++      };
++
++      if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)        /* negative result: drop the SYSENTER32 capability */
++              setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
++      if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)           /* negative result: drop the SYSCALL32 capability */
++              setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
++}
++
++#define compat_uses_vma               1
++
++static inline void map_compat_vdso(int map)
++{
++}
++
++#else  /* CONFIG_X86_32 */
++
++#define vdso32_sysenter()     (boot_cpu_has(X86_FEATURE_SEP))
++#define vdso32_syscall()      (boot_cpu_has(X86_FEATURE_SYSCALL32))
++
++extern asmlinkage void ia32pv_cstar_target(void);
++static const struct callback_register __cpuinitconst cstar = {
++      .type = CALLBACKTYPE_syscall32,
++      .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
++};
++
++void __cpuinit enable_sep_cpu(void)
++{     /* 32-bit build: registers the syscall32 callback when available, otherwise the sysenter callback. */
++      extern asmlinkage void ia32pv_sysenter_target(void);
++      static struct callback_register __cpuinitdata sysenter = {      /* not const: .type and .address.eip may be rewritten below */
++              .type = CALLBACKTYPE_sysenter,
++              .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
++      };
++
++      if (vdso32_syscall()) {         /* SYSCALL32 takes precedence; sysenter is then not registered at all */
++              if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
++                      BUG();
++              return;
++      }
++
++      if (!vdso32_sysenter())
++              return;
++
++      if (xen_feature(XENFEAT_supervisor_mode_kernel))        /* use ia32_sysenter_target instead of the ia32pv_ entry */
++              sysenter.address.eip = (unsigned long)ia32_sysenter_target;
++
++      switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
++      case 0:
++              break;
++#if CONFIG_XEN_COMPAT < 0x030200
++      case -ENOSYS:   /* retry once with the deprecated callback type */
++              sysenter.type = CALLBACKTYPE_sysenter_deprecated;
++              if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
++                      break;  /* retry succeeded; otherwise control falls through to default */
++#endif
++      default:        /* registration failed: clear the SEP capability */
++              setup_clear_cpu_cap(X86_FEATURE_SEP);
++              break;
++      }
++}
++
++static struct vm_area_struct gate_vma;
++
++static int __init gate_vma_init(void)
++{     /* Fills in the static gate_vma to span FIXADDR_USER_START..FIXADDR_USER_END; always returns 0. */
++      gate_vma.vm_mm = NULL;          /* not attached to any mm */
++      gate_vma.vm_start = FIXADDR_USER_START;
++      gate_vma.vm_end = FIXADDR_USER_END;
++      gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;       /* no write bits are set */
++      gate_vma.vm_page_prot = __P101;
++      /*
++       * Make sure the vDSO gets into every core dump.
++       * Dumping its contents makes post-mortem fully interpretable later
++       * without matching up the same kernel and hardware config to see
++       * what PC values meant.
++       */
++      gate_vma.vm_flags |= VM_ALWAYSDUMP;
++      return 0;
++}
++
++#define compat_uses_vma               0
++
++static void map_compat_vdso(int map)
++{     /* 32-bit build: maps (map != 0) or unmaps (map == 0) vdso32_pages[0] at the FIX_VDSO fixmap slot. */
++      static int vdso_mapped;         /* last value of map that was applied */
++
++      if (map == vdso_mapped)         /* state unchanged: nothing to do */
++              return;
++
++      vdso_mapped = map;
++
++      __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
++                   map ? PAGE_READONLY_EXEC : PAGE_NONE);    /* PAGE_NONE is used for the unmapped state */
++
++      /* flush stray tlbs */
++      flush_tlb_all();
++}
++
++#endif        /* CONFIG_X86_64 */
++
++int __init sysenter_setup(void)
++{     /* Allocates the vDSO page, copies the syscall/sysenter/int80 image into it and relocates it; always returns 0. */
++      void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);       /* NOTE(review): result is used below without a NULL check */
++      const void *vsyscall;
++      size_t vsyscall_len;
++
++      vdso32_pages[0] = virt_to_page(syscall_page);
++
++#ifdef CONFIG_X86_32
++      gate_vma_init();
++
++      if (boot_cpu_has(X86_FEATURE_SYSCALL)) {        /* keep SYSCALL only on AMD and only if the cstar callback registers */
++              if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
++                  && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
++                      setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
++              else {
++                      setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
++                      setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
++              }
++      }
++#endif
++      if (vdso32_syscall()) {         /* image preference: syscall, then sysenter, then int80 */
++              vsyscall = &vdso32_syscall_start;
++              vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
++      } else if (vdso32_sysenter()){
++              vsyscall = &vdso32_sysenter_start;
++              vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
++      } else {
++              vsyscall = &vdso32_int80_start;
++              vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
++      }
++
++      memcpy(syscall_page, vsyscall, vsyscall_len);   /* NOTE(review): vsyscall_len is not checked against the one-page destination */
++      relocate_vdso(syscall_page);
++
++      return 0;
++}
++
++/* Set up a VMA at program startup for the vsyscall page; returns 0 or the error from get_unmapped_area()/install_special_mapping() */
++int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
++{     /* bprm and uses_interp are not used here; on X86_64 this name is #defined to syscall32_setup_pages */
++      struct mm_struct *mm = current->mm;
++      unsigned long addr;
++      int ret = 0;
++      bool compat;
++
++      if (vdso_enabled == VDSO_DISABLED)      /* vDSO switched off: nothing is mapped */
++              return 0;
++
++      down_write(&mm->mmap_sem);      /* released at up_fail on every path below */
++
++      /* Test compat mode once here, in case someone
++         changes it via sysctl */
++      compat = (vdso_enabled == VDSO_COMPAT);
++
++      map_compat_vdso(compat);
++
++      if (compat)
++              addr = VDSO_HIGH_BASE;  /* compat mode uses the fixed high address */
++      else {
++              addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);    /* otherwise ask for any free page-sized range */
++              if (IS_ERR_VALUE(addr)) {
++                      ret = addr;
++                      goto up_fail;
++              }
++      }
++
++      current->mm->context.vdso = (void *)addr;       /* reset to NULL at up_fail if a later step fails */
++
++      if (compat_uses_vma || !compat) {       /* compat_uses_vma is 1 on X86_64 and 0 on X86_32 */
++              /*
++               * MAYWRITE to allow gdb to COW and set breakpoints
++               *
++               * Make sure the vDSO gets into every core dump.
++               * Dumping its contents makes post-mortem fully
++               * interpretable later without matching up the same
++               * kernel and hardware config to see what PC values
++               * meant.
++               */
++              ret = install_special_mapping(mm, addr, PAGE_SIZE,
++                                            VM_READ|VM_EXEC|
++                                            VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
++                                            VM_ALWAYSDUMP,
++                                            vdso32_pages);
++
++              if (ret)
++                      goto up_fail;
++      }
++
++      current_thread_info()->sysenter_return =
++              VDSO32_SYMBOL(addr, SYSENTER_RETURN);   /* SYSENTER_RETURN symbol resolved relative to addr */
++
++  up_fail:    /* also reached on success, with ret == 0 */
++      if (ret)
++              current->mm->context.vdso = NULL;
++
++      up_write(&mm->mmap_sem);
++
++      return ret;
++}
++
++#ifdef CONFIG_X86_64
++
++subsys_initcall(sysenter_setup);
++
++#ifdef CONFIG_SYSCTL
++/* Register vsyscall32 into the ABI table */
++#include <linux/sysctl.h>
++
++static ctl_table abi_table2[] = {
++      {
++              .procname       = "vsyscall32",
++              .data           = &sysctl_vsyscall32,
++              .maxlen         = sizeof(int),
++              .mode           = 0644,
++              .proc_handler   = proc_dointvec
++      },
++      {}
++};
++
++static ctl_table abi_root_table2[] = {
++      {
++              .procname = "abi",
++              .mode = 0555,
++              .child = abi_table2
++      },
++      {}
++};
++
++static __init int ia32_binfmt_init(void)
++{     /* Initcall (64-bit with CONFIG_SYSCTL): registers abi_root_table2, whose "abi" entry has the child "vsyscall32". */
++      register_sysctl_table(abi_root_table2); /* NOTE(review): the returned value is discarded */
++      return 0;
++}
++__initcall(ia32_binfmt_init);
++#endif
++
++#else  /* CONFIG_X86_32 */
++
++const char *arch_vma_name(struct vm_area_struct *vma)
++{     /* Returns "[vdso]" when vma starts at its mm's recorded vDSO address, otherwise NULL. */
++      if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)     /* a vma with no mm never matches */
++              return "[vdso]";
++      return NULL;
++}
++
++struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
++{     /* Returns the static gate_vma when mm's vDSO sits at VDSO_HIGH_BASE, otherwise NULL (also for mm == NULL). */
++      /*
++       * Check to see if the corresponding task was created in compat vdso
++       * mode.
++       */
++      if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
++              return &gate_vma;
++      return NULL;
++}
++
++int in_gate_area(struct mm_struct *mm, unsigned long addr)
++{     /* Non-zero when get_gate_vma(mm) yields a vma and addr lies in [vm_start, vm_end). */
++      const struct vm_area_struct *vma = get_gate_vma(mm);
++
++      return vma && addr >= vma->vm_start && addr < vma->vm_end;     /* vm_end is exclusive */
++}
++
++int in_gate_area_no_mm(unsigned long addr)
++{     /* Without an mm no address is reported as being in the gate area. */
++      return 0;       /* addr is ignored */
++}
++
++#endif        /* CONFIG_X86_64 */
diff --cc arch/x86/vdso/vdso32.S

index 2ce5f82,2ce5f82..8d4f773
--- 1/arch/x86/vdso/vdso32.S
--- 2/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@@ -9,7 -9,7 +9,7 @@@ vdso32_int80_end
   
         .globl vdso32_syscall_start, vdso32_syscall_end
   vdso32_syscall_start:
--#ifdef CONFIG_COMPAT
++#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
         .incbin "arch/x86/vdso/vdso32-syscall.so"
   #endif
   vdso32_syscall_end:
diff --cc arch/x86/vdso/vdso32/note.S

index c83f257,c83f257..b6ed8cd
--- 1/arch/x86/vdso/vdso32/note.S
--- 2/arch/x86/vdso/vdso32/note.S
+++ b/arch/x86/vdso/vdso32/note.S
@@@ -13,7 -13,7 +13,7 @@@ ELFNOTE_START(Linux, 0, "a"
         .long LINUX_VERSION_CODE
   ELFNOTE_END
   
--#ifdef CONFIG_XEN
++#if defined(CONFIG_X86_XEN) || defined(CONFIG_PARAVIRT_XEN)
   /*
    * Add a special note telling glibc's dynamic linker a fake hardware
    * flavor that it will use to choose the search path for libraries in the
@@@ -37,8 -37,8 +37,12 @@@
   
   ELFNOTE_START(GNU, 2, "a")
         .long 1                 /* ncaps */
++#ifdef CONFIG_PARAVIRT_XEN
   VDSO32_NOTE_MASK:             /* Symbol used by arch/x86/xen/setup.c */
         .long 0                 /* mask */
++#else
++      .long 1 << VDSO_NOTE_NONEGSEG_BIT /* mask */
++#endif
         .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
   ELFNOTE_END
   #endif
diff --cc arch/x86/vdso/vdso32/syscall.S

index 5415b56,5415b56..0a27d17
--- 1/arch/x86/vdso/vdso32/syscall.S
--- 2/arch/x86/vdso/vdso32/syscall.S
+++ b/arch/x86/vdso/vdso32/syscall.S
@@@ -19,8 -19,8 +19,10 @@@ __kernel_vsyscall
   .Lpush_ebp:
         movl    %ecx, %ebp
         syscall
++#ifndef CONFIG_XEN
         movl    $__USER32_DS, %ecx
         movl    %ecx, %ss
++#endif
         movl    %ebp, %ecx
         popl    %ebp
   .Lpop_ebp:
diff --cc arch/x86/xen/Kconfig

index 5cc821c,5cc821c..e83f3b8
--- 1/arch/x86/xen/Kconfig
--- 2/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@@ -2,7 -2,7 +2,7 @@@
   # This Kconfig describes xen options
   #
   
--config XEN
++config PARAVIRT_XEN
         bool "Xen guest support"
         select PARAVIRT
         select PARAVIRT_CLOCK
@@@ -15,36 -15,36 +15,39 @@@
   
   config XEN_DOM0
         def_bool y
--      depends on XEN && PCI_XEN && SWIOTLB_XEN
++      depends on PARAVIRT_XEN && PCI_XEN && SWIOTLB_XEN
         depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
   
   # Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
   # name in tools.
--config XEN_PRIVILEGED_GUEST
--      def_bool XEN_DOM0
++# This doesn't work together with our identical symbol in drivers/xen/Kconfig
++# (produces a recursive dependency), and renaming it is pointless given that
++# it's meant as a compatibility thing.
++#config XEN_PRIVILEGED_GUEST
++#     def_bool XEN_DOM0
   
   config XEN_PVHVM
         def_bool y
--      depends on XEN
++      depends on PARAVIRT_XEN
         depends on X86_LOCAL_APIC
   
   config XEN_MAX_DOMAIN_MEMORY
          int
          default 128
--       depends on XEN
++       depends on PARAVIRT_XEN
          help
            This only affects the sizing of some bss arrays, the unused
            portions of which are freed.
   
   config XEN_SAVE_RESTORE
          bool
--       depends on XEN
++       depends on PARAVIRT_XEN
          select HIBERNATE_CALLBACKS
          default y
   
   config XEN_DEBUG_FS
         bool "Enable Xen debug and tuning parameters in debugfs"
--      depends on XEN && DEBUG_FS
++      depends on PARAVIRT_XEN && DEBUG_FS
         default n
         help
           Enable statistics output and various tuning options in debugfs.
@@@ -52,7 -52,7 +55,7 @@@
   
   config XEN_DEBUG
         bool "Enable Xen debug checks"
--      depends on XEN
++      depends on PARAVIRT_XEN
         default n
         help
           Enable various WARN_ON checks in the Xen MMU code.
diff --cc arch/x86/xen/enlighten.c

index dd7b88f,e3c6a06..6906e52
--- 1/arch/x86/xen/enlighten.c
--- 2/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@@ -115,8 -115,8 +115,8 @@@ static int have_vcpu_info_placement = 1
   static void clamp_max_cpus(void)
   {
   #ifdef CONFIG_SMP
--      if (setup_max_cpus > MAX_VIRT_CPUS)
--              setup_max_cpus = MAX_VIRT_CPUS;
++      if (setup_max_cpus > XEN_LEGACY_MAX_VCPUS)
++              setup_max_cpus = XEN_LEGACY_MAX_VCPUS;
   #endif
   }
   
@@@ -128,11 -128,11 +128,11 @@@ static void xen_vcpu_setup(int cpu
   
         BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
   
--      if (cpu < MAX_VIRT_CPUS)
++      if (cpu < XEN_LEGACY_MAX_VCPUS)
                 per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
   
         if (!have_vcpu_info_placement) {
--              if (cpu >= MAX_VIRT_CPUS)
++              if (cpu >= XEN_LEGACY_MAX_VCPUS)
                         clamp_max_cpus();
                 return;
         }
diff --cc arch/x86/xen/xen-head.S

index aaa7291,aaa7291..de87595
--- 1/arch/x86/xen/xen-head.S
--- 2/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@@ -1,7 -1,7 +1,7 @@@
   /* Xen-specific pieces of head.S, intended to be included in the right
         place in head.S */
   
--#ifdef CONFIG_XEN
++#ifdef CONFIG_PARAVIRT_XEN
   
   #include <linux/elfnote.h>
   #include <linux/init.h>
@@@ -52,4 -52,4 +52,4 @@@ ENTRY(hypercall_page
         ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   _ASM_PTR __HYPERVISOR_VIRT_START)
         ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   _ASM_PTR 0)
   
--#endif /*CONFIG_XEN */
++#endif /* CONFIG_PARAVIRT_XEN */
diff --cc block/blk.h

index d658628,6126346..4df474d
--- 1/block/blk.h
--- 2/block/blk.h
+++ b/block/blk.h
@@@ -62,28 -62,7 +62,8 @@@ static inline struct request *__elv_nex
                         return rq;
                 }
   
-               /*
-                * Flush request is running and flush request isn't queueable
-                * in the drive, we can hold the queue till flush request is
-                * finished. Even we don't do this, driver can't dispatch next
-                * requests and will requeue them. And this can improve
-                * throughput too. For example, we have request flush1, write1,
-                * flush 2. flush1 is dispatched, then queue is hold, write1
-                * isn't inserted to queue. After flush1 is finished, flush2
-                * will be dispatched. Since disk cache is already clean,
-                * flush2 will be finished very soon, so looks like flush2 is
-                * folded to flush1.
-                * Since the queue is hold, a flag is set to indicate the queue
-                * should be restarted later. Please see flush_end_io() for
-                * details.
-                */
-               if (q->flush_pending_idx != q->flush_running_idx &&
-                               !queue_flush_queueable(q)) {
-                       q->flush_queue_delayed = 1;
-                       return NULL;
-               }
- -              if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
+ +              if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
+ +                  !q->elevator->ops->elevator_dispatch_fn(q, 0))
                         return NULL;
         }
   }
diff --cc block/genhd.c
Simple merge
diff --cc drivers/Makefile

index 09f3232,3f135b6..b512c9a
--- 1/drivers/Makefile
--- 2/drivers/Makefile
+++ b/drivers/Makefile
@@@ -17,12 -17,9 +17,9 @@@ obj-$(CONFIG_SFI)            += sfi
   # was used and do nothing if so
   obj-$(CONFIG_PNP)             += pnp/
   obj-$(CONFIG_ARM_AMBA)                += amba/
- # Many drivers will want to use DMA so this has to be made available
- # really early.
- obj-$(CONFIG_DMA_ENGINE)      += dma/
   
   obj-$(CONFIG_VIRTIO)          += virtio/
--obj-$(CONFIG_XEN)             += xen/
++obj-$(CONFIG_PARAVIRT_XEN)    += xen/
   
   # regulators early, since some subsystems rely on them to initialize
   obj-$(CONFIG_REGULATOR)               += regulator/
@@@ -45,6 -42,6 +42,7 @@@ obj-$(CONFIG_PARPORT)         += parport
   obj-y                         += base/ block/ misc/ mfd/ nfc/
   obj-$(CONFIG_NUBUS)           += nubus/
   obj-y                         += macintosh/
++obj-$(CONFIG_XEN)             += xen/
   obj-$(CONFIG_IDE)             += ide/
   obj-$(CONFIG_SCSI)            += scsi/
   obj-$(CONFIG_ATA)             += ata/
diff --cc drivers/acpi/Kconfig

index de0e3df,3a17ca5..3e8b063
--- 1/drivers/acpi/Kconfig
--- 2/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@@ -184,7 -195,7 +195,7 @@@ config ACPI_DOC
   config ACPI_PROCESSOR
         tristate "Processor"
         select THERMAL
--      select CPU_IDLE
++      select CPU_IDLE if !PROCESSOR_EXTERNAL_CONTROL
         default y
         help
           This driver installs ACPI as the idle handler for Linux and uses
@@@ -308,6 -319,6 +319,7 @@@ config ACPI_PCI_SLO
   config X86_PM_TIMER
         bool "Power Management Timer Support" if EXPERT
         depends on X86
++      depends on !XEN
         default y
         help
           The Power Management Timer is available on all ACPI-capable,
@@@ -336,7 -347,7 +348,7 @@@ config ACPI_CONTAINE
   
   config ACPI_HOTPLUG_MEMORY
         tristate "Memory Hotplug"
--      depends on MEMORY_HOTPLUG
++      depends on MEMORY_HOTPLUG || XEN_PRIVILEGED_GUEST
         default n
         help
           This driver supports ACPI memory hotplug.  The driver
@@@ -369,21 -380,6 +381,15 @@@ config ACPI_HE
           which is used to report some hardware errors notified via
           SCI, mainly the corrected errors.
   
- config ACPI_CUSTOM_METHOD
-       tristate "Allow ACPI methods to be inserted/replaced at run time"
-       depends on DEBUG_FS
-       default n
-       help
-         This debug facility allows ACPI AML methods to me inserted and/or
-         replaced without rebooting the system. For details refer to:
-         Documentation/acpi/method-customizing.txt.
- 
-         NOTE: This option is security sensitive, because it allows arbitrary
-         kernel memory to be written to by root (uid=0) users, allowing them
-         to bypass certain security measures (e.g. if root is not allowed to
-         load additional kernel modules after boot, this feature may be used
-         to override that restriction).
- 
   source "drivers/acpi/apei/Kconfig"
   
++config ACPI_PV_SLEEP
++      bool
++      depends on X86 && XEN && ACPI_SLEEP
++      default y
++
++config PROCESSOR_EXTERNAL_CONTROL
++      bool
++      depends on (X86 || IA64) && XEN
++      default y
   endif # ACPI
diff --cc drivers/acpi/Makefile

index ecb26b4,d113fa5..8d8cf3e
--- 1/drivers/acpi/Makefile
--- 2/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@@ -67,6 -67,6 +67,7 @@@ obj-$(CONFIG_ACPI_EC_DEBUGFS) += ec_sys
   processor-y                   := processor_driver.o processor_throttling.o
   processor-y                   += processor_idle.o processor_thermal.o
   processor-$(CONFIG_CPU_FREQ)  += processor_perflib.o
++processor-$(CONFIG_PROCESSOR_EXTERNAL_CONTROL) += processor_perflib.o processor_extcntl.o
   
   obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
   obj-$(CONFIG_ACPI_IPMI)               += acpi_ipmi.o
diff --cc drivers/acpi/acpi_memhotplug.c

index d985713,d985713..a63bc73
--- 1/drivers/acpi/acpi_memhotplug.c
--- 2/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@@ -88,6 -88,6 +88,14 @@@ struct acpi_memory_device 
   
   static int acpi_hotmem_initialized;
   
++#ifdef CONFIG_XEN
++#include "../xen/core/acpi_memhotplug.c"
++#define memory_add_physaddr_to_nid(start) 0
++#else
++static inline int xen_hotadd_mem_init(void) { return 0; }
++static inline void xen_hotadd_mem_exit(void) {}
++#endif
++
   static acpi_status
   acpi_memory_get_resource(struct acpi_resource *resource, void *context)
   {
@@@ -229,6 -229,6 +237,10 @@@ static int acpi_memory_enable_device(st
                 return result;
         }
   
++#ifdef CONFIG_XEN
++      return xen_hotadd_memory(mem_device);
++#endif
++
         node = acpi_get_node(mem_device->device->handle);
         /*
          * Tell the VM there is more memory here...
@@@ -312,6 -312,6 +324,10 @@@ static int acpi_memory_disable_device(s
         struct acpi_memory_info *info, *n;
   
   
++#ifdef CONFIG_XEN
++      return -EOPNOTSUPP;
++#endif
++
         /*
          * Ask the VM to offline this memory range.
          * Note: Assume that this function returns zero on success
@@@ -531,6 -531,6 +547,10 @@@ static int __init acpi_memory_device_in
         acpi_status status;
   
   
++      result = xen_hotadd_mem_init();
++      if (result < 0)
++              return result;
++
         result = acpi_bus_register_driver(&acpi_memory_device_driver);
   
         if (result < 0)
@@@ -570,6 -570,6 +590,8 @@@ static void __exit acpi_memory_device_e
   
         acpi_bus_unregister_driver(&acpi_memory_device_driver);
   
++      xen_hotadd_mem_exit();
++
         return;
   }
   
diff --cc drivers/acpi/acpica/hwsleep.c

index 2ac28bb,2ac28bb..99646e9
--- 1/drivers/acpi/acpica/hwsleep.c
--- 2/drivers/acpi/acpica/hwsleep.c
+++ b/drivers/acpi/acpica/hwsleep.c
@@@ -236,7 -236,7 +236,11 @@@ acpi_status asmlinkage acpi_enter_sleep
         u32 pm1b_control;
         struct acpi_bit_register_info *sleep_type_reg_info;
         struct acpi_bit_register_info *sleep_enable_reg_info;
++#if !(defined(CONFIG_XEN) && defined(CONFIG_X86))
         u32 in_value;
++#else
++      int err;
++#endif
         struct acpi_object_list arg_list;
         union acpi_object arg;
         acpi_status status;
@@@ -347,6 -347,6 +351,7 @@@
   
         /* Write #2: Write both SLP_TYP + SLP_EN */
   
++#if !(defined(CONFIG_XEN) && defined(CONFIG_X86))
         status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control);
         if (ACPI_FAILURE(status)) {
                 return_ACPI_STATUS(status);
@@@ -386,6 -386,6 +391,15 @@@
                 /* Spin until we wake */
   
         } while (!in_value);
++#else
++      /* PV ACPI just need check hypercall return value */
++      err = acpi_notify_hypervisor_state(sleep_state,
++                      pm1a_control, pm1b_control);
++      if (err) {
++              printk(KERN_ERR "ACPI: Hypervisor failure [%d]\n", err);
++              return_ACPI_STATUS(AE_ERROR);
++      }
++#endif
   
         return_ACPI_STATUS(AE_OK);
   }
@@@ -404,6 -404,6 +418,7 @@@ ACPI_EXPORT_SYMBOL(acpi_enter_sleep_sta
    *              THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED
    *
    ******************************************************************************/
++#ifndef CONFIG_XEN
   acpi_status asmlinkage acpi_enter_sleep_state_s4bios(void)
   {
         u32 in_value;
@@@ -457,6 -457,6 +472,7 @@@
   }
   
   ACPI_EXPORT_SYMBOL(acpi_enter_sleep_state_s4bios)
++#endif
   
   /*******************************************************************************
    *
diff --cc drivers/acpi/pci_irq.c

index f907cfb,f907cfb..08a7ec9
--- 1/drivers/acpi/pci_irq.c
--- 2/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@@ -469,3 -469,3 +469,80 @@@ void acpi_pci_irq_disable(struct pci_de
         dev_info(&dev->dev, "PCI INT %c disabled\n", pin_name(pin));
         acpi_unregister_gsi(gsi);
   }
++
++#if defined(CONFIG_XEN) && defined(CONFIG_PCI)
++/*
++ * xen_setup_gsi - report the GSI of every PCI device to the hypervisor.
++ *
++ * Returns immediately when acpi_noirq is set.  Otherwise every PCI device
++ * that has an interrupt pin is looked up with acpi_pci_irq_lookup() and
++ * the resulting GSI, trigger mode and polarity are handed to Xen through
++ * PHYSDEVOP_setup_gsi.  Devices for which no GSI can be determined fall
++ * back to dev->irq, provided it lies in the range 1..0xF.
++ *
++ * Always returns 0; a failure for one device only skips that device.
++ */
++static int __init xen_setup_gsi(void)
++{
++      struct pci_dev *dev = NULL;
++
++      if (acpi_noirq)
++              return 0;
++
++      /* Loop body is a clone of acpi_pci_irq_enable(). */
++      for_each_pci_dev(dev) {
++              const struct acpi_prt_entry *entry;
++              int gsi;
++              int triggering = ACPI_LEVEL_SENSITIVE;
++              int polarity = ACPI_ACTIVE_LOW;
++              struct physdev_setup_gsi setup_gsi;
++
++              /* Devices without an interrupt pin have nothing to route. */
++              if (!dev->pin)
++                      continue;
++
++              entry = acpi_pci_irq_lookup(dev, dev->pin);
++              if (!entry) {
++                      /*
++                       * IDE legacy mode controller IRQs are magic. Why do
++                       * compat extensions always make such a nasty mess.
++                       */
++                      if ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE &&
++                          (dev->class & 0x05) == 0)
++                              continue;
++              }
++
++              /*
++               * With an entry: allocate an IRQ from the link if there is
++               * one, else use the entry's index directly as the GSI.
++               * Without an entry the GSI is unknown (-1).
++               */
++              gsi = entry
++                    ? entry->link
++                      ? acpi_pci_link_allocate_irq(entry->link,
++                                                   entry->index,
++                                                   &triggering, &polarity,
++                                                   NULL)
++                      : entry->index
++                    : -1;
++
++              if (gsi >= 0) {
++                      /*
++                       * The request carries plain flags: triggering is 1
++                       * for level-sensitive, polarity is 1 for active low.
++                       */
++                      setup_gsi.gsi = gsi;
++                      setup_gsi.triggering
++                              = (triggering == ACPI_LEVEL_SENSITIVE);
++                      setup_gsi.polarity = (polarity == ACPI_ACTIVE_LOW);
++                      /* Hypercall failed: move on without logging. */
++                      if (HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi,
++                                                &setup_gsi) < 0)
++                              continue;
++
++                      dev_info(&dev->dev, "GSI%d: %s-%s\n", gsi,
++                               triggering == ACPI_LEVEL_SENSITIVE ? "level"
++                                                                  : "edge",
++                               polarity == ACPI_ACTIVE_LOW ? "low" : "high");
++              } else {
++                      /*
++                       * No IRQ known to the ACPI subsystem - maybe the
++                       * BIOS / driver reported one, then use it.
++                       */
++                      /* The warning line is completed by pr_cont() below. */
++                      dev_warn(&dev->dev, "PCI INT %c: no GSI",
++                               pin_name(dev->pin));
++                      /* Interrupt Line values above 0xF are forbidden */
++                      if (dev->irq > 0 && (dev->irq <= 0xF)) {
++                              pr_cont(" - using IRQ %d\n", dev->irq);
++                              /*
++                               * Same flag encoding as above, i.e.
++                               * level-sensitive and active low; the
++                               * hypercall result is discarded.
++                               */
++                              setup_gsi.gsi = dev->irq;
++                              setup_gsi.triggering = 1;
++                              setup_gsi.polarity = 1;
++                              VOID(HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi,
++                                                         &setup_gsi));
++                      } else
++                              pr_cont("\n");
++              }
++      }
++
++      return 0;
++}
++subsys_initcall(xen_setup_gsi);
++#endif
diff --cc drivers/acpi/pci_root.c

index d06078d,f911a2f..e6e3c0d
--- 1/drivers/acpi/pci_root.c
--- 2/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@@ -449,6 -449,6 +449,41 @@@ out
   }
   EXPORT_SYMBOL(acpi_pci_osc_control_set);
   
++#ifdef CONFIG_PCI_GUESTDEV
++#include <linux/sysfs.h>
++
++/*
++ * sysfs "seg" show handler: print the PCI segment of the ACPI PCI root
++ * whose device is @dev, zero-padded to four hex digits.  Returns the
++ * number of characters written, or 0 (an empty read) when @dev is not
++ * found on the acpi_pci_roots list.
++ */
++static ssize_t seg_show(struct device *dev,
++                      struct device_attribute *attr, char *buf)
++{
++      struct acpi_pci_root *root;
++
++      list_for_each_entry(root, &acpi_pci_roots, node) {
++              if (&root->device->dev != dev)
++                      continue;
++
++              return sprintf(buf, "%04x\n", root->segment);
++      }
++
++      return 0;
++}
++static DEVICE_ATTR(seg, 0444, seg_show, NULL);
++
++/*
++ * sysfs "bbn" show handler: print the first bus number of the secondary
++ * bus range of the ACPI PCI root whose device is @dev, zero-padded to two
++ * hex digits.  Returns the number of characters written, or 0 (an empty
++ * read) when @dev is not found on the acpi_pci_roots list.
++ */
++static ssize_t bbn_show(struct device *dev,
++                      struct device_attribute *attr, char *buf)
++{
++      struct acpi_pci_root *root;
++
++      list_for_each_entry(root, &acpi_pci_roots, node) {
++              unsigned int bus_start;
++
++              if (&root->device->dev != dev)
++                      continue;
++
++              bus_start = (unsigned int)root->secondary.start;
++              return sprintf(buf, "%02x\n", bus_start);
++      }
++
++      return 0;
++}
++static DEVICE_ATTR(bbn, 0444, bbn_show, NULL);
++#endif
++
   static int __devinit acpi_pci_root_add(struct acpi_device *device)
   {
         unsigned long long segment, bus;
@@@ -596,20 -596,14 +631,21 @@@
                         dev_info(root->bus->bridge,
                                 "ACPI _OSC control (0x%02x) granted\n", flags);
                 } else {
-                       dev_info(root->bus->bridge,
-                               "ACPI _OSC request failed (%s), "
-                               "returned control mask: 0x%02x\n",
-                               acpi_format_exception(status), flags);
-                       pr_info("ACPI _OSC control for PCIe not granted, "
-                               "disabling ASPM\n");
+                       dev_dbg(root->bus->bridge,
+                               "ACPI _OSC request failed (code %d)\n", status);
+                       printk(KERN_INFO "Unable to assume _OSC PCIe control. "
+                               "Disabling ASPM\n");
                         pcie_no_aspm();
                 }
-       } else {
-               dev_info(root->bus->bridge,
-                        "Unable to request _OSC control "
-                        "(_OSC support mask: 0x%02x)\n", flags);
         }
   
++#ifdef CONFIG_PCI_GUESTDEV
++      if (device_create_file(&device->dev, &dev_attr_seg))
++              dev_warn(&device->dev, "could not create seg attr\n");
++      if (device_create_file(&device->dev, &dev_attr_bbn))
++              dev_warn(&device->dev, "could not create bbn attr\n");
++#endif
++
         pci_acpi_add_bus_pm_notifier(device, root->bus);
         if (device->wakeup.flags.run_wake)
                 device_set_run_wake(root->bus->bridge, true);
@@@ -657,3 -651,3 +693,31 @@@ static int __init acpi_pci_root_init(vo
   }
   
   subsys_initcall(acpi_pci_root_init);
++
++#ifdef CONFIG_PCI_GUESTDEV
++/*
++ * Look up the ACPI PCI root identified by @hid and @uid and report its PCI
++ * segment and the start of its secondary bus range through @seg and @bbn.
++ *
++ * A root that has no unique ID matches only an empty @uid string.
++ * Returns TRUE when a matching root was found (outputs written), FALSE
++ * otherwise (outputs left untouched).
++ */
++int acpi_pci_get_root_seg_bbn(char *hid, char *uid, int *seg, int *bbn)
++{
++      struct acpi_pci_root *root;
++
++      list_for_each_entry(root, &acpi_pci_roots, node) {
++              const char *root_uid;
++
++              if (strcmp(acpi_device_hid(root->device), hid) != 0)
++                      continue;
++
++              root_uid = root->device->pnp.unique_id;
++              if (root_uid) {
++                      if (strcmp(root_uid, uid) != 0)
++                              continue;
++              } else if (uid[0] != '\0') {
++                      /* No unique ID on the root, but one was asked for. */
++                      continue;
++              }
++
++              *seg = (int)root->segment;
++              *bbn = (int)root->secondary.start;
++              return TRUE;
++      }
++
++      return FALSE;
++}
++#endif
diff --cc drivers/acpi/processor_core.c

index 02d2a4c,25bf17d..63d34ae
--- 1/drivers/acpi/processor_core.c
--- 2/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@@ -19,6 -19,6 +19,15 @@@
   #define _COMPONENT            ACPI_PROCESSOR_COMPONENT
   ACPI_MODULE_NAME("processor_core");
   
++#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
++/*
++ * Hook for external processor control logic (a VMM, for example): such
++ * logic registers its own set of ops here in order to receive
++ * ACPI-related processor notifications.  Exported so that it can be
++ * referenced from modules.
++ */
++const struct processor_extcntl_ops *processor_extcntl_ops;
++EXPORT_SYMBOL(processor_extcntl_ops);
++#endif
++
   static int __init set_no_mwait(const struct dmi_system_id *id)
   {
         printk(KERN_NOTICE PREFIX "%s detected - "
@@@ -164,27 -165,19 +174,36 @@@ exit
   
   int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
   {
- #ifdef CONFIG_SMP
--      int i;
- #endif
++      int i = 0;
         int apic_id = -1;
   
++      if (type < 0) {
++              if (!processor_cntl_external())
++                      return -1;
++              type = ~type;
++              i = 1;
++      }
++
         apic_id = map_mat_entry(handle, type, acpi_id);
         if (apic_id == -1)
                 apic_id = map_madt_entry(type, acpi_id);
--      if (apic_id == -1)
++      if (apic_id == -1 || i)
                 return apic_id;
   
- #ifdef CONFIG_SMP
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         for_each_possible_cpu(i) {
                 if (cpu_physical_id(i) == apic_id)
                         return i;
         }
+ +#else
-       /* In UP kernel, only processor 0 is valid */
-       if (apic_id == 0)
++      /*
++       * Use of cpu_physical_id() is bogus here. Rather than defining a
++       * stub enforcing a 1:1 mapping, we keep it undefined to catch bad
++       * uses. Return as if there was a 1:1 mapping.
++       */
++      if (apic_id < nr_cpu_ids && cpu_possible(apic_id))
+ +              return apic_id;
+ +#endif
         return -1;
   }
   EXPORT_SYMBOL_GPL(acpi_get_cpuid);
@@@ -221,9 -215,9 +241,11 @@@ static bool __init processor_physically
         }
   
         type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
++      if (processor_cntl_external())
++              type = ~type;
         cpuid = acpi_get_cpuid(handle, type, acpi_id);
   
-       if (cpuid == -1)
+       if ((cpuid == -1) && (num_possible_cpus() > 1))
                 return false;
   
         return true;
diff --cc drivers/acpi/processor_driver.c

index a4e0f1b,a4e0f1b..397bf8f
--- 1/drivers/acpi/processor_driver.c
--- 2/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@@ -82,7 -82,7 +82,7 @@@ MODULE_LICENSE("GPL")
   static int acpi_processor_add(struct acpi_device *device);
   static int acpi_processor_remove(struct acpi_device *device, int type);
   static void acpi_processor_notify(struct acpi_device *device, u32 event);
--static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu);
++static acpi_status acpi_processor_hotadd_init(struct acpi_processor *pr);
   static int acpi_processor_handle_eject(struct acpi_processor *pr);
   
   
@@@ -324,11 -324,11 +324,17 @@@ static int acpi_processor_get_info(stru
          *  they are physically not present.
          */
         if (pr->id == -1) {
--              if (ACPI_FAILURE
--                  (acpi_processor_hotadd_init(pr->handle, &pr->id))) {
++              if (ACPI_FAILURE(acpi_processor_hotadd_init(pr)) &&
++                  acpi_get_cpuid(pr->handle, ~device_declaration,
++                                 pr->acpi_id) < 0) {
                         return -ENODEV;
                 }
         }
++#if defined(CONFIG_SMP) && defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
++      if (pr->id >= setup_max_cpus && pr->id > 0)
++              pr->id = -1;
++#endif
++
         /*
          * On some boxes several processors use the same processor bus id.
          * But they are located in different scope. For example:
@@@ -338,7 -338,7 +344,14 @@@
          * generated as the following format:
          * CPU+CPU ID.
          */
--      sprintf(acpi_device_bid(device), "CPU%X", pr->id);
++      if (pr->id != -1)
++              sprintf(acpi_device_bid(device), "CPU%X", pr->id);
++      else
++              snprintf(acpi_device_bid(device),
++                       ARRAY_SIZE(acpi_device_bid(device)),
++                       "#%0*X",
++                       (int)ARRAY_SIZE(acpi_device_bid(device)) - 2,
++                       pr->acpi_id);
         ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Processor [%d:%d]\n", pr->id,
                           pr->acpi_id));
   
@@@ -370,13 -370,13 +383,17 @@@
          * of /proc/cpuinfo
          */
         status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
--      if (ACPI_SUCCESS(status))
++      if (ACPI_SUCCESS(status) && pr->id != -1)
                 arch_fix_phys_package_id(pr->id, object.integer.value);
   
         return 0;
   }
   
++#ifndef CONFIG_XEN
   static DEFINE_PER_CPU(void *, processor_device_array);
++#else
++static void *processor_device_array[NR_ACPI_CPUS];
++#endif
   
   static void acpi_processor_notify(struct acpi_device *device, u32 event)
   {
@@@ -463,50 -463,50 +480,82 @@@ static int __cpuinit acpi_processor_add
         device->driver_data = pr;
   
         result = acpi_processor_get_info(device);
--      if (result) {
++      if (result ||
++          ((pr->id == -1) && !processor_cntl_external())) {
                 /* Processor is physically not present */
                 return 0;
         }
   
   #ifdef CONFIG_SMP
--      if (pr->id >= setup_max_cpus && pr->id != 0)
--              return 0;
++      if (pr->id >= setup_max_cpus && pr->id != 0) {
++              if (!processor_cntl_external())
++                      return 0;
++              WARN_ON(pr->id != -1);
++      }
   #endif
   
--      BUG_ON((pr->id >= nr_cpu_ids) || (pr->id < 0));
++      BUG_ON(!processor_cntl_external() &&
++             ((pr->id >= nr_cpu_ids) || (pr->id < 0)));
   
         /*
          * Buggy BIOS check
          * ACPI id of processors can be reported wrongly by the BIOS.
          * Don't trust it blindly
          */
++#ifndef CONFIG_XEN
         if (per_cpu(processor_device_array, pr->id) != NULL &&
             per_cpu(processor_device_array, pr->id) != device) {
++#else
++      BUG_ON(pr->acpi_id >= NR_ACPI_CPUS);
++      if (processor_device_array[pr->acpi_id] != NULL &&
++          processor_device_array[pr->acpi_id] != device) {
++#endif
                 printk(KERN_WARNING "BIOS reported wrong ACPI id "
                         "for the processor\n");
                 result = -ENODEV;
                 goto err_free_cpumask;
         }
++#ifndef CONFIG_XEN
         per_cpu(processor_device_array, pr->id) = device;
   
         per_cpu(processors, pr->id) = pr;
++#else
++      processor_device_array[pr->acpi_id] = device;
++      if (pr->id != -1)
++              per_cpu(processors, pr->id) = pr;
++#endif
   
--      sysdev = get_cpu_sysdev(pr->id);
--      if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
--              result = -EFAULT;
--              goto err_free_cpumask;
++      if (pr->id != -1) {
++              sysdev = get_cpu_sysdev(pr->id);
++              if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
++                      result = -EFAULT;
++                      goto err_free_cpumask;
++              }
         }
   
--#ifdef CONFIG_CPU_FREQ
++#if defined(CONFIG_CPU_FREQ) || defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
         acpi_processor_ppc_has_changed(pr, 0);
   #endif
--      acpi_processor_get_throttling_info(pr);
--      acpi_processor_get_limit_info(pr);
   
++      /*
++       * pr->id may equal to -1 while processor_cntl_external enabled.
++       * throttle and thermal module don't support this case.
++       * Tx only works when dom0 vcpu == pcpu num by far, as we give
++       * control to dom0.
++       */
++      if (pr->id != -1) {
++              acpi_processor_get_throttling_info(pr);
++              acpi_processor_get_limit_info(pr);
++      }
   
--      if (cpuidle_get_driver() == &acpi_idle_driver)
++      if (cpuidle_get_driver() == &acpi_idle_driver
++          || processor_pm_external())
                 acpi_processor_power_init(pr, device);
   
++      result = processor_extcntl_prepare(pr);
++      if (result)
++              goto err_power_exit;
++
         pr->cdev = thermal_cooling_device_register("Processor", device,
                                                 &processor_cooling_ops);
         if (IS_ERR(pr->cdev)) {
@@@ -556,7 -556,7 +605,7 @@@ static int acpi_processor_remove(struc
   
         pr = acpi_driver_data(device);
   
--      if (pr->id >= nr_cpu_ids)
++      if (!processor_cntl_external() && pr->id >= nr_cpu_ids)
                 goto free;
   
         if (type == ACPI_BUS_REMOVAL_EJECT) {
@@@ -566,7 -566,7 +615,8 @@@
   
         acpi_processor_power_exit(pr, device);
   
--      sysfs_remove_link(&device->dev.kobj, "sysdev");
++      if (pr->id != -1)
++              sysfs_remove_link(&device->dev.kobj, "sysdev");
   
         if (pr->cdev) {
                 sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
@@@ -575,8 -575,8 +625,14 @@@
                 pr->cdev = NULL;
         }
   
++#ifndef CONFIG_XEN
         per_cpu(processors, pr->id) = NULL;
         per_cpu(processor_device_array, pr->id) = NULL;
++#else
++      if (pr->id != -1)
++              per_cpu(processors, pr->id) = NULL;
++      processor_device_array[pr->acpi_id] = NULL;
++#endif
   
   free:
         free_cpumask_var(pr->throttling.shared_cpu_map);
@@@ -632,6 -632,6 +688,10 @@@ int acpi_processor_device_add(acpi_hand
                 return -ENODEV;
         }
   
++      if (processor_cntl_external() && acpi_driver_data(*device))
++              processor_notify_external(acpi_driver_data(*device),
++                      PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD);
++
         return 0;
   }
   
@@@ -661,6 -661,6 +721,10 @@@ static void acpi_processor_hotplug_noti
                                             "Unable to add the device\n");
                         break;
                 }
++              pr = acpi_driver_data(device);
++              if (processor_cntl_external() && pr)
++                      processor_notify_external(pr,
++                                      PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD);
                 break;
         case ACPI_NOTIFY_EJECT_REQUEST:
                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
@@@ -677,6 -677,6 +741,9 @@@
                                     "Driver data is NULL, dropping EJECT\n");
                         return;
                 }
++              if (processor_cntl_external())
++                      processor_notify_external(pr, PROCESSOR_HOTPLUG,
++                                              HOTPLUG_TYPE_REMOVE);
                 break;
         default:
                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
@@@ -721,13 -721,13 +788,26 @@@ processor_walk_namespace_cb(acpi_handl
         return (AE_OK);
   }
   
--static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu)
++static acpi_status acpi_processor_hotadd_init(struct acpi_processor *pr)
   {
++      acpi_handle handle = pr->handle;
++      int *p_cpu = &pr->id;
++
++#ifdef CONFIG_XEN
++      if (xen_pcpu_index(pr->acpi_id, 1) != -1)
++              return AE_OK;
++#endif
   
         if (!is_processor_present(handle)) {
                 return AE_ERROR;
         }
   
++      if (processor_cntl_external()) {
++              processor_notify_external(pr, PROCESSOR_HOTPLUG,
++                                        HOTPLUG_TYPE_ADD);
++              return AE_OK;
++      }
++
         if (acpi_map_lsapic(handle, p_cpu))
                 return AE_ERROR;
   
@@@ -741,6 -741,6 +821,12 @@@
   
   static int acpi_processor_handle_eject(struct acpi_processor *pr)
   {
++      if (processor_cntl_external()) {
++              processor_notify_external(pr, PROCESSOR_HOTPLUG,
++                                        HOTPLUG_TYPE_REMOVE);
++              return (0);
++      }
++
         if (cpu_online(pr->id))
                 cpu_down(pr->id);
   
@@@ -749,7 -749,7 +835,7 @@@
         return (0);
   }
   #else
--static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu)
++static acpi_status acpi_processor_hotadd_init(struct acpi_processor *pr)
   {
         return AE_ERROR;
   }
@@@ -800,6 -800,6 +886,7 @@@ static int __init acpi_processor_init(v
   
         memset(&errata, 0, sizeof(errata));
   
++#ifdef CONFIG_CPU_IDLE
         if (!cpuidle_register_driver(&acpi_idle_driver)) {
                 printk(KERN_DEBUG "ACPI: %s registered with cpuidle\n",
                         acpi_idle_driver.name);
@@@ -807,6 -807,6 +894,7 @@@
                 printk(KERN_DEBUG "ACPI: acpi_idle yielding to %s\n",
                         cpuidle_get_driver()->name);
         }
++#endif
   
         result = acpi_bus_register_driver(&acpi_processor_driver);
         if (result < 0)
diff --cc drivers/acpi/processor_extcntl.c

index 0000000,0000000..e71db44

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/acpi/processor_extcntl.c
@@@ -1,0 -1,0 +1,214 @@@
++/*
++ * processor_extcntl.c - channel to external control logic
++ *
++ *  Copyright (C) 2008, Intel corporation
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License as published by
++ *  the Free Software Foundation; either version 2 of the License, or (at
++ *  your option) any later version.
++ *
++ *  This program is distributed in the hope that it will be useful, but
++ *  WITHOUT ANY WARRANTY; without even the implied warranty of
++ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ *  General Public License for more details.
++ *
++ *  You should have received a copy of the GNU General Public License along
++ *  with this program; if not, write to the Free Software Foundation, Inc.,
++ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
++ *
++ */
++
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/acpi.h>
++#include <linux/pm.h>
++#include <linux/cpu.h>
++
++#include <acpi/processor.h>
++
++#define ACPI_PROCESSOR_CLASS            "processor"
++#define _COMPONENT              ACPI_PROCESSOR_COMPONENT
++ACPI_MODULE_NAME("processor_extcntl")
++
++static int processor_extcntl_parse_csd(struct acpi_processor *pr);
++static int processor_extcntl_get_performance(struct acpi_processor *pr);
++
++/*
++ * processor_notify_smm - notify the BIOS that external control logic exists
++ *
++ * Writes the FADT pstate_control value to the FADT smi_command port.  The
++ * write is performed at most once successfully: after the first success
++ * later calls return immediately, because a second notification may lead
++ * to unexpected results.
++ *
++ * Returns 0 on success, when the notification was already delivered, or
++ * when the FADT provides no SMI command port / pstate_control value.
++ * Returns -EIO when the port write fails (an acpi_status is not an errno
++ * and must not leak out of an int-returning function).
++ */
++static int processor_notify_smm(void)
++{
++      acpi_status status;
++      static int is_done;
++
++      /* only need successfully notify BIOS once */
++      if (is_done)
++              return 0;
++
++      /* Can't write pstate_cnt to smi_cmd if either value is zero */
++      if (!acpi_gbl_FADT.smi_command || !acpi_gbl_FADT.pstate_control) {
++              ACPI_DEBUG_PRINT((ACPI_DB_INFO,"No SMI port or pstate_cnt\n"));
++              return 0;
++      }
++
++      ACPI_DEBUG_PRINT((ACPI_DB_INFO,
++              "Writing pstate_cnt [0x%x] to smi_cmd [0x%x]\n",
++              acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
++
++      status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
++                                  acpi_gbl_FADT.pstate_control, 8);
++      if (ACPI_FAILURE(status))
++              return -EIO;
++
++      /* Remember the success so the BIOS is never notified twice. */
++      is_done = 1;
++
++      return 0;
++}
++
++/*
++ * processor_notify_external - forward a processor event to the external
++ * control logic registered in processor_extcntl_ops.
++ * @pr:    ACPI processor the event refers to
++ * @event: PROCESSOR_PM_INIT or PROCESSOR_PM_CHANGE; PROCESSOR_HOTPLUG is
++ *         handled only when CONFIG_ACPI_HOTPLUG_CPU is enabled
++ * @type:  index into pm_ops[] for the PM events, hotplug type for
++ *         PROCESSOR_HOTPLUG
++ *
++ * Returns the result of the invoked handler, or -EINVAL when external
++ * control is not active, the event is not supported, @type is outside the
++ * pm_ops[] table, or no handler is installed for the event.
++ */
++int processor_notify_external(struct acpi_processor *pr, int event, int type)
++{
++      int ret = -EINVAL;
++
++      if (!processor_cntl_external())
++              return -EINVAL;
++
++      switch (event) {
++      case PROCESSOR_PM_INIT:
++      case PROCESSOR_PM_CHANGE:
++              /*
++               * type is a signed index into pm_ops[]: reject negative
++               * values as well as values past the end of the table.
++               */
++              if ((type < 0) || (type >= PM_TYPE_MAX) ||
++                      !processor_extcntl_ops->pm_ops[type])
++                      break;
++
++              ret = processor_extcntl_ops->pm_ops[type](pr, event);
++              break;
++#ifdef CONFIG_ACPI_HOTPLUG_CPU
++      case PROCESSOR_HOTPLUG:
++              if (processor_extcntl_ops->hotplug)
++                      ret = processor_extcntl_ops->hotplug(pr, type);
++              /* Called whether or not a hotplug handler is installed. */
++              xen_pcpu_hotplug(type);
++              break;
++#endif
++      default:
++              pr_err("Unsupported processor event %d.\n", event);
++              break;
++      }
++
++      return ret;
++}
++
++/*
++ * Called from ACPI processor setup to do the housekeeping required by the
++ * external control model: C-state dependency data is prepared when power
++ * management is handled externally, and raw performance-state data is
++ * collected when performance control is handled externally.  That
++ * information may otherwise be unavailable once it is split from dom0
++ * control logic such as the cpufreq driver.
++ *
++ * The results of the individual steps are not propagated; the function
++ * always returns 0.
++ */
++int processor_extcntl_prepare(struct acpi_processor *pr)
++{
++      if (processor_pm_external()) {
++              /* C-state dependency information */
++              (void)processor_extcntl_parse_csd(pr);
++      }
++
++      if (processor_pmperf_external()) {
++              /* Performance state initialization */
++              (void)processor_extcntl_get_performance(pr);
++      }
++
++      return 0;
++}
++
++/*
++ * Placeholder for _CSD (C-state dependency) parsing.
++ *
++ * No _CSD is implemented at present, which is why the existing ACPI code
++ * does not parse it at all.  To keep the interface towards the external
++ * control logic complete, every valid C-state is simply marked as having
++ * no dependency.
++ *
++ * Every slot of pr->power.states[] is visited rather than stopping at
++ * pr->power.count: C-state entries are stored from index 1 upward, so
++ * treating the count as an exclusive bound can skip the last populated
++ * slot.  The valid flag filters out unused slots.
++ *
++ * Always returns 0.
++ */
++static int processor_extcntl_parse_csd(struct acpi_processor *pr)
++{
++      int i;
++
++      for (i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++) {
++              struct acpi_processor_cx *cx = &pr->power.states[i];
++
++              if (!cx->valid)
++                      continue;
++
++              /* No dependency by default */
++              cx->domain_info = NULL;
++              cx->csd_count = 0;
++      }
++
++      return 0;
++}
++
++/*
++ * Collect raw performance (P-state) information for @pr and hand it to
++ * the external control logic.
++ *
++ * The existing ACPI code parses performance states only once the
++ * acpi-cpufreq driver is loaded, which is exactly what should be avoided
++ * so as not to conflict with external control logic.  The data is
++ * therefore gathered here, when the ACPI processor object is found and
++ * started.
++ *
++ * Returns 0 on success, -EBUSY if @pr already has performance data,
++ * -ENOMEM on allocation failure, the error from
++ * acpi_processor_get_performance_info(), or -EINVAL if the coordination
++ * data fails the sanity check.  On any failure pr->performance is left
++ * NULL.
++ */
++static int processor_extcntl_get_performance(struct acpi_processor *pr)
++{
++      int ret;
++      struct acpi_processor_performance *perf;
++      struct acpi_psd_package *pdomain;
++
++      if (pr->performance)
++              return -EBUSY;
++
++      perf = kzalloc(sizeof(*perf), GFP_KERNEL);
++      if (!perf)
++              return -ENOMEM;
++
++      pr->performance = perf;
++      /* Get basic performance state information */
++      ret = acpi_processor_get_performance_info(pr);
++      if (ret < 0)
++              goto err_out;
++
++      /*
++       * Well, here we need retrieve performance dependency information
++       * from _PSD object. The reason why existing interface is not used
++       * is due to the reason that existing interface sticks to Linux cpu
++       * id to construct some bitmap, however we want to split ACPI
++       * processor objects from Linux cpu id logic. For example, even
++       * when Linux is configured as UP, we still want to parse all ACPI
++       * processor objects to external logic. In this case, it's preferred
++       * to use ACPI ID instead.
++       */
++      pdomain = &pr->performance->domain_info;
++      pdomain->num_processors = 0;
++      ret = acpi_processor_get_psd(pr);
++      if (ret < 0) {
++              /*
++               * _PSD is optional - assume no coordination if absent (or
++               * broken), matching native kernels' behavior.
++               */
++              pdomain->num_entries = ACPI_PSD_REV0_ENTRIES;
++              pdomain->revision = ACPI_PSD_REV0_REVISION;
++              pdomain->domain = pr->acpi_id;
++              pdomain->coord_type = DOMAIN_COORD_TYPE_SW_ALL;
++              pdomain->num_processors = 1;
++      }
++
++      /* Some sanity check */
++      if ((pdomain->revision != ACPI_PSD_REV0_REVISION) ||
++          (pdomain->num_entries != ACPI_PSD_REV0_ENTRIES) ||
++          ((pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ALL) &&
++           (pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ANY) &&
++           (pdomain->coord_type != DOMAIN_COORD_TYPE_HW_ALL))) {
++              ret = -EINVAL;
++              goto err_free_states;
++      }
++
++      /* Last step is to notify BIOS that external logic exists */
++      processor_notify_smm();
++
++      processor_notify_external(pr, PROCESSOR_PM_INIT, PM_TYPE_PERF);
++
++      return 0;
++
++err_free_states:
++      /*
++       * Reached only after acpi_processor_get_performance_info() has
++       * succeeded, so the state table it is expected to have attached
++       * to perf must be released together with perf.
++       * NOTE(review): confirm against processor_perflib.c that
++       * perf->states is the only allocation it leaves behind; kfree(NULL)
++       * is harmless should none exist.
++       */
++      kfree(perf->states);
++err_out:
++      pr->performance = NULL;
++      kfree(perf);
++      return ret;
++}
diff --cc drivers/acpi/processor_idle.c

index 431ab11,d615b7d..90303e7
--- 1/drivers/acpi/processor_idle.c
--- 2/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@@ -125,6 -125,6 +125,7 @@@ static struct dmi_system_id __cpuinitda
   };
   
   
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
   /*
    * Callers should disable interrupts before the call and enable
    * interrupts after return.
@@@ -143,6 -143,6 +144,7 @@@ static void acpi_safe_halt(void
         }
         current_thread_info()->status |= TS_POLLING;
   }
++#endif
   
   #ifdef ARCH_APICTIMER_STOPS_ON_C3
   
@@@ -213,7 -213,7 +215,7 @@@ static void lapic_timer_state_broadcast
   static void lapic_timer_check_state(int state, struct acpi_processor *pr,
                                    struct acpi_processor_cx *cstate) { }
   static void lapic_timer_propagate_broadcast(struct acpi_processor *pr) { }
--static void lapic_timer_state_broadcast(struct acpi_processor *pr,
++static inline void lapic_timer_state_broadcast(struct acpi_processor *pr,
                                        struct acpi_processor_cx *cx,
                                        int broadcast)
   {
@@@ -261,7 -261,7 +263,7 @@@ int acpi_processor_resume(struct acpi_d
         return 0;
   }
   
--#if defined(CONFIG_X86)
++#if defined(CONFIG_X86) && !defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
   static void tsc_check_state(int state)
   {
         switch (boot_cpu_data.x86_vendor) {
@@@ -458,7 -458,7 +460,8 @@@ static int acpi_processor_get_power_inf
                                  */
                                 cx.entry_method = ACPI_CSTATE_HALT;
                                 snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT");
--                      } else {
++                      /* This doesn't apply to external control case */
++                      } else if (!processor_pm_external()) {
                                 continue;
                         }
                         if (cx.type == ACPI_STATE_C1 &&
@@@ -497,6 -497,6 +500,12 @@@
   
                 cx.power = obj->integer.value;
   
++#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
++              /* cache control methods to notify external logic */
++              if (processor_pm_external())
++                      memcpy(&cx.reg, reg, sizeof(*reg));
++#endif
++
                 current_count++;
                 memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
   
@@@ -614,7 -614,7 +623,9 @@@ static int acpi_processor_power_verify(
         unsigned int i;
         unsigned int working = 0;
   
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         pr->power.timer_broadcast_on_state = INT_MAX;
++#endif
   
         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) {
                 struct acpi_processor_cx *cx = &pr->power.states[i];
@@@ -686,6 -686,6 +697,7 @@@ static int acpi_processor_get_power_inf
         return 0;
   }
   
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
   /**
    * acpi_idle_bm_check - checks if bus master activity was detected
    */
@@@ -1056,6 -1056,6 +1068,7 @@@ static int acpi_processor_setup_cpuidle
   
         return 0;
   }
++#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */
   
   int acpi_processor_cst_has_changed(struct acpi_processor *pr)
   {
@@@ -1074,13 -1074,13 +1087,23 @@@
         if (!pr->flags.power_setup_done)
                 return -ENODEV;
   
++      if (processor_pm_external()) {
++              pr->flags.power = 0;
++              ret = acpi_processor_get_power_info(pr);
++              processor_notify_external(pr,
++                      PROCESSOR_PM_CHANGE, PM_TYPE_IDLE);
++              return ret;
++      }
++
         cpuidle_pause_and_lock();
         cpuidle_disable_device(&pr->power.dev);
         acpi_processor_get_power_info(pr);
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         if (pr->flags.power) {
                 acpi_processor_setup_cpuidle(pr);
                 ret = cpuidle_enable_device(&pr->power.dev);
         }
++#endif
         cpuidle_resume_and_unlock();
   
         return ret;
@@@ -1120,6 -1120,6 +1143,7 @@@ int __cpuinit acpi_processor_power_init
         acpi_processor_get_power_info(pr);
         pr->flags.power_setup_done = 1;
   
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         /*
          * Install the idle handler if processor power management is supported.
          * Note that we use previously set idle handler will be used on
@@@ -1130,6 -1130,6 +1154,12 @@@
                 if (cpuidle_register_device(&pr->power.dev))
                         return -EIO;
         }
++#endif
++
++      if (processor_pm_external())
++              processor_notify_external(pr,
++                      PROCESSOR_PM_INIT, PM_TYPE_IDLE);
++
         return 0;
   }
   
diff --cc drivers/acpi/processor_perflib.c

index 85b3237,3a73a93..b3a2673
--- 1/drivers/acpi/processor_perflib.c
--- 2/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@@ -75,6 -79,6 +79,7 @@@ MODULE_PARM_DESC(ignore_ppc, "If the fr
   
   static int acpi_processor_ppc_status;
   
++#ifdef CONFIG_CPU_FREQ
   static int acpi_processor_ppc_notifier(struct notifier_block *nb,
                                        unsigned long event, void *data)
   {
@@@ -117,6 -121,6 +122,7 @@@
   static struct notifier_block acpi_ppc_notifier_block = {
         .notifier_call = acpi_processor_ppc_notifier,
   };
++#endif        /* CONFIG_CPU_FREQ */
   
   static int acpi_processor_get_platform_limit(struct acpi_processor *pr)
   {
@@@ -181,6 -185,6 +187,12 @@@ int acpi_processor_ppc_has_changed(stru
   {
         int ret;
   
++#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
++      /* Xen hypervisor can handle cpufreq _PPC event */
++      if (ignore_ppc < 0 && processor_pmperf_external())
++              ignore_ppc = 0;
++#endif
++
         if (ignore_ppc) {
                 /*
                  * Only when it is notification event, the _OST object
@@@ -205,7 -209,7 +217,12 @@@
         if (ret < 0)
                 return (ret);
         else
++#ifdef CONFIG_CPU_FREQ
                 return cpufreq_update_policy(pr->id);
++#elif defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
++              return processor_notify_external(pr,
++                              PROCESSOR_PM_CHANGE, PM_TYPE_PERF);
++#endif
   }
   
   int acpi_processor_get_bios_limit(int cpu, unsigned int *limit)
@@@ -221,6 -225,6 +238,7 @@@
   }
   EXPORT_SYMBOL(acpi_processor_get_bios_limit);
   
++#ifdef CONFIG_CPU_FREQ
   void acpi_processor_ppc_init(void)
   {
         if (!cpufreq_register_notifier
@@@ -239,6 -243,6 +257,7 @@@ void acpi_processor_ppc_exit(void
   
         acpi_processor_ppc_status &= ~PPC_REGISTERED;
   }
++#endif        /* CONFIG_CPU_FREQ */
   
   static int acpi_processor_get_performance_control(struct acpi_processor *pr)
   {
@@@ -386,7 -390,7 +405,10 @@@ static int acpi_processor_get_performan
         return result;
   }
   
--static int acpi_processor_get_performance_info(struct acpi_processor *pr)
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
++static
++#endif
++int acpi_processor_get_performance_info(struct acpi_processor *pr)
   {
         int result = 0;
         acpi_status status = AE_OK;
@@@ -431,6 -435,6 +453,7 @@@
         return result;
   }
   
++#ifdef CONFIG_CPU_FREQ
   int acpi_processor_notify_smm(struct module *calling_module)
   {
         acpi_status status;
@@@ -491,8 -495,8 +514,12 @@@
   }
   
   EXPORT_SYMBOL(acpi_processor_notify_smm);
++#endif        /* CONFIG_CPU_FREQ */
   
--static int acpi_processor_get_psd(struct acpi_processor       *pr)
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
++static
++#endif
++int acpi_processor_get_psd(struct acpi_processor *pr)
   {
         int result = 0;
         acpi_status status = AE_OK;
@@@ -557,6 -561,6 +584,8 @@@ end
         return result;
   }
   
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
++
   int acpi_processor_preregister_performance(
                 struct acpi_processor_performance __percpu *performance)
   {
@@@ -772,3 -776,3 +801,5 @@@ acpi_processor_unregister_performance(s
   }
   
   EXPORT_SYMBOL(acpi_processor_unregister_performance);
++
++#endif /* !CONFIG_PROCESSOR_EXTERNAL_CONTROL */
diff --cc drivers/acpi/scan.c

index 449c556,449c556..653c6d2
--- 1/drivers/acpi/scan.c
--- 2/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@@ -175,6 -175,6 +175,16 @@@ acpi_device_hid_show(struct device *dev
   }
   static DEVICE_ATTR(hid, 0444, acpi_device_hid_show, NULL);
   
++#ifdef CONFIG_PCI_GUESTDEV
++/* sysfs "uid" show handler (mode 0444): prints pnp.unique_id plus a newline. */
++static ssize_t
++acpi_device_uid_show(struct device *dev, struct device_attribute *attr, char *buf) {
++      /* sprintf()'s byte count doubles as the show() return value */
++      return sprintf(buf, "%s\n", to_acpi_device(dev)->pnp.unique_id);
++}
++static DEVICE_ATTR(uid, 0444, acpi_device_uid_show, NULL);
++#endif
++
   static ssize_t
   acpi_device_path_show(struct device *dev, struct device_attribute *attr, char *buf) {
         struct acpi_device *acpi_dev = to_acpi_device(dev);
@@@ -217,6 -217,6 +227,13 @@@ static int acpi_device_setup_files(stru
                         goto end;
         }
   
++#ifdef CONFIG_PCI_GUESTDEV
++      if(dev->pnp.unique_id) {
++              result = device_create_file(&dev->dev, &dev_attr_uid);
++              if(result)
++                      goto end;
++      }
++#endif
           /*
            * If device has _EJ0, 'eject' file is created that is used to trigger
            * hot-removal function from userland.
@@@ -280,6 -280,6 +297,9 @@@ static void acpi_free_ids(struct acpi_d
                 kfree(id->id);
                 kfree(id);
         }
++#ifdef CONFIG_PCI_GUESTDEV
++      kfree(device->pnp.unique_id);
++#endif
   }
   
   static void acpi_device_release(struct device *dev)
@@@ -1134,6 -1134,6 +1154,11 @@@ static void acpi_device_set_id(struct a
                         for (i = 0; i < cid_list->count; i++)
                                 acpi_add_id(device, cid_list->ids[i].string);
                 }
++#ifdef CONFIG_PCI_GUESTDEV
++              if (info->valid & ACPI_VALID_UID)
++                      device->pnp.unique_id = kstrdup(info->unique_id.string,
++                                                      GFP_KERNEL);
++#endif
                 if (info->valid & ACPI_VALID_ADR) {
                         device->pnp.bus_address = info->address;
                         device->flags.bus_address = 1;
diff --cc drivers/acpi/sleep.c

index 6c94960,6c94960..752d8ee
--- 1/drivers/acpi/sleep.c
--- 2/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@@ -61,6 -61,6 +61,7 @@@ static struct notifier_block tts_notifi
   static int acpi_sleep_prepare(u32 acpi_state)
   {
   #ifdef CONFIG_ACPI_SLEEP
++#ifndef CONFIG_ACPI_PV_SLEEP
         /* do we have a wakeup address for S2 and S3? */
         if (acpi_state == ACPI_STATE_S3) {
                 if (!acpi_wakeup_address) {
@@@ -70,6 -70,6 +71,7 @@@
                                 (acpi_physical_address)acpi_wakeup_address);
   
         }
++#endif
         ACPI_FLUSH_CPU_CACHE();
   #endif
         printk(KERN_INFO PREFIX "Preparing to enter system sleep state S%d\n",
diff --cc drivers/base/cpu.c

index 251acea,251acea..24d71fd
--- 1/drivers/base/cpu.c
--- 2/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@@ -106,7 -106,7 +106,7 @@@ static inline void register_cpu_control
   }
   #endif /* CONFIG_HOTPLUG_CPU */
   
--#ifdef CONFIG_KEXEC
++#if defined(CONFIG_KEXEC) && !defined(CONFIG_XEN)
   #include <linux/kexec.h>
   
   static ssize_t show_crash_notes(struct sys_device *dev, struct sysdev_attribute *attr,
@@@ -231,7 -231,7 +231,7 @@@ int __cpuinit register_cpu(struct cpu *
         if (!error)
                 register_cpu_under_node(num, cpu_to_node(num));
   
--#ifdef CONFIG_KEXEC
++#if defined(CONFIG_KEXEC) && !defined(CONFIG_XEN)
         if (!error)
                 error = sysdev_create_file(&cpu->sysdev, &attr_crash_notes);
   #endif
diff --cc drivers/block/Kconfig

index 717d6e4,83c32cb..d10ea54
--- 1/drivers/block/Kconfig
--- 2/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@@ -460,9 -460,9 +460,9 @@@ config XILINX_SYSAC
         help
           Include support for the Xilinx SystemACE CompactFlash interface
   
--config XEN_BLKDEV_FRONTEND
++config PARAVIRT_XEN_BLKDEV_FRONTEND
         tristate "Xen virtual block device support"
--      depends on XEN
++      depends on PARAVIRT_XEN
         default y
         select XEN_XENBUS_FRONTEND
         help
diff --cc drivers/block/Makefile

index 76646e9,40528ba..4f69878
--- 1/drivers/block/Makefile
--- 2/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@@ -35,8 -35,7 +35,7 @@@ obj-$(CONFIG_BLK_DEV_SX8)     += sx8.
   obj-$(CONFIG_BLK_DEV_UB)      += ub.o
   obj-$(CONFIG_BLK_DEV_HD)      += hd.o
   
--obj-$(CONFIG_XEN_BLKDEV_FRONTEND)     += xen-blkfront.o
- obj-$(CONFIG_XEN_BLKDEV_BACKEND)      += xen-blkback/
++obj-$(CONFIG_PARAVIRT_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
   obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
   obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
   
diff --cc drivers/block/floppy.c

index 98de8f4,db8f885..137b608
--- 1/drivers/block/floppy.c
--- 2/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@@ -146,7 -146,7 +146,9 @@@
   
   #undef  FLOPPY_SILENT_DCL_CLEAR
   
++#ifndef CONFIG_XEN
   #define REALLY_SLOW_IO
++#endif
   
   #define DEBUGT 2
   
diff --cc drivers/block/xen-blkfront.c

index b536a9c,9cb8668..0ef48ac
--- 1/drivers/block/xen-blkfront.c
--- 2/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@@ -1374,7 -1351,7 +1351,6 @@@ static const struct xenbus_device_id bl
   
   static struct xenbus_driver blkfront = {
         .name = "vbd",
--      .owner = THIS_MODULE,
         .ids = blkfront_ids,
         .probe = blkfront_probe,
         .remove = blkfront_remove,
diff --cc drivers/cdrom/Makefile

index ecf85fd,ecf85fd..99757fb
--- 1/drivers/cdrom/Makefile
--- 2/drivers/cdrom/Makefile
+++ b/drivers/cdrom/Makefile
@@@ -9,6 -9,6 +9,7 @@@ obj-$(CONFIG_BLK_DEV_IDECD)      +
   obj-$(CONFIG_BLK_DEV_SR)      +=              cdrom.o
   obj-$(CONFIG_PARIDE_PCD)      +=              cdrom.o
   obj-$(CONFIG_CDROM_PKTCDVD)   +=              cdrom.o
++obj-$(CONFIG_XEN_BLKDEV_FRONTEND)     +=              cdrom.o
   
   obj-$(CONFIG_VIOCD)           += viocd.o      cdrom.o
   obj-$(CONFIG_GDROM)           += gdrom.o      cdrom.o
diff --cc drivers/char/Kconfig

index fad25a7,ad59b4e..947f22a
--- 1/drivers/char/Kconfig
--- 2/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@@ -534,7 -534,7 +534,7 @@@ config MAX_RAW_DEV
   config HPET
         bool "HPET - High Precision Event Timer" if (X86 || IA64)
         default n
--      depends on ACPI
++      depends on ACPI && !XEN
         help
           If you say Y here, you will have a miscdevice named "/dev/hpet/".  Each
           open selects one of the timers supported by the HPET.  The timers are
diff --cc drivers/char/agp/agp.h

index 923f99d,923f99d..e3491f5
--- 1/drivers/char/agp/agp.h
--- 2/drivers/char/agp/agp.h
+++ b/drivers/char/agp/agp.h
@@@ -31,6 -31,6 +31,10 @@@
   
   #include <asm/agp.h>  /* for flush_agp_cache() */
   
++#ifndef virt_to_gart
++#define virt_to_gart virt_to_phys
++#endif
++
   #define PFX "agpgart: "
   
   //#define AGP_DEBUG 1
diff --cc drivers/char/agp/amd-k7-agp.c

index f7e8878,f7e8878..7e630bd
--- 1/drivers/char/agp/amd-k7-agp.c
--- 2/drivers/char/agp/amd-k7-agp.c
+++ b/drivers/char/agp/amd-k7-agp.c
@@@ -142,7 -142,7 +142,7 @@@ static int amd_create_gatt_table(struc
   
         agp_bridge->gatt_table_real = (u32 *)page_dir.real;
         agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped;
--      agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real);
++      agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real);
   
         /* Get the address for the gart region.
          * This is a bus address even on the alpha, b/c its
@@@ -155,7 -155,7 +155,7 @@@
   
         /* Calculate the agp offset */
         for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) {
--              writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1,
++              writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1,
                         page_dir.remapped+GET_PAGE_DIR_OFF(addr));
                 readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr));        /* PCI Posting. */
         }
diff --cc drivers/char/agp/amd64-agp.c

index 780498d,780498d..79900cd
--- 1/drivers/char/agp/amd64-agp.c
--- 2/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@@ -178,7 -178,7 +178,7 @@@ static const struct aper_size_info_32 a
   
   static int amd_8151_configure(void)
   {
--      unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
++      unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real);
         int i;
   
         if (!amd_nb_has_feature(AMD_NB_GART))
@@@ -583,7 -583,7 +583,7 @@@ static void __devexit agp_amd64_remove(
   {
         struct agp_bridge_data *bridge = pci_get_drvdata(pdev);
   
--      release_mem_region(virt_to_phys(bridge->gatt_table_real),
++      release_mem_region(virt_to_gart(bridge->gatt_table_real),
                            amd64_aperture_sizes[bridge->aperture_size_idx].size);
         agp_remove_bridge(bridge);
         agp_put_bridge(bridge);
diff --cc drivers/char/agp/ati-agp.c

index dc30e22,dc30e22..5c16d72
--- 1/drivers/char/agp/ati-agp.c
--- 2/drivers/char/agp/ati-agp.c
+++ b/drivers/char/agp/ati-agp.c
@@@ -361,7 -361,7 +361,7 @@@ static int ati_create_gatt_table(struc
   
         agp_bridge->gatt_table_real = (u32 *)page_dir.real;
         agp_bridge->gatt_table = (u32 __iomem *) page_dir.remapped;
--      agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real);
++      agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real);
   
         /* Write out the size register */
         current_size = A_SIZE_LVL2(agp_bridge->current_size);
@@@ -391,7 -391,7 +391,7 @@@
   
         /* Calculate the agp offset */
         for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) {
--              writel(virt_to_phys(ati_generic_private.gatt_pages[i]->real) | 1,
++              writel(virt_to_gart(ati_generic_private.gatt_pages[i]->real) | 1,
                         page_dir.remapped+GET_PAGE_DIR_OFF(addr));
                 readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr));        /* PCI Posting. */
         }
diff --cc drivers/char/agp/efficeon-agp.c

index d607f53,d607f53..cb4fceb
--- 1/drivers/char/agp/efficeon-agp.c
--- 2/drivers/char/agp/efficeon-agp.c
+++ b/drivers/char/agp/efficeon-agp.c
@@@ -227,7 -227,7 +227,7 @@@ static int efficeon_create_gatt_table(s
   
                 efficeon_private.l1_table[index] = page;
   
--              value = virt_to_phys((unsigned long *)page) | pati | present | index;
++              value = virt_to_gart((unsigned long *)page) | pati | present | index;
   
                 pci_write_config_dword(agp_bridge->dev,
                         EFFICEON_ATTPAGE, value);
diff --cc drivers/char/agp/generic.c

index b072648,b072648..aa4ad95
--- 1/drivers/char/agp/generic.c
--- 2/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@@ -960,7 -960,7 +960,7 @@@ int agp_generic_create_gatt_table(struc
   
         bridge->gatt_table = (void *)table;
   #else
--      bridge->gatt_table = ioremap_nocache(virt_to_phys(table),
++      bridge->gatt_table = ioremap_nocache(virt_to_gart(table),
                                         (PAGE_SIZE * (1 << page_order)));
         bridge->driver->cache_flush();
   #endif
@@@ -973,7 -973,7 +973,7 @@@
   
                 return -ENOMEM;
         }
--      bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real);
++      bridge->gatt_bus_addr = virt_to_gart(bridge->gatt_table_real);
   
         /* AK: bogus, should encode addresses > 4GB */
         for (i = 0; i < num_entries; i++) {
@@@ -1228,7 -1228,7 +1228,7 @@@ int agp_generic_alloc_pages(struct agp_
         }
   
   #ifdef CONFIG_X86
--      set_pages_array_uc(mem->pages, num_pages);
++      map_pages_into_agp(mem->pages, num_pages);
   #endif
         ret = 0;
   out:
@@@ -1261,7 -1261,7 +1261,7 @@@ void agp_generic_destroy_pages(struct a
                 return;
   
   #ifdef CONFIG_X86
--      set_pages_array_wb(mem->pages, mem->page_count);
++      unmap_pages_from_agp(mem->pages, mem->page_count);
   #endif
   
         for (i = 0; i < mem->page_count; i++) {
diff --cc drivers/char/agp/intel-gtt.c

index 8515101,0d09b53..bcb8506
--- 1/drivers/char/agp/intel-gtt.c
--- 2/drivers/char/agp/intel-gtt.c
+++ b/drivers/char/agp/intel-gtt.c
@@@ -147,8 -147,8 +147,19 @@@ static struct page *i8xx_alloc_pages(vo
         if (page == NULL)
                 return NULL;
   
++#ifdef CONFIG_XEN
++      if (xen_create_contiguous_region((unsigned long)page_address(page), 2, 32)) {
++              __free_pages(page, 2);
++              return NULL;
++      }
++#endif
++
         if (set_pages_uc(page, 4) < 0) {
                 set_pages_wb(page, 4);
++#ifdef CONFIG_XEN
++              xen_destroy_contiguous_region((unsigned long)page_address(page),
++                                            2);
++#endif
                 __free_pages(page, 2);
                 return NULL;
         }
@@@ -163,6 -163,6 +174,9 @@@ static void i8xx_destroy_pages(struct p
                 return;
   
         set_pages_wb(page, 4);
++#ifdef CONFIG_XEN
++      xen_destroy_contiguous_region((unsigned long)page_address(page), 2);
++#endif
         put_page(page);
         __free_pages(page, 2);
         atomic_dec(&agp_bridge->current_memory_agp);
@@@ -268,7 -268,7 +282,11 @@@ static struct agp_memory *alloc_agpphys
         new->page_count = pg_count;
         new->num_scratch_pages = pg_count;
         new->type = AGP_PHYS_MEMORY;
++#ifndef CONFIG_XEN
         new->physical = page_to_phys(new->pages[0]);
++#else
++      new->physical = page_to_pseudophys(new->pages[0]);
++#endif
         return new;
   }
   
diff --cc drivers/char/agp/sworks-agp.c

index f02f9b0,f02f9b0..ff98d27
--- 1/drivers/char/agp/sworks-agp.c
--- 2/drivers/char/agp/sworks-agp.c
+++ b/drivers/char/agp/sworks-agp.c
@@@ -155,7 -155,7 +155,7 @@@ static int serverworks_create_gatt_tabl
         /* Create a fake scratch directory */
         for (i = 0; i < 1024; i++) {
                 writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i);
--              writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i);
++              writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i);
         }
   
         retval = serverworks_create_gatt_pages(value->num_entries / 1024);
@@@ -167,7 -167,7 +167,7 @@@
   
         agp_bridge->gatt_table_real = (u32 *)page_dir.real;
         agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped;
--      agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real);
++      agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real);
   
         /* Get the address for the gart region.
          * This is a bus address even on the alpha, b/c its
@@@ -179,7 -179,7 +179,7 @@@
   
         /* Calculate the agp offset */
         for (i = 0; i < value->num_entries / 1024; i++)
--              writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i);
++              writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i);
   
         return 0;
   }
diff --cc drivers/char/mem.c

index 8fc04b4,436a990..6b724e7
--- 1/drivers/char/mem.c
--- 2/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@@ -86,6 -86,6 +86,7 @@@ void __weak unxlate_dev_mem_ptr(unsigne
   {
   }
   
++#ifndef ARCH_HAS_DEV_MEM
   /*
    * This funcion reads the *physical* memory. The f_pos points directly to the
    * memory location.
@@@ -208,6 -208,6 +209,7 @@@ static ssize_t write_mem(struct file *f
         *ppos += written;
         return written;
   }
++#endif
   
   int __weak phys_mem_access_prot_allowed(struct file *file,
         unsigned long pfn, unsigned long size, pgprot_t *vma_prot)
@@@ -334,6 -334,6 +336,9 @@@ static int mmap_mem(struct file *file, 
   static int mmap_kmem(struct file *file, struct vm_area_struct *vma)
   {
         unsigned long pfn;
++#ifdef CONFIG_XEN
++      unsigned long i, count;
++#endif
   
         /* Turn a kernel-virtual address into a physical page frame */
         pfn = __pa((u64)vma->vm_pgoff << PAGE_SHIFT) >> PAGE_SHIFT;
@@@ -348,6 -348,6 +353,13 @@@
         if (!pfn_valid(pfn))
                 return -EIO;
   
++#ifdef CONFIG_XEN
++      count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++      for (i = 0; i < count; i++)
++              if ((pfn + i) != mfn_to_local_pfn(pfn_to_mfn(pfn + i)))
++                      return -EIO;
++#endif
++
         vma->vm_pgoff = pfn;
         return mmap_mem(file, vma);
   }
@@@ -739,6 -739,6 +751,7 @@@ static int open_port(struct inode * ino
   #define open_kmem     open_mem
   #define open_oldmem   open_mem
   
++#ifndef ARCH_HAS_DEV_MEM
   static const struct file_operations mem_fops = {
         .llseek         = memory_lseek,
         .read           = read_mem,
@@@ -747,6 -747,6 +760,9 @@@
         .open           = open_mem,
         .get_unmapped_area = get_unmapped_area_mem,
   };
++#else
++extern const struct file_operations mem_fops;
++#endif
   
   #ifdef CONFIG_DEVKMEM
   static const struct file_operations kmem_fops = {
diff --cc drivers/char/raw.c

index b33e8ea,b4b9d5a..6f9db62
--- 1/drivers/char/raw.c
--- 2/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@@ -337,8 -317,9 +337,9 @@@ static int __init raw_init(void
                 goto error;
   
         cdev_init(&raw_cdev, &raw_fops);
- -      ret = cdev_add(&raw_cdev, dev, MAX_RAW_MINORS);
+ +      ret = cdev_add(&raw_cdev, dev, max_raw_minors);
         if (ret) {
+               kobject_put(&raw_cdev.kobj);
                 goto error_region;
         }
   
diff --cc drivers/char/tpm/Kconfig

index f6595ab,f6595ab..ce2f9e8
--- 1/drivers/char/tpm/Kconfig
--- 2/drivers/char/tpm/Kconfig
+++ b/drivers/char/tpm/Kconfig
@@@ -60,4 -60,4 +60,13 @@@ config TCG_INFINEO
           Further information on this driver and the supported hardware
           can be found at http://www.trust.rub.de/projects/linux-device-driver-infineon-tpm/ 
   
++config TCG_XEN
++      tristate "XEN TPM Interface"
++      depends on XEN
++      ---help---
++        If you want to make TPM support available to a Xen user domain,
++        say Yes and it will be accessible from within Linux.
++        To compile this driver as a module, choose M here; the module
++        will be called tpm_xenu.
++
   endif # TCG_TPM
diff --cc drivers/char/tpm/Makefile

index ea3a1e0,ea3a1e0..b5cea0a
--- 1/drivers/char/tpm/Makefile
--- 2/drivers/char/tpm/Makefile
+++ b/drivers/char/tpm/Makefile
@@@ -9,3 -9,3 +9,5 @@@ obj-$(CONFIG_TCG_TIS) += tpm_tis.
   obj-$(CONFIG_TCG_NSC) += tpm_nsc.o
   obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o
   obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o
++obj-$(CONFIG_TCG_XEN) += tpm_xenu.o
++tpm_xenu-y = tpm_xen.o tpm_vtpm.o
diff --cc drivers/char/tpm/tpm.h

index 72ddb03,72ddb03..aa61c11
--- 1/drivers/char/tpm/tpm.h
--- 2/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@@ -108,6 -108,6 +108,9 @@@ struct tpm_chip 
         struct dentry **bios_dir;
   
         struct list_head list;
++#ifdef CONFIG_XEN
++      void *priv;
++#endif
         void (*release) (struct device *);
   };
   
@@@ -272,6 -272,6 +275,18 @@@ struct tpm_cmd_t 
   
   ssize_t       tpm_getcap(struct device *, __be32, cap_t *, const char *);
   
++#ifdef CONFIG_XEN /* tpm_chip.priv only exists under CONFIG_XEN */
++static inline void *chip_get_private(const struct tpm_chip *chip)
++{
++      return chip->priv; /* hand back the stored pointer unchanged */
++}
++/* Store an opaque pointer in chip->priv; chip_get_private() reads it back. */
++static inline void chip_set_private(struct tpm_chip *chip, void *priv)
++{
++      chip->priv = priv;
++}
++#endif
++
   extern void tpm_get_timeouts(struct tpm_chip *);
   extern void tpm_gen_interrupt(struct tpm_chip *);
   extern void tpm_continue_selftest(struct tpm_chip *);
diff --cc drivers/char/tpm/tpm_vtpm.c

index 0000000,0000000..4b865f4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/char/tpm/tpm_vtpm.c
@@@ -1,0 -1,0 +1,543 @@@
++/*
++ * Copyright (C) 2006 IBM Corporation
++ *
++ * Authors:
++ * Stefan Berger <stefanb@us.ibm.com>
++ *
++ * Generic device driver part for device drivers in a virtualized
++ * environment.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License as
++ * published by the Free Software Foundation, version 2 of the
++ * License.
++ *
++ */
++
++#include <asm/uaccess.h>
++#include <linux/list.h>
++#include <linux/slab.h>
++#include <linux/device.h>
++#include <linux/interrupt.h>
++#include <linux/platform_device.h>
++#include "tpm.h"
++#include "tpm_vtpm.h"
++
++/*
++ * Read status bits, as returned by vtpm_status() and matched against
++ * the req_complete_mask/req_complete_val/req_canceled fields of tpm_vtpm.
++ */
++enum {
++      STATUS_BUSY = 0x01,
++      STATUS_DATA_AVAIL = 0x02,
++      STATUS_READY = 0x04
++};
++
++/*
++ * One request/response exchange with the lower layer driver.
++ *
++ * The request and response buffers are separately kmalloc'ed copies
++ * (see transmission_set_req_buffer() / transmission_set_res_buffer())
++ * and are released together by transmission_free().
++ */
++struct transmission {
++      /*
++       * Linkage into vtpm_state.queued_requests. Must remain the first
++       * member: _vtpm_send_queued() casts the list_head pointer straight
++       * to a struct transmission pointer.
++       */
++      struct list_head next;
++
++      unsigned char *request;         /* copy of the request data */
++      size_t  request_len;            /* bytes used in request */
++      size_t  request_buflen;         /* bytes allocated for request */
++
++      unsigned char *response;        /* copy of the response data */
++      size_t  response_len;           /* bytes used in response */
++      size_t  response_buflen;        /* bytes allocated for response */
++
++      /* presumably TRANSMISSION_FLAG_* bits; not touched in this file */
++      unsigned int flags;
++};
++
++/* Flag bit for struct transmission.flags (no user in this file). */
++enum {
++      TRANSMISSION_FLAG_WAS_QUEUED = 0x1
++};
++
++
++/*
++ * Flag bit for vtpm_state.flags: the last request was only queued, not
++ * sent, so vtpm_recv() must fabricate a response for it.
++ */
++enum {
++      DATAEX_FLAG_QUEUED_ONLY = 0x1
++};
++
++
++/* local variables */
++
++/* local function prototypes */
++static int _vtpm_send_queued(struct tpm_chip *chip);
++
++
++/* =============================================================
++ * Some utility functions
++ * =============================================================
++ */
++/*
++ * Bring a freshly allocated vtpm_state into its idle state: no request
++ * or response in flight, empty request queue, and the disconnect
++ * timestamp set to the current jiffies value.
++ */
++static void vtpm_state_init(struct vtpm_state *vtpms)
++{
++      /* nothing in flight yet */
++      vtpms->current_request = NULL;
++      vtpms->current_response = NULL;
++
++      /* request side */
++      INIT_LIST_HEAD(&vtpms->queued_requests);
++      spin_lock_init(&vtpms->req_list_lock);
++      init_waitqueue_head(&vtpms->req_wait_queue);
++
++      /* response side */
++      spin_lock_init(&vtpms->resp_list_lock);
++      init_waitqueue_head(&vtpms->resp_wait_queue);
++
++      vtpms->disconnect_time = jiffies;
++}
++
++
++/*
++ * Allocate a zeroed transmission. Uses GFP_ATOMIC because one caller
++ * (vtpm_vd_recv) allocates while holding a spinlock.
++ * Returns NULL on allocation failure.
++ */
++static inline struct transmission *transmission_alloc(void)
++{
++      struct transmission *t;
++
++      t = kzalloc(sizeof(*t), GFP_ATOMIC);
++      return t;
++}
++
++/*
++ * Copy @len bytes from @buffer into the request buffer of @t, growing
++ * the buffer (GFP_KERNEL) when its current capacity is too small.
++ *
++ * Returns the request buffer, or NULL if the allocation failed; in
++ * that case the old buffer has been freed and the capacity reset to 0.
++ */
++static unsigned char *
++transmission_set_req_buffer(struct transmission *t,
++                            unsigned char *buffer, size_t len)
++{
++      if (len > t->request_buflen) {
++              unsigned char *grown;
++
++              /* old contents are not needed, so no realloc */
++              kfree(t->request);
++              grown = kmalloc(len, GFP_KERNEL);
++              t->request = grown;
++              if (grown == NULL) {
++                      t->request_buflen = 0;
++                      return NULL;
++              }
++              t->request_buflen = len;
++      }
++
++      t->request_len = len;
++      memcpy(t->request, buffer, len);
++
++      return t->request;
++}
++
++/*
++ * Copy @len bytes from @buffer into the response buffer of @t, growing
++ * the buffer when its current capacity is too small. The allocation is
++ * GFP_ATOMIC because vtpm_vd_recv() calls this under a spinlock.
++ *
++ * Returns the response buffer, or NULL if the allocation failed; in
++ * that case the old buffer has been freed and the capacity reset to 0.
++ */
++static unsigned char *
++transmission_set_res_buffer(struct transmission *t,
++                            const unsigned char *buffer, size_t len)
++{
++      if (len > t->response_buflen) {
++              unsigned char *grown;
++
++              /* old contents are not needed, so no realloc */
++              kfree(t->response);
++              grown = kmalloc(len, GFP_ATOMIC);
++              t->response = grown;
++              if (grown == NULL) {
++                      t->response_buflen = 0;
++                      return NULL;
++              }
++              t->response_buflen = len;
++      }
++
++      t->response_len = len;
++      memcpy(t->response, buffer, len);
++
++      return t->response;
++}
++
++/*
++ * Release a transmission together with both of its data buffers.
++ * @t itself must not be NULL; the buffers may be (kfree accepts NULL).
++ */
++static inline void transmission_free(struct transmission *t)
++{
++      kfree(t->response);
++      kfree(t->request);
++      kfree(t);
++}
++
++/* =============================================================
++ * Interface with the lower layer driver
++ * =============================================================
++ */
++/*
++ * Lower layer uses this function to make a response available.
++ *
++ * @ptr must be the token that was handed to vtpm_vd_send() with the
++ * request, i.e. the current request; a response for anything else is
++ * dropped. On a match the request is freed, the response is copied
++ * into a new transmission and any reader sleeping on the response
++ * wait queue is woken.
++ *
++ * Returns @count on success, 0 if the response was dropped (token
++ * mismatch, or no memory for the transmission itself) and -ENOMEM if
++ * the response buffer could not be allocated.
++ */
++int vtpm_vd_recv(const struct tpm_chip *chip,
++                 const unsigned char *buffer, size_t count,
++                 void *ptr)
++{
++      struct vtpm_state *vtpms = chip_get_private(chip);
++      struct transmission *resp;
++      unsigned long irqflags;
++      int rc = 0;
++
++      spin_lock_irqsave(&vtpms->resp_list_lock, irqflags);
++
++      /*
++       * Only the single outstanding request that was passed down from
++       * the front-end may be answered.
++       */
++      if (ptr != vtpms->current_request)
++              goto out_unlock;
++
++      /* The request has been answered; it is no longer needed. */
++      if (vtpms->current_request != NULL) {
++              transmission_free(vtpms->current_request);
++              vtpms->current_request = NULL;
++      }
++
++      resp = transmission_alloc();
++      if (resp == NULL)
++              goto out_unlock;
++
++      if (transmission_set_res_buffer(resp, buffer, count) == NULL) {
++              transmission_free(resp);
++              rc = -ENOMEM;
++              goto out_unlock;
++      }
++
++      /* Publish the response before waking the reader. */
++      vtpms->current_response = resp;
++      rc = count;
++      wake_up_interruptible(&vtpms->resp_wait_queue);
++
++out_unlock:
++      spin_unlock_irqrestore(&vtpms->resp_list_lock, irqflags);
++      return rc;
++}
++
++
++/*
++ * Lower layer indicates its status (connected/disconnected).
++ *
++ * The new status is recorded; when it says "not connected" the
++ * disconnect timestamp is refreshed, which vtpm_send() uses to decide
++ * how long requests may still be queued.
++ */
++void vtpm_vd_status(const struct tpm_chip *chip, u8 vd_status)
++{
++      struct vtpm_state *vtpms = chip_get_private(chip);
++
++      vtpms->vd_status = vd_status;
++
++      if (!(vd_status & TPM_VD_STATUS_CONNECTED))
++              vtpms->disconnect_time = jiffies;
++}
++
++/* =============================================================
++ * Interface with the generic TPM driver
++ * =============================================================
++ */
++/*
++ * Receive callback for the generic TPM driver.
++ *
++ * If the preceding send only queued the request, a fake all-zero
++ * response of at most 30 bytes is returned. Otherwise the current
++ * response is copied into @buf (truncated to @count) and freed; if
++ * none is available yet, the caller sleeps for up to 1000 jiffies
++ * waiting for vtpm_vd_recv() to deliver one.
++ *
++ * Returns the number of bytes placed in @buf, or 0 if no response
++ * arrived in time.
++ */
++static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
++{
++      int rc = 0;
++      unsigned long flags;
++      struct vtpm_state *vtpms;
++
++      vtpms = (struct vtpm_state *)chip_get_private(chip);
++
++      /*
++       * Check if the previous operation only queued the command
++       * In this case there won't be a response, so I just
++       * return from here and reset that flag. In any other
++       * case I should receive a response from the back-end.
++       */
++      spin_lock_irqsave(&vtpms->resp_list_lock, flags);
++      if ((vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0) {
++              vtpms->flags &= ~DATAEX_FLAG_QUEUED_ONLY;
++              spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
++              /*
++               * The first few commands (measurements) must be
++               * queued since it might not be possible to talk to the
++               * TPM, yet.
++               * Return a response of up to 30 '0's.
++               */
++
++              count = min_t(size_t, count, 30);
++              memset(buf, 0x0, count);
++              return count;
++      }
++      /*
++       * Check whether something is in the responselist and if
++       * there's nothing in the list wait for something to appear.
++       */
++
++      if (!vtpms->current_response) {
++              spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
++              /*
++               * wait_event re-checks the condition after queueing
++               * itself on the wait queue, so a response delivered
++               * between the unlock above and the sleep is not missed
++               * (interruptible_sleep_on_timeout() had that race).
++               */
++              wait_event_interruptible_timeout(vtpms->resp_wait_queue,
++                                      vtpms->current_response != NULL,
++                                      1000);
++              spin_lock_irqsave(&vtpms->resp_list_lock, flags);
++      }
++
++      if (vtpms->current_response) {
++              struct transmission *t = vtpms->current_response;
++              vtpms->current_response = NULL;
++              rc = min(count, t->response_len);
++              memcpy(buf, t->response, rc);
++              transmission_free(t);
++      }
++
++      spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
++      return rc;
++}
++
++/*
++ * Send callback for the generic TPM driver.
++ *
++ * While the lower layer is disconnected, requests are queued for up
++ * to 10 seconds after the disconnect; after that -ENOENT is returned.
++ * When connected, all queued requests are flushed first and then the
++ * current one is handed to vtpm_vd_send(). If flushing or sending
++ * fails, the current request is queued instead.
++ *
++ * Returns the result of vtpm_vd_send() when the request was sent, 0
++ * when it was queued without a send attempt, -ENOMEM on allocation
++ * failure and -ENOENT when the lower layer has been gone too long.
++ */
++static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count)
++{
++      int rc = 0;
++      unsigned long flags;
++      struct transmission *t = transmission_alloc();
++      struct vtpm_state *vtpms;
++
++      vtpms = (struct vtpm_state *)chip_get_private(chip);
++
++      if (!t)
++              return -ENOMEM;
++      /*
++       * If there's a current request, it must be the
++       * previous request that has timed out.
++       */
++      spin_lock_irqsave(&vtpms->req_list_lock, flags);
++      if (vtpms->current_request != NULL) {
++              printk("WARNING: Sending although there is a request outstanding.\n"
++                     "         Previous request must have timed out.\n");
++              transmission_free(vtpms->current_request);
++              vtpms->current_request = NULL;
++      }
++      spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
++
++      /*
++       * Queue the packet if the driver below is not
++       * ready, yet, or there is any packet already
++       * in the queue.
++       * If the driver below is ready, unqueue all
++       * packets first before sending our current
++       * packet.
++       * For each unqueued packet, except for the
++       * last (=current) packet, call the function
++       * tpm_xen_recv to wait for the response to come
++       * back.
++       */
++      if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
++              if (time_after(jiffies,
++                             vtpms->disconnect_time + HZ * 10)) {
++                      /*
++                       * Disconnected for too long: give up. The
++                       * transmission was never queued or sent, so
++                       * it must be released here.
++                       */
++                      transmission_free(t);
++                      return -ENOENT;
++              }
++              goto queue_it;
++      }
++
++      /*
++       * Send all queued packets.
++       */
++      if (_vtpm_send_queued(chip) == 0) {
++
++              vtpms->current_request = t;
++
++              rc = vtpm_vd_send(vtpms->tpm_private,
++                                buf,
++                                count,
++                                t);
++              /*
++               * The generic TPM driver will call
++               * the function to receive the response.
++               */
++              if (rc >= 0)
++                      return rc;
++
++              vtpms->current_request = NULL;
++      }
++
++queue_it:
++      if (!transmission_set_req_buffer(t, buf, count)) {
++              transmission_free(t);
++              return -ENOMEM;
++      }
++      /*
++       * An error occurred. Don't even try
++       * to send the current request. Just
++       * queue it.
++       * NOTE(review): if vtpm_vd_send() failed above, its negative
++       * result is still returned although the request is queued.
++       */
++      spin_lock_irqsave(&vtpms->req_list_lock, flags);
++      vtpms->flags |= DATAEX_FLAG_QUEUED_ONLY;
++      list_add_tail(&t->next, &vtpms->queued_requests);
++      spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
++
++      return rc;
++}
++
++
++/*
++ * Send all queued requests.
++ *
++ * Each queued transmission is dequeued, made the current request and
++ * passed to vtpm_vd_send(); its response is then consumed into a
++ * one-byte dummy buffer via vtpm_recv(). If a send fails, the request
++ * is put back at the head of the queue and the flush stops.
++ *
++ * Returns 0 if the queue was drained, 1 if a send failed.
++ */
++static int _vtpm_send_queued(struct tpm_chip *chip)
++{
++      int rc;
++      int error = 0;
++      unsigned long flags;
++      unsigned char buffer[1];
++      struct vtpm_state *vtpms;
++      vtpms = (struct vtpm_state *)chip_get_private(chip);
++
++      spin_lock_irqsave(&vtpms->req_list_lock, flags);
++
++      /* Invariant: req_list_lock is held at the top of every iteration. */
++      while (!list_empty(&vtpms->queued_requests)) {
++              /*
++               * Need to dequeue them.
++               * Read the result into a dummy buffer.
++               */
++              /* The cast relies on 'next' being the first member. */
++              struct transmission *qt = (struct transmission *)
++                                        vtpms->queued_requests.next;
++              list_del(&qt->next);
++              vtpms->current_request = qt;
++              /* Drop the lock for the send and the wait in vtpm_recv(). */
++              spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
++
++              rc = vtpm_vd_send(vtpms->tpm_private,
++                                qt->request,
++                                qt->request_len,
++                                qt);
++
++              if (rc < 0) {
++                      /* Re-taken here; released after the loop. */
++                      spin_lock_irqsave(&vtpms->req_list_lock, flags);
++                      if ((qt = vtpms->current_request) != NULL) {
++                              /*
++                               * requeue it at the beginning
++                               * of the list
++                               */
++                              list_add(&qt->next,
++                                       &vtpms->queued_requests);
++                      }
++                      vtpms->current_request = NULL;
++                      error = 1;
++                      break;
++              }
++              /*
++               * After this point qt is not valid anymore!
++               * It is freed when the front-end is delivering
++               * the data by calling tpm_recv
++               */
++              /*
++               * Receive response into provided dummy buffer
++               */
++              rc = vtpm_recv(chip, buffer, sizeof(buffer));
++              spin_lock_irqsave(&vtpms->req_list_lock, flags);
++      }
++
++      spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
++
++      return error;
++}
++
++/*
++ * Cancel callback for the generic TPM driver.
++ *
++ * A request that has already been handed to the lower layer cannot be
++ * recalled, so this waits (interruptibly) for its response to arrive
++ * and then throws the response away.
++ */
++static void vtpm_cancel(struct tpm_chip *chip)
++{
++      unsigned long flags;
++      struct vtpm_state *vtpms = (struct vtpm_state *)chip_get_private(chip);
++
++      spin_lock_irqsave(&vtpms->resp_list_lock,flags);
++
++      if (!vtpms->current_response && vtpms->current_request) {
++              spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
++              /*
++               * wait_event re-checks the condition after queueing
++               * itself on the wait queue, so a response delivered
++               * between the unlock above and the sleep is not missed
++               * (interruptible_sleep_on() had that race).
++               */
++              wait_event_interruptible(vtpms->resp_wait_queue,
++                                       vtpms->current_response != NULL);
++              spin_lock_irqsave(&vtpms->resp_list_lock,flags);
++      }
++
++      if (vtpms->current_response) {
++              struct transmission *t = vtpms->current_response;
++              vtpms->current_response = NULL;
++              transmission_free(t);
++      }
++
++      spin_unlock_irqrestore(&vtpms->resp_list_lock,flags);
++}
++
++/*
++ * Status callback for the generic TPM driver.
++ *
++ * Returns STATUS_DATA_AVAIL when a response is waiting or the last
++ * request was only queued (a fake status that makes the generic layer
++ * call the receive function), STATUS_READY when neither a request nor
++ * a response is outstanding, and 0 while a request is in flight.
++ */
++static u8 vtpm_status(struct tpm_chip *chip)
++{
++      struct vtpm_state *vtpms = chip_get_private(chip);
++      unsigned long irqflags;
++      u8 state = 0;
++
++      spin_lock_irqsave(&vtpms->resp_list_lock, irqflags);
++
++      if (vtpms->current_response != NULL ||
++          (vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0)
++              state = STATUS_DATA_AVAIL;
++      else if (vtpms->current_request == NULL)
++              /* no response (checked above) and no request: idle */
++              state = STATUS_READY;
++
++      spin_unlock_irqrestore(&vtpms->resp_list_lock, irqflags);
++
++      return state;
++}
++
++/*
++ * File operations for the vTPM misc device (see tpm_vtpm.miscdev);
++ * every handler is one of the generic tpm_* helpers and seeking is
++ * not supported.
++ */
++static struct file_operations vtpm_ops = {
++      .owner = THIS_MODULE,
++      .llseek = no_llseek,
++      .open = tpm_open,
++      .read = tpm_read,
++      .write = tpm_write,
++      .release = tpm_release,
++};
++
++/*
++ * Device attributes, all backed by the generic tpm_show_* / tpm_store_*
++ * helpers. Everything is world-readable and read-only except "cancel",
++ * which is write-only for owner and group.
++ */
++static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
++static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
++static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
++static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
++static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
++static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
++                 NULL);
++static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
++static DEVICE_ATTR(cancel, S_IWUSR |S_IWGRP, NULL, tpm_store_cancel);
++
++/* NULL-terminated list of the attributes above. */
++static struct attribute *vtpm_attrs[] = {
++      &dev_attr_pubek.attr,
++      &dev_attr_pcrs.attr,
++      &dev_attr_enabled.attr,
++      &dev_attr_active.attr,
++      &dev_attr_owned.attr,
++      &dev_attr_temp_deactivated.attr,
++      &dev_attr_caps.attr,
++      &dev_attr_cancel.attr,
++      NULL,
++};
++
++/* Attribute group referenced by tpm_vtpm.attr_group. */
++static struct attribute_group vtpm_attr_grp = { .attrs = vtpm_attrs };
++
++/* Ten minutes, expressed in jiffies. */
++#define TPM_LONG_TIMEOUT   (10 * 60 * HZ)
++
++/*
++ * Vendor hooks registered with the generic TPM driver by init_vtpm().
++ * A request counts as complete once vtpm_status() reports
++ * STATUS_DATA_AVAIL without STATUS_BUSY, and as canceled when it
++ * reports STATUS_READY. All three duration slots use the long timeout.
++ */
++static struct tpm_vendor_specific tpm_vtpm = {
++      .recv = vtpm_recv,
++      .send = vtpm_send,
++      .cancel = vtpm_cancel,
++      .status = vtpm_status,
++      .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL,
++      .req_complete_val  = STATUS_DATA_AVAIL,
++      .req_canceled = STATUS_READY,
++      .attr_group = &vtpm_attr_grp,
++      .miscdev = {
++              .fops = &vtpm_ops,
++      },
++      .duration = {
++              TPM_LONG_TIMEOUT,
++              TPM_LONG_TIMEOUT,
++              TPM_LONG_TIMEOUT,
++      },
++};
++
++/*
++ * Create the vTPM state for @dev, register the device with the
++ * generic TPM driver and attach the state to the resulting chip.
++ * @tp is the lower layer's private data; it is later passed back to
++ * vtpm_vd_send().
++ *
++ * Returns the chip, ERR_PTR(-ENOMEM) if the state cannot be allocated
++ * or ERR_PTR(-ENODEV) if registration fails.
++ */
++struct tpm_chip *init_vtpm(struct device *dev,
++                           struct tpm_private *tp)
++{
++      struct vtpm_state *vtpms = kzalloc(sizeof(*vtpms), GFP_KERNEL);
++      struct tpm_chip *chip;
++
++      if (vtpms == NULL)
++              return ERR_PTR(-ENOMEM);
++
++      vtpm_state_init(vtpms);
++      vtpms->tpm_private = tp;
++
++      chip = tpm_register_hardware(dev, &tpm_vtpm);
++      if (chip == NULL) {
++              /* registration failed: the state has no owner yet */
++              kfree(vtpms);
++              return ERR_PTR(-ENODEV);
++      }
++
++      chip_set_private(chip, vtpms);
++
++      return chip;
++}
++
++/*
++ * Undo init_vtpm(): unregister @dev from the generic TPM driver and
++ * free the vTPM state that was attached to its chip.
++ */
++void cleanup_vtpm(struct device *dev)
++{
++      struct vtpm_state *vtpms;
++
++      /* fetch the state before the chip goes away */
++      vtpms = chip_get_private(dev_get_drvdata(dev));
++
++      tpm_remove_hardware(dev);
++      kfree(vtpms);
++}
diff --cc drivers/char/tpm/tpm_vtpm.h

index 0000000,0000000..77aa342

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/char/tpm/tpm_vtpm.h
@@@ -1,0 -1,0 +1,55 @@@
++#ifndef TPM_VTPM_H
++#define TPM_VTPM_H
++
++struct tpm_chip;
++struct tpm_private;
++
++/*
++ * Per-chip state of the generic vTPM layer; allocated by init_vtpm()
++ * and stored as the chip's private pointer.
++ */
++struct vtpm_state {
++      /* request handed to the lower layer and not yet answered */
++      struct transmission *current_request;
++      spinlock_t           req_list_lock;
++      wait_queue_head_t    req_wait_queue;
++
++      /* requests waiting to be sent (linked via transmission.next) */
++      struct list_head     queued_requests;
++
++      /* response delivered by vtpm_vd_recv() and not yet read */
++      struct transmission *current_response;
++      spinlock_t           resp_list_lock;
++      wait_queue_head_t    resp_wait_queue;     // processes waiting for responses
++
++      u8                   vd_status;   /* enum vdev_status bits */
++      u8                   flags;       /* DATAEX_FLAG_* bits */
++
++      /* jiffies value when the lower layer last reported disconnected */
++      unsigned long        disconnect_time;
++
++      /*
++       * The following is a private structure of the underlying
++       * driver. It is passed as parameter in the send function.
++       */
++      struct tpm_private *tpm_private;
++};
++
++
++/* Connection state of the lower layer, as passed to vtpm_vd_status(). */
++enum vdev_status {
++      TPM_VD_STATUS_DISCONNECTED = 0x0,
++      TPM_VD_STATUS_CONNECTED = 0x1
++};
++
++/* this function is called from tpm_vtpm.c */
++int vtpm_vd_send(struct tpm_private * tp,
++                 const u8 * buf, size_t count, void *ptr);
++
++/* these functions are offered by tpm_vtpm.c */
++struct tpm_chip *init_vtpm(struct device *,
++                           struct tpm_private *);
++void cleanup_vtpm(struct device *);
++int vtpm_vd_recv(const struct tpm_chip* chip,
++                 const unsigned char *buffer, size_t count, void *ptr);
++void vtpm_vd_status(const struct tpm_chip *, u8 status);
++
++/*
++ * Look up the lower layer's private data for @dev: the device's
++ * drvdata is the TPM chip, whose private pointer is the vtpm_state.
++ */
++static inline struct tpm_private *tpm_private_from_dev(struct device *dev)
++{
++      struct vtpm_state *vtpms = chip_get_private(dev_get_drvdata(dev));
++
++      return vtpms->tpm_private;
++}
++
++#endif
diff --cc drivers/char/tpm/tpm_xen.c

index 0000000,0000000..4939cb8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/char/tpm/tpm_xen.c
@@@ -1,0 -1,0 +1,721 @@@
++/*
++ * Copyright (c) 2005, IBM Corporation
++ *
++ * Author: Stefan Berger, stefanb@us.ibm.com
++ * Grant table support: Mahadevan Gomathisankaran
++ *
++ * This code has been derived from drivers/xen/netfront/netfront.c
++ *
++ * Copyright (c) 2002-2004, K A Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/errno.h>
++#include <linux/err.h>
++#include <linux/interrupt.h>
++#include <linux/mutex.h>
++#include <linux/slab.h>
++#include <asm/uaccess.h>
++#include <xen/evtchn.h>
++#include <xen/interface/grant_table.h>
++#include <xen/interface/io/tpmif.h>
++#include <xen/gnttab.h>
++#include <xen/xenbus.h>
++#include "tpm.h"
++#include "tpm_vtpm.h"
++
++#undef DEBUG
++
++/* local structures */
++/*
++ * State of the Xen vTPM frontend. A single instance (my_priv) is
++ * shared and reference counted via tpm_private_get()/tpm_private_put().
++ */
++struct tpm_private {
++      struct tpm_chip *chip;          /* result of init_vtpm() */
++
++      tpmif_tx_interface_t *tx;       /* shared ring page, see setup_tpmring() */
++      atomic_t refcnt;
++      unsigned int irq;               /* bound event channel irq, 0 if none */
++      u8 is_connected;
++      u8 is_suspended;                /* set while suspended, see tpmfront_suspend() */
++
++      spinlock_t tx_lock;
++
++      struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE];
++
++      atomic_t tx_busy;               /* non-zero while a request is outstanding */
++      void *tx_remember;
++
++      domid_t backend_id;             /* domain id of the backend */
++      wait_queue_head_t wait_q;       /* slept on while draining tx_busy in suspend */
++
++      struct xenbus_device *dev;
++      int ring_ref;                   /* grant reference of tx, or GRANT_INVALID_REF */
++};
++
++/* One page-sized transmit buffer, see tx_buffer_alloc(). */
++struct tx_buffer {
++      unsigned int size;      // available space in data
++      unsigned int len;       // used space in data
++      unsigned char *data;    // pointer to a page
++};
++
++
++/* locally visible variables */
++static grant_ref_t gref_head;
++static struct tpm_private *my_priv;
++
++/* local function prototypes */
++static irqreturn_t tpmif_int(int irq,
++                             void *tpm_priv);
++static void tpmif_rx_action(unsigned long unused);
++static int tpmif_connect(struct xenbus_device *dev,
++                         struct tpm_private *tp,
++                         domid_t domid);
++static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0);
++static int tpmif_allocate_tx_buffers(struct tpm_private *tp);
++static void tpmif_free_tx_buffers(struct tpm_private *tp);
++static void tpmif_set_connected_state(struct tpm_private *tp,
++                                      u8 newstate);
++static int tpm_xmit(struct tpm_private *tp,
++                    const u8 * buf, size_t count, int userbuffer,
++                    void *remember);
++static void destroy_tpmring(struct tpm_private *tp);
++void __exit tpmif_exit(void);
++
++#define DPRINTK(fmt, args...) \
++    pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
++#define IPRINTK(fmt, args...) \
++    pr_info("xen_tpm_fr: " fmt, ##args)
++#define WPRINTK(fmt, args...) \
++    pr_warning("xen_tpm_fr: " fmt, ##args)
++
++#define GRANT_INVALID_REF     0
++
++
++/*
++ * Copy up to one buffer's worth of data from @src into @txb.
++ *
++ * @len is the number of bytes the caller still has to send; at most
++ * txb->size of them are copied. @isuserbuffer selects copy_from_user()
++ * instead of memcpy().
++ *
++ * Returns the number of bytes copied (also recorded in txb->len),
++ * -EINVAL for a negative @len or -EFAULT if the user copy faulted.
++ */
++static inline int
++tx_buffer_copy(struct tx_buffer *txb, const u8 *src, int len,
++               int isuserbuffer)
++{
++      int copied = len;
++
++      /*
++       * A negative length would be converted to a huge unsigned value
++       * by the comparison below and copy a full page from @src.
++       */
++      if (len < 0)
++              return -EINVAL;
++
++      if (len > txb->size)
++              copied = txb->size;
++      if (isuserbuffer) {
++              if (copy_from_user(txb->data, src, copied))
++                      return -EFAULT;
++      } else {
++              memcpy(txb->data, src, copied);
++      }
++      /*
++       * Record what is really in the buffer, not what was asked for:
++       * the used length must never exceed the buffer size.
++       */
++      txb->len = copied;
++      return copied;
++}
++
++/*
++ * Allocate a transmit buffer descriptor together with its data page.
++ * Returns NULL if either allocation fails.
++ */
++static inline struct tx_buffer *tx_buffer_alloc(void)
++{
++      struct tx_buffer *txb = kzalloc(sizeof(*txb), GFP_KERNEL);
++
++      if (txb == NULL)
++              return NULL;
++
++      txb->data = (unsigned char *)__get_free_page(GFP_KERNEL);
++      if (txb->data == NULL) {
++              kfree(txb);
++              return NULL;
++      }
++
++      /* an empty buffer of exactly one page */
++      txb->size = PAGE_SIZE;
++      txb->len = 0;
++
++      return txb;
++}
++
++
++/* Free a transmit buffer and its data page; NULL is ignored. */
++static inline void tx_buffer_free(struct tx_buffer *txb)
++{
++      if (txb == NULL)
++              return;
++
++      free_page((long)txb->data);
++      kfree(txb);
++}
++
++/**************************************************************
++ Utility function for the tpm_private structure
++**************************************************************/
++/*
++ * Initialise the lock and wait queue of @tp and give it its first
++ * reference.
++ */
++static void tpm_private_init(struct tpm_private *tp)
++{
++      atomic_set(&tp->refcnt, 1);
++      init_waitqueue_head(&tp->wait_q);
++      spin_lock_init(&tp->tx_lock);
++}
++
++/*
++ * Drop one reference on the shared tpm_private (my_priv). When the
++ * last reference goes away, its transmit buffers and the structure
++ * itself are freed and my_priv is reset to NULL.
++ */
++static void tpm_private_put(void)
++{
++      if (atomic_dec_and_test(&my_priv->refcnt)) {
++              tpmif_free_tx_buffers(my_priv);
++              kfree(my_priv);
++              my_priv = NULL;
++      }
++}
++
++/*
++ * Get a reference on the shared tpm_private (my_priv), creating it on
++ * first use.
++ *
++ * Returns NULL if the structure or its transmit buffers cannot be
++ * allocated.
++ */
++static struct tpm_private *tpm_private_get(void)
++{
++      if (my_priv == NULL) {
++              /* first user: create the instance */
++              my_priv = kzalloc(sizeof(*my_priv), GFP_KERNEL);
++              if (my_priv == NULL)
++                      return NULL;
++
++              tpm_private_init(my_priv);
++              /*
++               * On failure the put drops the only reference, which
++               * frees the instance and leaves my_priv NULL.
++               */
++              if (tpmif_allocate_tx_buffers(my_priv) < 0)
++                      tpm_private_put();
++
++              return my_priv;
++      }
++
++      atomic_inc(&my_priv->refcnt);
++      return my_priv;
++}
++
++/**************************************************************
++
++ The interface to let the tpm plugin register its callback
++ function and send data to another partition using this module
++
++**************************************************************/
++
++static DEFINE_MUTEX(suspend_lock);
++/*
++ * Send data via this module by calling this function.
++ *
++ * The transmit is done under suspend_lock, so it blocks for as long
++ * as tpmfront_suspend() holds that lock. @ptr is an opaque token that
++ * is passed through to tpm_xmit(). Returns the result of tpm_xmit().
++ */
++int vtpm_vd_send(struct tpm_private *tp,
++                 const u8 * buf, size_t count, void *ptr)
++{
++      int rc;
++
++      mutex_lock(&suspend_lock);
++      /* 0: buf is a kernel buffer, not a user buffer */
++      rc = tpm_xmit(tp, buf, count, 0, ptr);
++      mutex_unlock(&suspend_lock);
++
++      return rc;
++}
++
++/**************************************************************
++ XENBUS support code
++**************************************************************/
++
++/*
++ * Allocate the shared ring page, grant the backend access to it and
++ * bind the event channel.
++ *
++ * Returns 0 on success or a negative errno; on failure everything set
++ * up so far is torn down again.
++ */
++static int setup_tpmring(struct xenbus_device *dev,
++                         struct tpm_private *tp)
++{
++      void *page;
++      int rc;
++
++      tp->ring_ref = GRANT_INVALID_REF;
++
++      page = (void *)__get_free_page(GFP_KERNEL);
++      if (page == NULL) {
++              xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
++              return -ENOMEM;
++      }
++      tp->tx = page;
++
++      rc = xenbus_grant_ring(dev, virt_to_mfn(tp->tx));
++      if (rc < 0) {
++              /* not granted yet, so the page is freed directly */
++              free_page((unsigned long)page);
++              tp->tx = NULL;
++              xenbus_dev_fatal(dev, rc, "allocating grant reference");
++              goto fail;
++      }
++      /* a non-negative result is the grant reference */
++      tp->ring_ref = rc;
++
++      rc = tpmif_connect(dev, tp, dev->otherend_id);
++      if (rc == 0)
++              return 0;
++
++fail:
++      destroy_tpmring(tp);
++      return rc;
++}
++
++
++/*
++ * Tear down the connection to the backend: mark the device as
++ * disconnected, end the grant on the shared ring page and unbind the
++ * irq handler. Safe to call on a partially set-up ring.
++ */
++static void destroy_tpmring(struct tpm_private *tp)
++{
++      tpmif_set_connected_state(tp, 0);
++
++      if (tp->ring_ref != GRANT_INVALID_REF) {
++              /* the page address is handed over together with the grant */
++              gnttab_end_foreign_access(tp->ring_ref, (unsigned long)tp->tx);
++              tp->tx = NULL;
++              tp->ring_ref = GRANT_INVALID_REF;
++      }
++
++      if (tp->irq != 0) {
++              unbind_from_irqhandler(tp->irq, tp);
++              tp->irq = 0;
++      }
++}
++
++
++/*
++ * Set up the shared ring and publish its grant reference and event
++ * channel under the device's xenstore node, then switch the device to
++ * XenbusStateConnected.
++ *
++ * Returns 0 on success or an error code; on failure the ring is
++ * destroyed again.
++ */
++static int talk_to_backend(struct xenbus_device *dev,
++                           struct tpm_private *tp)
++{
++      const char *message = NULL;
++      int err;
++      struct xenbus_transaction xbt;
++
++      err = setup_tpmring(dev, tp);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "setting up ring");
++              goto out;
++      }
++
++      /* The whole transaction is retried when ending it yields -EAGAIN. */
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "starting transaction");
++              goto destroy_tpmring;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename,
++                          "ring-ref","%u", tp->ring_ref);
++      if (err) {
++              message = "writing ring-ref";
++              goto abort_transaction;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
++                          irq_to_evtchn_port(tp->irq));
++      if (err) {
++              message = "writing event-channel";
++              goto abort_transaction;
++      }
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err == -EAGAIN)
++              goto again;
++      if (err) {
++              xenbus_dev_fatal(dev, err, "completing transaction");
++              goto destroy_tpmring;
++      }
++
++      xenbus_switch_state(dev, XenbusStateConnected);
++
++      return 0;
++
++abort_transaction:
++      /* second argument 1: abort instead of commit */
++      xenbus_transaction_end(xbt, 1);
++      if (message)
++              xenbus_dev_error(dev, err, "%s", message);
++destroy_tpmring:
++      destroy_tpmring(tp);
++out:
++      return err;
++}
++
++/**
++ * Callback received when the backend's state changes.
++ *
++ * Tracks the backend's connected state; when the backend is closing
++ * or closed the frontend is closed as well, and on "closed" the device
++ * is unregistered unless a suspend is in progress.
++ */
++static void backend_changed(struct xenbus_device *dev,
++                          enum xenbus_state backend_state)
++{
++      struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
++      DPRINTK("\n");
++
++      switch (backend_state) {
++      case XenbusStateConnected:
++              tpmif_set_connected_state(tp, 1);
++              break;
++
++      case XenbusStateClosing:
++              tpmif_set_connected_state(tp, 0);
++              xenbus_frontend_closed(dev);
++              break;
++
++      case XenbusStateClosed:
++              tpmif_set_connected_state(tp, 0);
++              if (!tp->is_suspended)
++                      device_unregister(&dev->dev);
++              xenbus_frontend_closed(dev);
++              break;
++
++      default:
++              /*
++               * Initialising, InitWait, Initialised, Reconfiguring,
++               * Reconfigured, Unknown: nothing to do.
++               */
++              break;
++      }
++}
++
++/*
++ * Probe callback: take a reference on the shared tpm_private,
++ * register the vTPM chip for @dev and connect to the backend.
++ *
++ * Returns 0 on success or a negative errno. On every failure path the
++ * chip (if already registered) is removed again and the tpm_private
++ * reference is dropped, so nothing is leaked and the vTPM state never
++ * keeps a pointer to a freed tpm_private.
++ */
++static int tpmfront_probe(struct xenbus_device *dev,
++                          const struct xenbus_device_id *id)
++{
++      int err;
++      int handle;
++      struct tpm_private *tp = tpm_private_get();
++
++      if (!tp)
++              return -ENOMEM;
++
++      tp->chip = init_vtpm(&dev->dev, tp);
++      if (IS_ERR(tp->chip)) {
++              err = PTR_ERR(tp->chip);
++              goto err_put_private;
++      }
++
++      /* "handle" must exist; its value is not used any further here. */
++      err = xenbus_scanf(XBT_NIL, dev->nodename,
++                         "handle", "%i", &handle);
++      if (XENBUS_EXIST_ERR(err))
++              goto err_cleanup_vtpm;
++
++      if (err < 0) {
++              xenbus_dev_fatal(dev,err,"reading virtual-device");
++              goto err_cleanup_vtpm;
++      }
++
++      tp->dev = dev;
++
++      err = talk_to_backend(dev, tp);
++      if (err)
++              goto err_cleanup_vtpm;
++
++      return 0;
++
++err_cleanup_vtpm:
++      /* must precede the put: the vTPM state points at tp */
++      cleanup_vtpm(&dev->dev);
++err_put_private:
++      tpm_private_put();
++      return err;
++}
++
++
++/*
++ * Remove callback: tear down the ring and unregister the vTPM chip.
++ * Always returns 0.
++ */
++static int tpmfront_remove(struct xenbus_device *dev)
++{
++      /* look up tp first: cleanup_vtpm() frees the state it hangs off */
++      destroy_tpmring(tpm_private_from_dev(&dev->dev));
++      cleanup_vtpm(&dev->dev);
++
++      return 0;
++}
++
++/*
++ * Suspend callback: block further sends and wait until the
++ * outstanding request, if any, has been answered.
++ *
++ * suspend_lock is deliberately still held on return; it is released
++ * by tpmfront_suspend_finish(). Always returns 0.
++ */
++static int tpmfront_suspend(struct xenbus_device *dev)
++{
++      struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
++      u32 rounds = 0;
++
++      /* Take the lock, preventing any application from sending. */
++      mutex_lock(&suspend_lock);
++      tp->is_suspended = 1;
++
++      while (atomic_read(&tp->tx_busy)) {
++              /* report only every tenth round */
++              if (rounds % 10 == 0)
++                      printk("TPM-FE [INFO]: Waiting for outstanding "
++                             "request.\n");
++              /* Wait for a request to be responded to. */
++              interruptible_sleep_on_timeout(&tp->wait_q, 100);
++              rounds++;
++      }
++
++      return 0;
++}
++
++/*
++ * Leave the suspended state: clear tp->is_suspended and release
++ * suspend_lock, which tpmfront_suspend() left held. Always returns 0.
++ */
++static int tpmfront_suspend_finish(struct tpm_private *tp)
++{
++      /* Clear the flag while the mutex is still held. */
++      tp->is_suspended = 0;
++      /* Allow applications to send again. */
++      mutex_unlock(&suspend_lock);
++      return 0;
++}
++
++/*
++ * Xenbus suspend_cancel callback: undo tpmfront_suspend() for the private
++ * data attached to this device and return the result of doing so.
++ */
++static int tpmfront_suspend_cancel(struct xenbus_device *dev)
++{
++      return tpmfront_suspend_finish(tpm_private_from_dev(&dev->dev));
++}
++
++/*
++ * Xenbus resume callback: discard the existing ring and renegotiate with
++ * the backend. Returns whatever talk_to_backend() returns.
++ */
++static int tpmfront_resume(struct xenbus_device *dev)
++{
++      struct tpm_private *priv;
++      int rc;
++
++      priv = tpm_private_from_dev(&dev->dev);
++      destroy_tpmring(priv);
++      rc = talk_to_backend(dev, priv);
++
++      return rc;
++}
++
++/*
++ * Record the backend domain id and bind a listening port for it to the
++ * tpmif_int() handler.
++ *
++ * On a positive result from bind_listening_port_to_irqhandler() the value
++ * is stored in tp->irq and 0 is returned. Any other result is logged and
++ * returned unchanged (so a result of 0 is returned as 0 without tp->irq
++ * being set).
++ */
++static int tpmif_connect(struct xenbus_device *dev,
++                         struct tpm_private *tp,
++                         domid_t domid)
++{
++      int irq;
++
++      tp->backend_id = domid;
++
++      irq = bind_listening_port_to_irqhandler(
++              domid, tpmif_int, IRQF_SAMPLE_RANDOM, "tpmif", tp);
++      if (irq > 0) {
++              tp->irq = irq;
++              return 0;
++      }
++
++      WPRINTK("bind_listening_port_to_irqhandler failed "
++              "(err=%d)\n", irq);
++      return irq;
++}
++
++/*
++ * Device types handled by this frontend ("vtpm"). The trailing empty
++ * string presumably terminates the table - confirm against xenbus.
++ */
++static struct xenbus_device_id tpmfront_ids[] = {
++      { "vtpm" },
++      { "" }
++};
++
++/*
++ * Driver description registered by init_tpm_xenbus(): wires the probe,
++ * remove, suspend/resume and backend state-change callbacks of this file
++ * to the "vtpm" device type.
++ */
++static struct xenbus_driver tpmfront = {
++      .name = "vtpm",
++      .ids = tpmfront_ids,
++      .probe = tpmfront_probe,
++      .remove =  tpmfront_remove,
++      .resume = tpmfront_resume,
++      .otherend_changed = backend_changed,
++      .suspend = tpmfront_suspend,
++      .suspend_cancel = tpmfront_suspend_cancel,
++};
++
++/*
++ * Register the tpmfront driver as a xenbus frontend.
++ * Returns the result of xenbus_register_frontend().
++ */
++static int __init init_tpm_xenbus(void)
++{
++      return xenbus_register_frontend(&tpmfront);
++}
++
++/*
++ * Fill every slot of tp->tx_buffers with a freshly allocated buffer.
++ *
++ * If any allocation fails, all slots are handed to
++ * tpmif_free_tx_buffers() and -ENOMEM is returned; otherwise returns 0.
++ */
++static int tpmif_allocate_tx_buffers(struct tpm_private *tp)
++{
++      unsigned int slot = 0;
++
++      while (slot < TPMIF_TX_RING_SIZE) {
++              tp->tx_buffers[slot] = tx_buffer_alloc();
++              if (tp->tx_buffers[slot]) {
++                      slot++;
++                      continue;
++              }
++
++              /* Allocation failed: release what we have and give up. */
++              tpmif_free_tx_buffers(tp);
++              return -ENOMEM;
++      }
++
++      return 0;
++}
++
++/*
++ * Release every transmit buffer referenced from tp->tx_buffers.
++ *
++ * Each slot is cleared after its buffer has been freed so that the array
++ * never holds stale pointers and calling this function a second time on
++ * the same tp cannot free a buffer twice. Slots may already be empty
++ * when this runs (tpmif_allocate_tx_buffers() calls it after a failed
++ * allocation), so tx_buffer_free() is handed empty slots as before.
++ */
++static void tpmif_free_tx_buffers(struct tpm_private *tp)
++{
++      unsigned int i;
++
++      for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
++              tx_buffer_free(tp->tx_buffers[i]);
++              tp->tx_buffers[i] = NULL;
++      }
++}
++
++/*
++ * Tasklet body that collects a response from the shared ring.
++ *
++ * @priv: the struct tpm_private pointer, cast to unsigned long.
++ *
++ * Clears tx_busy and wakes any waiter on wait_q, then copies the response
++ * out of the transmit buffers into a temporary buffer and hands it to
++ * vtpm_vd_recv() together with the cookie saved in tp->tx_remember.
++ *
++ * The total length is taken from the size field of ring slot 0. Each
++ * slot contributes at most PAGE_SIZE bytes and never more than the
++ * space still left in the temporary buffer: the per-slot size fields are
++ * read from the ring independently of the total, so without the second
++ * limit a slot could make memcpy() write past the end of the allocation.
++ *
++ * If the temporary buffer cannot be allocated the response is dropped.
++ */
++static void tpmif_rx_action(unsigned long priv)
++{
++      struct tpm_private *tp = (struct tpm_private *)priv;
++      int i;
++      unsigned int received;
++      unsigned int offset = 0;
++      u8 *buffer;
++
++      atomic_set(&tp->tx_busy, 0);
++      wake_up_interruptible(&tp->wait_q);
++
++      /* Slot 0 carries the length used for the whole response. */
++      received = tp->tx->ring[0].req.size;
++
++      buffer = kmalloc(received, GFP_ATOMIC);
++      if (!buffer)
++              return;
++
++      for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) {
++              struct tx_buffer *txb = tp->tx_buffers[i];
++              tpmif_tx_request_t *tx = &tp->tx->ring[i].req;
++              unsigned int tocopy;
++
++              tocopy = tx->size;
++              if (tocopy > PAGE_SIZE)
++                      tocopy = PAGE_SIZE;
++              /* Never copy beyond the end of the allocated buffer. */
++              if (tocopy > received - offset)
++                      tocopy = received - offset;
++
++              memcpy(&buffer[offset], txb->data, tocopy);
++
++              gnttab_release_grant_reference(&gref_head, tx->ref);
++
++              offset += tocopy;
++      }
++
++      vtpm_vd_recv(tp->chip, buffer, received, tp->tx_remember);
++      kfree(buffer);
++}
++
++
++/*
++ * Interrupt handler bound in tpmif_connect(). Under tx_lock it points the
++ * receive tasklet at this device's private data and schedules it; the
++ * actual work happens in the tasklet. Always reports IRQ_HANDLED.
++ */
++static irqreturn_t tpmif_int(int irq, void *tpm_priv)
++{
++      unsigned long irqflags;
++      struct tpm_private *priv = tpm_priv;
++
++      spin_lock_irqsave(&priv->tx_lock, irqflags);
++      tpmif_rx_tasklet.data = (unsigned long)priv;
++      tasklet_schedule(&tpmif_rx_tasklet);
++      spin_unlock_irqrestore(&priv->tx_lock, irqflags);
++
++      return IRQ_HANDLED;
++}
++
++
++/*
++ * Queue a request on the shared ring and notify the backend.
++ *
++ * @tp:           private data of the frontend
++ * @buf:          data to send
++ * @count:        number of bytes in @buf
++ * @isuserbuffer: passed through to tx_buffer_copy()
++ * @remember:     opaque cookie stored in tp->tx_remember
++ *
++ * Runs under tp->tx_lock. Returns -EBUSY if a request is still
++ * outstanding, -EIO if the frontend is not connected, -EFAULT if a
++ * transmit buffer is missing, a negative value from tx_buffer_copy(), or
++ * -ENOSPC if no grant reference could be claimed. On success returns the
++ * number of bytes copied into the ring buffers, which is less than @count
++ * when the data does not fit into TPMIF_TX_RING_SIZE buffers.
++ *
++ * NOTE(review): the error returns inside the loop do not release grant
++ * references claimed in earlier iterations - confirm whether they leak.
++ * NOTE(review): tx_buffer_copy() is called with the spinlock held and
++ * interrupts off; if it copies from user space when @isuserbuffer is set
++ * it may sleep - verify against its implementation.
++ */
++static int tpm_xmit(struct tpm_private *tp,
++                    const u8 * buf, size_t count, int isuserbuffer,
++                    void *remember)
++{
++      tpmif_tx_request_t *tx;
++      TPMIF_RING_IDX i;
++      unsigned int offset = 0;
++
++      spin_lock_irq(&tp->tx_lock);
++
++      /* Only one request may be in flight at a time. */
++      if (unlikely(atomic_read(&tp->tx_busy))) {
++              printk("tpm_xmit: There's an outstanding request/response "
++                     "on the way!\n");
++              spin_unlock_irq(&tp->tx_lock);
++              return -EBUSY;
++      }
++
++      if (tp->is_connected != 1) {
++              spin_unlock_irq(&tp->tx_lock);
++              return -EIO;
++      }
++
++      /* Spread the data over the ring, one transmit buffer per slot. */
++      for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) {
++              struct tx_buffer *txb = tp->tx_buffers[i];
++              int copied;
++
++              if (!txb) {
++                      DPRINTK("txb (i=%d) is NULL. buffers initilized?\n"
++                              "Not transmitting anything!\n", i);
++                      spin_unlock_irq(&tp->tx_lock);
++                      return -EFAULT;
++              }
++
++              copied = tx_buffer_copy(txb, &buf[offset], count,
++                                      isuserbuffer);
++              if (copied < 0) {
++                      /* An error occurred */
++                      spin_unlock_irq(&tp->tx_lock);
++                      return copied;
++              }
++              count -= copied;
++              offset += copied;
++
++              /* Describe this buffer in the matching ring slot. */
++              tx = &tp->tx->ring[i].req;
++              tx->addr = virt_to_machine(txb->data);
++              tx->size = txb->len;
++              tx->unused = 0;
++
++              /* Reads four bytes regardless of txb->len. */
++              DPRINTK("First 4 characters sent by TPM-FE are "
++                      "0x%02x 0x%02x 0x%02x 0x%02x\n",
++                      txb->data[0],txb->data[1],txb->data[2],txb->data[3]);
++
++              /* Get the granttable reference for this page. */
++              tx->ref = gnttab_claim_grant_reference(&gref_head);
++              if (tx->ref == -ENOSPC) {
++                      spin_unlock_irq(&tp->tx_lock);
++                      DPRINTK("Grant table claim reference failed in "
++                              "func:%s line:%d file:%s\n",
++                              __FUNCTION__, __LINE__, __FILE__);
++                      return -ENOSPC;
++              }
++              gnttab_grant_foreign_access_ref(tx->ref,
++                                              tp->backend_id,
++                                              virt_to_mfn(txb->data),
++                                              0 /*RW*/);
++              /* Write barrier after filling the slot and granting access. */
++              wmb();
++      }
++
++      /* Mark the request as in flight; tpmif_rx_action() clears this. */
++      atomic_set(&tp->tx_busy, 1);
++      tp->tx_remember = remember;
++
++      /* Full barrier before the backend is notified. */
++      mb();
++
++      notify_remote_via_irq(tp->irq);
++
++      spin_unlock_irq(&tp->tx_lock);
++      return offset;
++}
++
++
++/*
++ * Tell the upper layer whether the connection to the backend is
++ * currently up, based on tp->is_connected.
++ */
++static void tpmif_notify_upperlayer(struct tpm_private *tp)
++{
++      if (tp->is_connected)
++              vtpm_vd_status(tp->chip, TPM_VD_STATUS_CONNECTED);
++      else
++              vtpm_vd_status(tp->chip, TPM_VD_STATUS_DISCONNECTED);
++}
++
++
++/*
++ * Update the connection state and inform the upper layer when it changes.
++ *
++ * While suspended, a disconnect is ignored entirely (a resume is expected
++ * to follow and the mutex keeps applications from sending), and a connect
++ * ends the suspended state via tpmfront_suspend_finish(), which also
++ * releases the mutex.
++ */
++static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected)
++{
++      if (tp->is_suspended == 1) {
++              /* Disconnect during suspend: keep quiet, we will resume. */
++              if (is_connected == 0)
++                      return;
++
++              /* Connected again after a suspend: we are resuming now. */
++              if (is_connected == 1)
++                      tpmfront_suspend_finish(tp);
++      }
++
++      /* Nothing to report if the state did not actually change. */
++      if (tp->is_connected == is_connected)
++              return;
++
++      tp->is_connected = is_connected;
++      tpmif_notify_upperlayer(tp);
++}
++
++
++
++/* =================================================================
++ * Initialization function.
++ * =================================================================
++ */
++
++
++/*
++ * Module initialisation.
++ *
++ * Refuses to load in the initial Xen domain (-EPERM), obtains the driver's
++ * private data (-ENOMEM on failure), reserves TPMIF_TX_RING_SIZE grant
++ * references (-EFAULT on failure) and registers the xenbus frontend.
++ *
++ * A failure to register the frontend is reported to the caller instead of
++ * being ignored; the grant references and the private data acquired above
++ * are released first so that nothing stays allocated for a driver that
++ * never became active.
++ */
++static int __init tpmif_init(void)
++{
++      struct tpm_private *tp;
++      int err;
++
++      if (is_initial_xendomain())
++              return -EPERM;
++
++      tp = tpm_private_get();
++      if (!tp)
++              return -ENOMEM;
++
++      IPRINTK("Initialising the vTPM driver.\n");
++      if (gnttab_alloc_grant_references(TPMIF_TX_RING_SIZE,
++                                        &gref_head) < 0) {
++              tpm_private_put();
++              return -EFAULT;
++      }
++
++      err = init_tpm_xenbus();
++      if (err) {
++              /* Undo the two acquisitions above, in reverse order. */
++              gnttab_free_grant_references(gref_head);
++              tpm_private_put();
++              return err;
++      }
++
++      return 0;
++}
++
++
++module_init(tpmif_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/cpufreq/Kconfig

index 9fb8485,ca8ee80..a98e7f3
--- 1/drivers/cpufreq/Kconfig
--- 2/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@@ -1,7 -1,5 +1,6 @@@
- menu "CPU Frequency scaling"
- 
   config CPU_FREQ
         bool "CPU Frequency scaling"
++      depends on !PROCESSOR_EXTERNAL_CONTROL
         help
           CPU Frequency scaling allows you to change the clock speed of 
           CPUs on the fly. This is a nice method to save power, because 
diff --cc drivers/cpuidle/Kconfig

index 7dbc4a8,7dbc4a8..5f1f297
--- 1/drivers/cpuidle/Kconfig
--- 2/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@@ -1,6 -1,6 +1,7 @@@
   
   config CPU_IDLE
         bool "CPU idle PM support"
++      depends on !PROCESSOR_EXTERNAL_CONTROL
         default ACPI
         help
           CPU idle is a generic framework for supporting software-controlled
diff --cc drivers/dma/Kconfig

index 25cf327,a572600..5ef506a
--- 1/drivers/dma/Kconfig
--- 2/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@@ -61,7 -61,7 +61,7 @@@ config INTEL_IOATDM
         tristate "Intel I/OAT DMA support"
         depends on PCI && X86
         select DMA_ENGINE
--      select DCA
++      select DCA if !XEN
         select ASYNC_TX_DISABLE_PQ_VAL_DMA
         select ASYNC_TX_DISABLE_XOR_VAL_DMA
         help
diff --cc drivers/dma/ioat/Makefile

index 0ff7270,0ff7270..495983a
--- 1/drivers/dma/ioat/Makefile
--- 2/drivers/dma/ioat/Makefile
+++ b/drivers/dma/ioat/Makefile
@@@ -1,2 -1,2 +1,3 @@@
   obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
--ioatdma-y := pci.o dma.o dma_v2.o dma_v3.o dca.o
++dca-$(CONFIG_DCA) := dca.o
++ioatdma-y := pci.o dma.o dma_v2.o dma_v3.o $(dca-y) $(dca-m)
diff --cc drivers/dma/ioat/dca.c

index abd9038,abd9038..fb188d1
--- 1/drivers/dma/ioat/dca.c
--- 2/drivers/dma/ioat/dca.c
+++ b/drivers/dma/ioat/dca.c
@@@ -682,3 -682,3 +682,15 @@@ ioat3_dca_init(struct pci_dev *pdev, vo
   
         return dca;
   }
++
++/*
++ * Unregister and free the DCA provider attached to this PCI device's
++ * ioatdma_device, if there is one, and clear the pointer afterwards.
++ */
++void ioat_remove_dca_provider(struct pci_dev *pdev)
++{
++      struct ioatdma_device *ioat = pci_get_drvdata(pdev);
++
++      if (ioat->dca) {
++              unregister_dca_provider(ioat->dca, &pdev->dev);
++              free_dca_provider(ioat->dca);
++              ioat->dca = NULL;
++      }
++}
diff --cc drivers/dma/ioat/dma.h

index 5216c8a,5216c8a..6a8f223
--- 1/drivers/dma/ioat/dma.h
--- 2/drivers/dma/ioat/dma.h
+++ b/drivers/dma/ioat/dma.h
@@@ -347,4 -347,4 +347,21 @@@ void ioat_kobject_del(struct ioatdma_de
   extern const struct sysfs_ops ioat_sysfs_ops;
   extern struct ioat_sysfs_entry ioat_version_attr;
   extern struct ioat_sysfs_entry ioat_cap_attr;
++
++#ifndef CONFIG_XEN
++void ioat_remove_dca_provider(struct pci_dev *);
++#else
++/*
++ * CONFIG_XEN variant: there is nothing to remove; a device that still has
++ * a DCA provider attached at this point triggers BUG_ON().
++ */
++static inline void ioat_remove_dca_provider(struct pci_dev *pdev)
++{
++      struct ioatdma_device *device = pci_get_drvdata(pdev);
++      BUG_ON(device->dca);
++}
++/*
++ * CONFIG_XEN stand-in for the DCA init routines: ignores its arguments
++ * and always reports that no provider was created (returns NULL).
++ */
++static inline struct dca_provider *__devinit
++__ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase)
++{
++      return NULL;
++}
++#define ioat_dca_init __ioat_dca_init
++#endif
++
   #endif /* IOATDMA_H */
diff --cc drivers/dma/ioat/dma_v2.h

index a2c413b,a2c413b..1a61a42
--- 1/drivers/dma/ioat/dma_v2.h
--- 2/drivers/dma/ioat/dma_v2.h
+++ b/drivers/dma/ioat/dma_v2.h
@@@ -176,4 -176,4 +176,10 @@@ int ioat2_quiesce(struct ioat_chan_comm
   int ioat2_reset_sync(struct ioat_chan_common *chan, unsigned long tmo);
   extern struct kobj_type ioat2_ktype;
   extern struct kmem_cache *ioat2_cache;
++
++#ifdef CONFIG_XEN
++#define ioat2_dca_init __ioat_dca_init
++#define ioat3_dca_init __ioat_dca_init
++#endif
++
   #endif /* IOATDMA_V2_H */
diff --cc drivers/dma/ioat/hw.h

index 60e6754,60e6754..3d0dca6
--- 1/drivers/dma/ioat/hw.h
--- 2/drivers/dma/ioat/hw.h
+++ b/drivers/dma/ioat/hw.h
@@@ -39,7 -39,7 +39,11 @@@
   #define IOAT_VER_3_0            0x30    /* Version 3.0 */
   #define IOAT_VER_3_2            0x32    /* Version 3.2 */
   
++#ifndef CONFIG_XEN
   int system_has_dca_enabled(struct pci_dev *pdev);
++#else
++/* CONFIG_XEN variant: always reports DCA as not enabled (returns 0). */
++static inline int system_has_dca_enabled(struct pci_dev *pdev) { return 0; }
++#endif
   
   struct ioat_dma_descriptor {
         uint32_t        size;
diff --cc drivers/dma/ioat/pci.c

index fab37d1,fab37d1..a74f381
--- 1/drivers/dma/ioat/pci.c
--- 2/drivers/dma/ioat/pci.c
+++ b/drivers/dma/ioat/pci.c
@@@ -29,7 -29,7 +29,6 @@@
   #include <linux/module.h>
   #include <linux/pci.h>
   #include <linux/interrupt.h>
--#include <linux/dca.h>
   #include <linux/slab.h>
   #include "dma.h"
   #include "dma_v2.h"
@@@ -170,11 -170,11 +169,7 @@@ static void __devexit ioat_remove(struc
                 return;
   
         dev_err(&pdev->dev, "Removing dma and dca services\n");
--      if (device->dca) {
--              unregister_dca_provider(device->dca, &pdev->dev);
--              free_dca_provider(device->dca);
--              device->dca = NULL;
--      }
++      ioat_remove_dca_provider(pdev);
         ioat_dma_remove(device);
   }
   
diff --cc drivers/edac/Kconfig

index af1a17d,af1a17d..6a8d6f4
--- 1/drivers/edac/Kconfig
--- 2/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@@ -77,6 -77,6 +77,7 @@@ config EDAC_MC
   config EDAC_AMD64
         tristate "AMD64 (Opteron, Athlon64) K8, F10h"
         depends on EDAC_MM_EDAC && AMD_NB && X86_64 && EDAC_DECODE_MCE
++      depends on !XEN
         help
           Support for error detection and correction of DRAM ECC errors on
           the AMD64 families of memory controllers (K8 and F10h)
diff --cc drivers/edac/edac_mc.c

index d69144a,1d80560..eaaaa9d
--- 1/drivers/edac/edac_mc.c
--- 2/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@@ -611,6 -615,6 +615,10 @@@ static void edac_mc_scrub_block(unsigne
   
         debugf3("%s()\n", __func__);
   
++#ifdef CONFIG_XEN
++      page = mfn_to_local_pfn(page);
++#endif
++
         /* ECC error page was not in our memory. Ignore it. */
         if (!pfn_valid(page))
                 return;
diff --cc drivers/edac/i7core_edac.c

index 04f1e7c,465cbc2..f6da81e
--- 1/drivers/edac/i7core_edac.c
--- 2/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@@ -1842,8 -1842,8 +1842,11 @@@ static int i7core_mce_check_error(void 
         if (mce->bank != 8)
                 return 0;
   
--#ifdef CONFIG_SMP
         /* Only handle if it is the right mc controller */
++#if defined(CONFIG_XEN) /* Could easily be used for native too. */
++      if (mce->socketid != pvt->i7core_dev->socket)
++              return 0;
++#elif defined(CONFIG_SMP)
         if (cpu_data(mce->cpu).phys_proc_id != pvt->i7core_dev->socket)
                 return 0;
   #endif
diff --cc drivers/firmware/Kconfig

index efba163,b3a25a5..0ee0d99
--- 1/drivers/firmware/Kconfig
--- 2/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@@ -91,6 -91,6 +91,7 @@@ config DELL_RB
   config DCDBAS
         tristate "Dell Systems Management Base Driver"
         depends on X86
++      select XEN_DOMCTL if XEN
         help
           The Dell Systems Management Base Driver provides a sysfs interface
           for systems management software to perform System Management
diff --cc drivers/firmware/dcdbas.c

index ea5ac2d,ea5ac2d..156a75d
--- 1/drivers/firmware/dcdbas.c
--- 2/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@@ -37,6 -37,6 +37,10 @@@
   #include <linux/mutex.h>
   #include <asm/io.h>
   
++#ifdef CONFIG_XEN
++#include "../xen/core/domctl.h"
++#endif
++
   #include "dcdbas.h"
   
   #define DRIVER_NAME           "dcdbas"
@@@ -107,7 -107,7 +111,7 @@@ static int smi_data_buf_realloc(unsigne
         /* set up new buffer for use */
         smi_data_buf = buf;
         smi_data_buf_handle = handle;
--      smi_data_buf_phys_addr = (u32) virt_to_phys(buf);
++      smi_data_buf_phys_addr = (u32) handle;
         smi_data_buf_size = size;
   
         dev_dbg(&dcdbas_pdev->dev, "%s: phys: %x size: %lu\n",
@@@ -245,7 -245,7 +249,9 @@@ static ssize_t host_control_on_shutdown
    */
   int dcdbas_smi_request(struct smi_cmd *smi_cmd)
   {
++#ifndef CONFIG_XEN
         cpumask_var_t old_mask;
++#endif
         int ret = 0;
   
         if (smi_cmd->magic != SMI_CMD_MAGIC) {
@@@ -255,6 -255,6 +261,7 @@@
         }
   
         /* SMI requires CPU 0 */
++#ifndef CONFIG_XEN
         if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
                 return -ENOMEM;
   
@@@ -266,6 -266,6 +273,14 @@@
                 ret = -EBUSY;
                 goto out;
         }
++#else
++      ret = xen_set_physical_cpu_affinity(0);
++      if (ret) {
++              dev_dbg(&dcdbas_pdev->dev, "%s: failed (%d) to get CPU 0\n",
++                      __func__, ret);
++              return ret;
++      }
++#endif
   
         /* generate SMI */
         /* inb to force posted write through and make SMI happen now */
@@@ -280,9 -280,9 +295,13 @@@
                 : "memory"
         );
   
++#ifndef CONFIG_XEN
   out:
         set_cpus_allowed_ptr(current, old_mask);
         free_cpumask_var(old_mask);
++#else
++      xen_set_physical_cpu_affinity(-1);
++#endif
         return ret;
   }
   
@@@ -322,7 -322,7 +341,7 @@@ static ssize_t smi_request_store(struc
                 break;
         case 1:
                 /* Calling Interface SMI */
--              smi_cmd->ebx = (u32) virt_to_phys(smi_cmd->command_buffer);
++              smi_cmd->ebx = (u32) virt_to_bus(smi_cmd->command_buffer);
                 ret = dcdbas_smi_request(smi_cmd);
                 if (!ret)
                         ret = count;
@@@ -603,6 -603,6 +622,11 @@@ static int __init dcdbas_init(void
   {
         int error;
   
++#ifdef CONFIG_XEN
++      if (!is_initial_xendomain())
++              return -ENODEV;
++#endif
++
         error = platform_driver_register(&dcdbas_driver);
         if (error)
                 return error;
diff --cc drivers/firmware/dell_rbu.c

index 2f452f1,2f452f1..fb60177
--- 1/drivers/firmware/dell_rbu.c
--- 2/drivers/firmware/dell_rbu.c
+++ b/drivers/firmware/dell_rbu.c
@@@ -170,9 -170,9 +170,27 @@@ static int create_packet(void *data, si
                         spin_lock(&rbu_data.lock);
                         goto out_alloc_packet_array;
                 }
++#ifdef CONFIG_XEN
++              if (ordernum && xen_create_contiguous_region(
++                      (unsigned long)packet_data_temp_buf, ordernum, 0)) {
++                      free_pages((unsigned long)packet_data_temp_buf,
++                                 ordernum);
++                      pr_warning("dell_rbu:%s: failed to adjust new "
++                                 "packet\n", __func__);
++                      retval = -ENOMEM;
++                      spin_lock(&rbu_data.lock);
++                      goto out_alloc_packet_array;
++              }
++#endif
   
--              if ((unsigned long)virt_to_phys(packet_data_temp_buf)
++              if ((unsigned long)virt_to_bus(packet_data_temp_buf)
                                 < allocation_floor) {
++#ifdef CONFIG_XEN
++                      if (ordernum)
++                              xen_destroy_contiguous_region(
++                                      (unsigned long)packet_data_temp_buf,
++                                      ordernum);
++#endif
                         pr_debug("packet 0x%lx below floor at 0x%lx.\n",
                                         (unsigned long)virt_to_phys(
                                                 packet_data_temp_buf),
@@@ -186,7 -186,7 +204,7 @@@
         newpacket->data = packet_data_temp_buf;
   
         pr_debug("create_packet: newpacket at physical addr %lx\n",
--              (unsigned long)virt_to_phys(newpacket->data));
++              (unsigned long)virt_to_bus(newpacket->data));
   
         /* packets may not have fixed size */
         newpacket->length = length;
@@@ -205,7 -205,7 +223,7 @@@ out_alloc_packet_array
         /* always free packet array */
         for (;idx>0;idx--) {
                 pr_debug("freeing unused packet below floor 0x%lx.\n",
--                      (unsigned long)virt_to_phys(
++                      (unsigned long)virt_to_bus(
                                 invalid_addr_packet_array[idx-1]));
                 free_pages((unsigned long)invalid_addr_packet_array[idx-1],
                         ordernum);
@@@ -349,6 -349,6 +367,13 @@@ static void packet_empty_list(void
                  * to make sure there are no stale RBU packets left in memory
                  */
                 memset(newpacket->data, 0, rbu_data.packetsize);
++#ifdef CONFIG_XEN
++              if (newpacket->ordernum)
++                      xen_destroy_contiguous_region(
++                              (unsigned long)newpacket->data,
++                              newpacket->ordernum);
++#endif
++
                 free_pages((unsigned long) newpacket->data,
                         newpacket->ordernum);
                 kfree(newpacket);
@@@ -403,7 -403,7 +428,9 @@@ static int img_update_realloc(unsigned 
   {
         unsigned char *image_update_buffer = NULL;
         unsigned long rc;
++#ifndef CONFIG_XEN
         unsigned long img_buf_phys_addr;
++#endif
         int ordernum;
         int dma_alloc = 0;
   
@@@ -434,15 -434,15 +461,19 @@@
   
         spin_unlock(&rbu_data.lock);
   
++#ifndef CONFIG_XEN
         ordernum = get_order(size);
         image_update_buffer =
                 (unsigned char *) __get_free_pages(GFP_KERNEL, ordernum);
   
         img_buf_phys_addr =
--              (unsigned long) virt_to_phys(image_update_buffer);
++              (unsigned long) virt_to_bus(image_update_buffer);
   
         if (img_buf_phys_addr > BIOS_SCAN_LIMIT) {
                 free_pages((unsigned long) image_update_buffer, ordernum);
++#else
++      {
++#endif
                 ordernum = -1;
                 image_update_buffer = dma_alloc_coherent(NULL, size,
                         &dell_rbu_dmaaddr, GFP_KERNEL);
@@@ -695,6 -695,6 +726,12 @@@ static struct bin_attribute rbu_packet_
   static int __init dcdrbu_init(void)
   {
         int rc;
++
++#ifdef CONFIG_XEN
++      if (!is_initial_xendomain())
++              return -ENODEV;
++#endif
++
         spin_lock_init(&rbu_data.lock);
   
         init_packet_head();
diff --cc drivers/firmware/dmi_scan.c

index bcb1126,bcb1126..d0c8818
--- 1/drivers/firmware/dmi_scan.c
--- 2/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@@ -482,6 -482,6 +482,11 @@@ static bool dmi_matches(const struct dm
   {
         int i;
   
++#ifdef CONFIG_XEN
++      if (!is_initial_xendomain())
++              return false;
++#endif
++
         WARN(!dmi_initialized, KERN_ERR "dmi check: not initialized yet.\n");
   
         for (i = 0; i < ARRAY_SIZE(dmi->matches); i++) {
diff --cc drivers/gpu/drm/i915/i915_drv.c

index 0defd42,32d1b3e..23b9fc8
--- 1/drivers/gpu/drm/i915/i915_drv.c
--- 2/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@@ -789,7 -731,7 +731,7 @@@ static struct drm_driver driver = 
                  .open = drm_open,
                  .release = drm_release,
                  .unlocked_ioctl = drm_ioctl,
--               .mmap = drm_gem_mmap,
++               .mmap = i915_gem_mmap,
                  .poll = drm_poll,
                  .fasync = drm_fasync,
                  .read = drm_read,
diff --cc drivers/gpu/drm/i915/i915_drv.h

index ee66035,1c1b27c..5c10731
--- 1/drivers/gpu/drm/i915/i915_drv.h
--- 2/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@@ -1210,6 -1165,6 +1165,11 @@@ int __must_check i915_add_request(struc
                                   struct drm_i915_gem_request *request);
   int __must_check i915_wait_request(struct intel_ring_buffer *ring,
                                    uint32_t seqno);
++#ifdef CONFIG_XEN
++int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma);
++#else
++#define i915_gem_mmap drm_gem_mmap
++#endif
   int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
   int __must_check
   i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
diff --cc drivers/gpu/drm/i915/i915_gem.c

index 0b2e167,7ce3f35..d0619f4
--- 1/drivers/gpu/drm/i915/i915_gem.c
--- 2/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@@ -1172,6 -1174,6 +1174,17 @@@ i915_gem_mmap_ioctl(struct drm_device *
         return 0;
   }
   
++#ifdef CONFIG_XEN
++/*
++ * Xen wrapper around drm_gem_mmap(): performs the regular GEM mmap and
++ * then sets _PAGE_IOMAP in the VMA's page protection. The flag is set
++ * regardless of the result, which is returned unchanged.
++ */
++int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma)
++{
++      int rc;
++
++      rc = drm_gem_mmap(filp, vma);
++      pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP;
++
++      return rc;
++}
++#endif
++
   /**
    * i915_gem_fault - fault a page into the GTT
    * vma: VMA in question
diff --cc drivers/gpu/drm/radeon/radeon_device.c

index 5b61364,890217e..4480b23
--- 1/drivers/gpu/drm/radeon/radeon_device.c
--- 2/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@@ -431,6 -431,6 +431,18 @@@ int radeon_dummy_page_init(struct radeo
         rdev->dummy_page.page = alloc_page(GFP_DMA32 | GFP_KERNEL | __GFP_ZERO);
         if (rdev->dummy_page.page == NULL)
                 return -ENOMEM;
++#ifdef CONFIG_XEN
++      {
++              int ret = xen_limit_pages_to_max_mfn(rdev->dummy_page.page,
++                                                   0, 32);
++
++              if (!ret)
++                      clear_page(page_address(rdev->dummy_page.page));
++              else
++                      dev_warn(rdev->dev,
++                               "Error restricting dummy page: %d\n", ret);
++      }
++#endif
         rdev->dummy_page.addr = pci_map_page(rdev->pdev, rdev->dummy_page.page,
                                         0, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
         if (pci_dma_mapping_error(rdev->pdev, rdev->dummy_page.addr)) {
diff --cc drivers/gpu/drm/ttm/ttm_bo.c

index 2e618b5,2e618b5..f07b550
--- 1/drivers/gpu/drm/ttm/ttm_bo.c
--- 2/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@@ -1441,6 -1441,6 +1441,14 @@@ int ttm_bo_global_init(struct drm_globa
                 ret = -ENOMEM;
                 goto out_no_drp;
         }
++#ifdef CONFIG_XEN
++      ret = xen_limit_pages_to_max_mfn(glob->dummy_read_page, 0, 32);
++      if (!ret)
++              clear_page(page_address(glob->dummy_read_page));
++      else
++              printk(KERN_WARNING
++                     "Error restricting dummy read page: %d\n", ret);
++#endif
   
         INIT_LIST_HEAD(&glob->swap_lru);
         INIT_LIST_HEAD(&glob->device_list);
diff --cc drivers/gpu/drm/ttm/ttm_bo_vm.c

index 221b924,221b924..13ccb5a
--- 1/drivers/gpu/drm/ttm/ttm_bo_vm.c
--- 2/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@@ -169,7 -169,7 +169,13 @@@ static int ttm_bo_vm_fault(struct vm_ar
         if (bo->mem.bus.is_iomem) {
                 vma->vm_page_prot = ttm_io_prot(bo->mem.placement,
                                                 vma->vm_page_prot);
++#if defined(CONFIG_XEN) && defined(_PAGE_IOMAP)
++              pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP;
++#endif
         } else {
++#if defined(CONFIG_XEN) && defined(_PAGE_IOMAP)
++              pgprot_val(vma->vm_page_prot) &= ~_PAGE_IOMAP;
++#endif
                 ttm = bo->ttm;
                 vma->vm_page_prot = (bo->mem.placement & TTM_PL_FLAG_CACHED) ?
                     vm_get_page_prot(vma->vm_flags) :
diff --cc drivers/gpu/drm/ttm/ttm_page_alloc.c

index d948575,9d9d929..5d3c6b2
--- 1/drivers/gpu/drm/ttm/ttm_page_alloc.c
--- 2/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@@ -500,6 -498,6 +498,19 @@@ static int ttm_alloc_new_pages(struct l
         for (i = 0, cpages = 0; i < count; ++i) {
                 p = alloc_page(gfp_flags);
   
++#ifdef CONFIG_XEN
++              if (p && (gfp_flags & __GFP_DMA32)) {
++                      r = xen_limit_pages_to_max_mfn(p, 0, 32);
++                      if (r) {
++                              __free_page(p);
++                              printk(KERN_ERR TTM_PFX
++                                     "Cannot restrict page (%d).", r);
++                              p = NULL;
++                      } else if (gfp_flags & __GFP_ZERO)
++                              clear_page(page_address(p));
++              }
++#endif
++
                 if (!p) {
                         printk(KERN_ERR TTM_PFX "Unable to get page %u.\n", i);
   
diff --cc drivers/gpu/drm/vmwgfx/Kconfig

index 30ad133,30ad133..05d5e93
--- 1/drivers/gpu/drm/vmwgfx/Kconfig
--- 2/drivers/gpu/drm/vmwgfx/Kconfig
+++ b/drivers/gpu/drm/vmwgfx/Kconfig
@@@ -1,6 -1,6 +1,6 @@@
   config DRM_VMWGFX
         tristate "DRM driver for VMware Virtual GPU"
--      depends on DRM && PCI && FB
++      depends on DRM && PCI && FB && !XEN
         select FB_DEFERRED_IO
         select FB_CFB_FILLRECT
         select FB_CFB_COPYAREA
diff --cc drivers/hid/hid-core.c
Simple merge
diff --cc drivers/hid/hid-ids.h

index d8fe114,00a94b5..549e243
--- 1/drivers/hid/hid-ids.h
--- 2/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@@ -226,10 -220,7 +220,9 @@@
   #define USB_VENDOR_ID_DREAM_CHEEKY    0x1d34
   
   #define USB_VENDOR_ID_ELO             0x04E7
+ +#define USB_DEVICE_ID_ELO_4000U               0x0009
- #define USB_DEVICE_ID_ELO_TS2515      0x0022
   #define USB_DEVICE_ID_ELO_TS2700      0x0020
+ +#define USB_DEVICE_ID_ELO_4500U               0x0030
   
   #define USB_VENDOR_ID_EMS             0x2006
   #define USB_DEVICE_ID_EMS_TRIO_LINKER_PLUS_II 0x0118
diff --cc drivers/hwmon/Kconfig

index 16db83c,50e40db..aa26508
--- 1/drivers/hwmon/Kconfig
--- 2/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@@ -412,12 -402,19 +402,21 @@@ config SENSORS_GPIO_FA
   
   config SENSORS_CORETEMP
         tristate "Intel Core/Core2/Atom temperature sensor"
--      depends on X86 && PCI && EXPERIMENTAL
++      depends on X86 && PCI && !XEN_UNPRIVILEGED_GUEST && EXPERIMENTAL
++      select XEN_DOMCTL if XEN
         help
           If you say yes here you get support for the temperature
           sensor inside your CPU. Most of the family 6 CPUs
           are supported. Check Documentation/hwmon/coretemp for details.
   
+ config SENSORS_PKGTEMP
+       tristate "Intel processor package temperature sensor"
- -      depends on X86 && EXPERIMENTAL
++      depends on X86 && !XEN_UNPRIVILEGED_GUEST && EXPERIMENTAL
++      select XEN_DOMCTL if XEN
+       help
+         If you say yes here you get support for the package level temperature
+         sensor inside your CPU. Check documentation/driver for details.
+ 
   config SENSORS_IBMAEM
         tristate "IBM Active Energy Manager temperature/power sensors and control"
         select IPMI_SI
@@@ -1141,7 -1069,7 +1071,8 @@@ config SENSORS_TWL4030_MAD
   
   config SENSORS_VIA_CPUTEMP
         tristate "VIA CPU temperature sensor"
--      depends on X86
++      depends on X86 && !XEN_UNPRIVILEGED_GUEST
++      select XEN_DOMCTL if XEN
         help
           If you say yes here you get support for the temperature
           sensor inside your CPU. Supported are all known variants of
diff --cc drivers/hwmon/coretemp-xen.c

index 0000000,0000000..1e058c5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/hwmon/coretemp-xen.c
@@@ -1,0 -1,0 +1,623 @@@
++/*
++ * coretemp.c - Linux kernel module for hardware monitoring
++ *
++ * Copyright (C) 2007 Rudolf Marek <r.marek@assembler.cz>
++ *
++ * Inspired from many hwmon drivers
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; version 2 of the License.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
++ * 02110-1301 USA.
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/hwmon.h>
++#include <linux/sysfs.h>
++#include <linux/hwmon-sysfs.h>
++#include <linux/err.h>
++#include <linux/mutex.h>
++#include <linux/list.h>
++#include <linux/platform_device.h>
++#include <linux/pci.h>
++#include <asm/msr.h>
++#include <xen/pcpu.h>
++#include "../xen/core/domctl.h"
++
++#define DRVNAME       "coretemp"
++#define coretemp_data pdev_entry
++
++/*
++ * Selector stored as the index of each sensor attribute; show_temp() and
++ * show_name() compare attr->index against these values to decide which
++ * field to print.
++ */
++typedef enum { SHOW_TEMP, SHOW_TJMAX, SHOW_TTARGET, SHOW_LABEL,
++              SHOW_NAME } SHOW;
++
++/*
++ * Functions declaration
++ */
++
++static struct coretemp_data *coretemp_update_device(struct device *dev);
++
++/*
++ * Per-sensor state. An entry is allocated in coretemp_device_add(), set as
++ * the driver data of its platform device and linked into pdev_list.
++ * The rest of the file also refers to it as "struct coretemp_data" via the
++ * coretemp_data macro.
++ */
++struct pdev_entry {
++      struct list_head list;          /* link in pdev_list */
++      struct platform_device *pdev;   /* pdev->id is the physical CPU number */
++      struct device *hwmon_dev;       /* result of hwmon_device_register() */
++      struct mutex update_lock;       /* taken by coretemp_update_device() */
++      const char *name;               /* set to "coretemp" in probe */
++      u32 cpu_core_id, phys_proc_id;  /* from xen_get_topology_info() */
++      u8 x86_model, x86_mask;         /* decoded from CPUID leaf 1 EAX */
++      u32 ucode_rev;                  /* ~0 when it could not be read */
++      char valid;             /* zero until following fields are valid */
++      unsigned long last_updated;     /* in jiffies */
++      int temp;                       /* tjmax minus the MSR readout * 1000 */
++      int tjmax;                      /* 100000 stands for 100 C */
++      int ttarget;                    /* shown as temp1_max */
++      u8 alarm;                       /* bit 5 of IA32_THERM_STATUS */
++};
++
++/*
++ * Sysfs stuff
++ */
++
++/*
++ * sysfs show callback shared by the "name" and "temp1_label" attributes.
++ * Prints the driver name when the attribute index is SHOW_NAME and the
++ * "Core <id>" label otherwise. Returns the number of bytes written.
++ */
++static ssize_t show_name(struct device *dev, struct device_attribute
++                        *devattr, char *buf)
++{
++      struct coretemp_data *data = dev_get_drvdata(dev);
++
++      /* anything but SHOW_NAME is the label */
++      if (to_sensor_dev_attr(devattr)->index != SHOW_NAME)
++              return sprintf(buf, "Core %d\n", data->cpu_core_id);
++
++      return sprintf(buf, "%s\n", data->name);
++}
++
++/*
++ * sysfs show callback for temp1_crit_alarm: refreshes the cached reading
++ * and prints the alarm flag. The flag is only read, never cleared.
++ */
++static ssize_t show_alarm(struct device *dev, struct device_attribute
++                        *devattr, char *buf)
++{
++      const u8 alarm = coretemp_update_device(dev)->alarm;
++
++      return sprintf(buf, "%d\n", alarm);
++}
++
++/*
++ * sysfs show callback for the temperature attributes. The attribute index
++ * selects the value: SHOW_TEMP prints the current reading (or fails with
++ * -EAGAIN while no valid reading is cached), SHOW_TJMAX prints tjmax and
++ * any other index prints ttarget.
++ */
++static ssize_t show_temp(struct device *dev,
++                       struct device_attribute *devattr, char *buf)
++{
++      struct coretemp_data *data = coretemp_update_device(dev);
++
++      switch (to_sensor_dev_attr(devattr)->index) {
++      case SHOW_TEMP:
++              if (!data->valid)
++                      return -EAGAIN;
++              return sprintf(buf, "%d\n", data->temp);
++      case SHOW_TJMAX:
++              return sprintf(buf, "%d\n", data->tjmax);
++      default:
++              return sprintf(buf, "%d\n", data->ttarget);
++      }
++}
++
++/*
++ * Read-only sysfs attributes. The last macro argument is the SHOW_*
++ * selector that show_temp()/show_name() dispatch on.
++ */
++static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, show_temp, NULL,
++                        SHOW_TEMP);
++static SENSOR_DEVICE_ATTR(temp1_crit, S_IRUGO, show_temp, NULL,
++                        SHOW_TJMAX);
++static SENSOR_DEVICE_ATTR(temp1_max, S_IRUGO, show_temp, NULL,
++                        SHOW_TTARGET);
++static DEVICE_ATTR(temp1_crit_alarm, S_IRUGO, show_alarm, NULL);
++static SENSOR_DEVICE_ATTR(temp1_label, S_IRUGO, show_name, NULL, SHOW_LABEL);
++static SENSOR_DEVICE_ATTR(name, S_IRUGO, show_name, NULL, SHOW_NAME);
++
++/*
++ * Attributes created for every device. temp1_max is deliberately not
++ * listed: coretemp_probe() creates it separately, and only when
++ * IA32_TEMPERATURE_TARGET could be read.
++ */
++static struct attribute *coretemp_attributes[] = {
++      &sensor_dev_attr_name.dev_attr.attr,
++      &sensor_dev_attr_temp1_label.dev_attr.attr,
++      &dev_attr_temp1_crit_alarm.attr,
++      &sensor_dev_attr_temp1_input.dev_attr.attr,
++      &sensor_dev_attr_temp1_crit.dev_attr.attr,
++      NULL
++};
++
++/* Group handed to sysfs_create_group()/sysfs_remove_group(). */
++static const struct attribute_group coretemp_group = {
++      .attrs = coretemp_attributes,
++};
++
++/*
++ * Refresh the cached sensor state of @dev and return it.
++ *
++ * IA32_THERM_STATUS is re-read on the physical CPU the device belongs to
++ * when no valid reading is cached or the cached one is older than HZ
++ * jiffies. data->valid is set only when the MSR could be read and its
++ * "reading valid" bit (bit 31) is set; otherwise the data stays invalid and
++ * show_temp() reports -EAGAIN.
++ *
++ * A failed MSR read no longer fabricates an all-ones register value: that
++ * used to be cached as a valid temperature of tjmax - 127000 with the
++ * alarm flag forced on. The alarm flag now keeps its previous value.
++ */
++static struct coretemp_data *coretemp_update_device(struct device *dev)
++{
++      struct coretemp_data *data = dev_get_drvdata(dev);
++
++      mutex_lock(&data->update_lock);
++
++      if (!data->valid || time_after(jiffies, data->last_updated + HZ)) {
++              u32 eax = 0, edx = 0;
++              int err;
++
++              data->valid = 0;
++              err = rdmsr_safe_on_pcpu(data->pdev->id, MSR_IA32_THERM_STATUS,
++                                       &eax, &edx);
++              if (err < 0) {
++                      dev_dbg(dev, "Unable to read THERM_STATUS MSR (%d)\n",
++                              err);
++              } else {
++                      data->alarm = (eax >> 5) & 1;
++                      /* update only if data has been valid */
++                      if (eax & 0x80000000) {
++                              data->temp = data->tjmax - (((eax >> 16)
++                                                              & 0x7f) * 1000);
++                              data->valid = 1;
++                      } else {
++                              dev_dbg(dev,
++                                      "Temperature data invalid (0x%x)\n",
++                                      eax);
++                      }
++              }
++              data->last_updated = jiffies;
++      }
++
++      mutex_unlock(&data->update_lock);
++      return data;
++}
++
++/*
++ * Heuristically determine TjMax for CPUs that get_tjmax() hands over
++ * (models 0x17 and 0x1c in its fallback switch).
++ *
++ * @c:   sensor entry; only x86_model and x86_mask are read
++ * @id:  physical CPU number passed to rdmsr_safe_on_pcpu()
++ * @dev: device used for the diagnostic messages
++ *
++ * Starts from 100000 (100 C) and refines the value: Atoms are decided by
++ * the PCI host bridge ID, other CPUs by MSR 0x17 (mobile/platform ID) and
++ * MSR 0xEE where those are applicable. Returns the chosen TjMax in the
++ * same unit.
++ */
++static int adjust_tjmax(struct coretemp_data *c, u32 id, struct device *dev)
++{
++      /* The 100C is default for both mobile and non mobile CPUs */
++
++      int tjmax = 100000;
++      int tjmax_ee = 85000;   /* used instead when MSR 0xEE bit 30 is set */
++      int usemsr_ee = 1;      /* cleared whenever MSR 0xEE must not be used */
++      int err;
++      u32 eax, edx;
++      struct pci_dev *host_bridge;
++
++      /* Early chips have no MSR for TjMax */
++
++      if ((c->x86_model == 0xf) && (c->x86_mask < 4)) {
++              usemsr_ee = 0;
++      }
++
++      /* Atom CPUs */
++
++      if (c->x86_model == 0x1c) {
++              usemsr_ee = 0;
++
++              host_bridge = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0));
++
++              if (host_bridge && host_bridge->vendor == PCI_VENDOR_ID_INTEL
++                  && (host_bridge->device == 0xa000   /* NM10 based nettop */
++                  || host_bridge->device == 0xa010))  /* NM10 based netbook */
++                      tjmax = 100000;
++              else
++                      tjmax = 90000;
++
++              pci_dev_put(host_bridge);
++      }
++
++      if ((c->x86_model > 0xe) && (usemsr_ee)) {
++              u8 platform_id;
++
++              /* Now we can detect the mobile CPU using Intel provided table
++                 http://softwarecommunity.intel.com/Wiki/Mobility/720.htm
++                 For Core2 cores, check MSR 0x17, bit 28 1 = Mobile CPU
++              */
++
++              err = rdmsr_safe_on_pcpu(id, 0x17, &eax, &edx);
++              if (err < 0) {
++                      dev_warn(dev,
++                               "Unable to access MSR 0x17, assuming desktop"
++                               " CPU\n");
++                      usemsr_ee = 0;
++              } else if (c->x86_model < 0x17 && !(eax & 0x10000000)) {
++                      /* Trust bit 28 up to Penryn, I could not find any
++                         documentation on that; if you happen to know
++                         someone at Intel please ask */
++                      usemsr_ee = 0;
++              } else {
++                      /* Platform ID bits 52:50 (EDX starts at bit 32) */
++                      platform_id = (edx >> 18) & 0x7;
++
++                      /* Mobile Penryn CPU seems to be platform ID 7 or 5
++                        (guesswork) */
++                      if ((c->x86_model == 0x17) &&
++                          ((platform_id == 5) || (platform_id == 7))) {
++                              /* If MSR EE bit is set, set it to 90 degrees C,
++                                 otherwise 105 degrees C */
++                              tjmax_ee = 90000;
++                              tjmax = 105000;
++                      }
++              }
++      }
++
++      if (usemsr_ee) {
++
++              err = rdmsr_safe_on_pcpu(id, 0xee, &eax, &edx);
++              if (err < 0) {
++                      dev_warn(dev,
++                               "Unable to access MSR 0xEE, for Tjmax, left"
++                               " at default\n");
++              } else if (eax & 0x40000000) {
++                      tjmax = tjmax_ee;
++              }
++      /* if we don't use MSR 0xEE it means we are a desktop CPU (with the
++         exception of Atom) */
++      } else if (tjmax == 100000) {
++              dev_warn(dev, "Using relative temperature scale!\n");
++      }
++
++      return tjmax;
++}
++
++/*
++ * Determine TjMax for the sensor on physical CPU @id.
++ *
++ * The value is taken from bits 23:16 of IA32_TEMPERATURE_TARGET when the
++ * MSR is readable and the result is plausible (above 80 and below 120).
++ * Otherwise an assumption based on the CPU model in @c is returned, which
++ * may not be correct. @dev is used for the diagnostic messages.
++ */
++static int get_tjmax(struct coretemp_data *c, u32 id, struct device *dev)
++{
++      u32 lo, hi;
++
++      if (rdmsr_safe_on_pcpu(id, MSR_IA32_TEMPERATURE_TARGET, &lo, &hi) < 0) {
++              dev_warn(dev, "Unable to read TjMax from CPU.\n");
++      } else {
++              u32 tj = (lo >> 16) & 0xff;
++
++              /* an implausible value falls through to the model table */
++              if (tj > 80 && tj < 120) {
++                      dev_info(dev, "TjMax is %d C.\n", tj);
++                      return tj * 1000;
++              }
++      }
++
++      /* Penryn (0x17) and Atom (0x1c) need the finer heuristics */
++      if (c->x86_model == 0x17 || c->x86_model == 0x1c)
++              return adjust_tjmax(c, id, dev);
++
++      if (c->x86_model == 0xe || c->x86_model == 0xf ||
++          c->x86_model == 0x16 || c->x86_model == 0x1a)
++              dev_warn(dev, "TjMax is assumed as 100 C!\n");
++      else
++              dev_warn(dev, "CPU (model=0x%x) is not supported yet,"
++                      " using default TjMax of 100C.\n", c->x86_model);
++
++      return 100000;
++}
++
++/*
++ * Platform driver probe for one sensor device.
++ *
++ * The driver data was attached to @pdev by coretemp_device_add(). This
++ * checks that IA32_THERM_STATUS is readable on the physical CPU pdev->id,
++ * rejects CPUs affected by erratum AE18, determines TjMax and creates the
++ * sysfs attributes and the hwmon class device.
++ *
++ * Returns 0 on success; otherwise the error of the failing step, or
++ * -ENODEV for the erratum case.
++ */
++static int coretemp_probe(struct platform_device *pdev)
++{
++      struct coretemp_data *data = platform_get_drvdata(pdev);
++      int err;
++      u32 eax, edx;
++
++      data->name = "coretemp";
++      mutex_init(&data->update_lock);
++
++      /* test if we can access the THERM_STATUS MSR */
++      err = rdmsr_safe_on_pcpu(pdev->id, MSR_IA32_THERM_STATUS, &eax, &edx);
++      if (err < 0) {
++              dev_err(&pdev->dev,
++                      "Unable to access THERM_STATUS MSR, giving up\n");
++              return err;
++      }
++
++      /* Check if we have problem with errata AE18 of Core processors:
++         Readings might stop update when processor visited too deep sleep,
++         fixed for stepping D0 (6EC).
++      */
++
++      if ((data->x86_model == 0xe) && (data->x86_mask < 0xc)) {
++              /* check for microcode update; ucode_rev == ~0 (so that
++                 ucode_rev + 1 == 0) marks a revision that
++                 get_cpuid_info() could not read */
++              if (!(data->ucode_rev + 1))
++                      dev_warn(&pdev->dev,
++                               "Cannot read microcode revision of CPU\n");
++              else if (data->ucode_rev < 0x39) {
++                      err = -ENODEV;
++                      dev_err(&pdev->dev,
++                              "Errata AE18 not fixed, update BIOS or "
++                              "microcode of the CPU!\n");
++                      return err;
++              }
++      }
++
++      data->tjmax = get_tjmax(data, pdev->id, &pdev->dev);
++
++      /*
++       * read the still undocumented IA32_TEMPERATURE_TARGET. It exists
++       * on older CPUs but not in this register,
++       * Atoms don't have it either.
++       */
++
++      if ((data->x86_model > 0xe) && (data->x86_model != 0x1c)) {
++              err = rdmsr_safe_on_pcpu(pdev->id, MSR_IA32_TEMPERATURE_TARGET,
++                                       &eax, &edx);
++              if (err < 0) {
++                      /* not fatal: the device just gets no temp1_max */
++                      dev_warn(&pdev->dev, "Unable to read"
++                                      " IA32_TEMPERATURE_TARGET MSR\n");
++              } else {
++                      data->ttarget = data->tjmax -
++                                      (((eax >> 8) & 0xff) * 1000);
++                      err = device_create_file(&pdev->dev,
++                                      &sensor_dev_attr_temp1_max.dev_attr);
++                      if (err)
++                              return err;
++              }
++      }
++
++      if ((err = sysfs_create_group(&pdev->dev.kobj, &coretemp_group)))
++              goto exit_dev;
++
++      data->hwmon_dev = hwmon_device_register(&pdev->dev);
++      if (IS_ERR(data->hwmon_dev)) {
++              err = PTR_ERR(data->hwmon_dev);
++              dev_err(&pdev->dev, "Class registration failed (%d)\n",
++                      err);
++              goto exit_class;
++      }
++
++      return 0;
++
++      /* unwind in reverse order of creation */
++exit_class:
++      sysfs_remove_group(&pdev->dev.kobj, &coretemp_group);
++exit_dev:
++      /* called even on paths where temp1_max was never created */
++      device_remove_file(&pdev->dev, &sensor_dev_attr_temp1_max.dev_attr);
++      return err;
++}
++
++/*
++ * Platform driver remove: undo coretemp_probe() by unregistering the
++ * hwmon class device and removing the sysfs attributes. Always returns 0.
++ */
++static int coretemp_remove(struct platform_device *pdev)
++{
++      struct device *dev = &pdev->dev;
++      struct coretemp_data *priv = platform_get_drvdata(pdev);
++
++      hwmon_device_unregister(priv->hwmon_dev);
++      sysfs_remove_group(&dev->kobj, &coretemp_group);
++      device_remove_file(dev, &sensor_dev_attr_temp1_max.dev_attr);
++
++      return 0;
++}
++
++/*
++ * Platform driver; its name matches the DRVNAME that
++ * coretemp_device_add() passes to platform_device_alloc().
++ */
++static struct platform_driver coretemp_driver = {
++      .driver = {
++              .owner = THIS_MODULE,
++              .name = DRVNAME,
++      },
++      .probe = coretemp_probe,
++      .remove = coretemp_remove,
++};
++
++/* All registered sensor entries; pdev_list_mutex guards list changes. */
++static LIST_HEAD(pdev_list);
++static DEFINE_MUTEX(pdev_list_mutex);
++
++/* Argument block for get_cpuid_info(), filled in on the target CPU. */
++struct cpu_info {
++      struct pdev_entry *pdev_entry;  /* receives model, stepping, ucode_rev */
++      u32 cpuid_6_eax;                /* CPUID leaf 6 EAX, or 0 if no such leaf */
++};
++
++/*
++ * Collect CPUID-derived data into the struct cpu_info passed as @arg.
++ *
++ * coretemp_device_add() calls this either directly after binding to the
++ * target physical CPU or through smp_call_function_single(), hence the
++ * void * argument.
++ */
++static void get_cpuid_info(void *arg)
++{
++      struct cpu_info *info = arg;
++      struct pdev_entry *pdev_entry = info->pdev_entry;
++      u32 val = cpuid_eax(1);
++
++      /* model: EAX[7:4] in the low nibble, EAX[19:16] in the high nibble */
++      pdev_entry->x86_model = ((val >> 4) & 0xf) | ((val >> 12) & 0xf0);
++      pdev_entry->x86_mask = val & 0xf;
++
++      /*
++       * The microcode revision is read only when EAX[11:8] is 6, EAX[27:20]
++       * is 0 and the model is non-zero: zero the MSR, serialise, read it
++       * back. Any failed step leaves ucode_rev at ~0 ("unknown").
++       */
++      if (((val >> 8) & 0xf) != 6 || ((val >> 20) & 0xff)
++          || !pdev_entry->x86_model
++          || wrmsr_safe(MSR_IA32_UCODE_REV, 0, 0) < 0
++          || (sync_core(), rdmsr_safe(MSR_IA32_UCODE_REV,
++                                      &val, &pdev_entry->ucode_rev)) < 0)
++              pdev_entry->ucode_rev = ~0;
++
++      /* leaf 6 is queried only if CPUID reports it as supported */
++      info->cpuid_6_eax = cpuid_eax(0) >= 6 ? cpuid_eax(6) : 0;
++}
++
++/*
++ * Register a coretemp platform device for physical CPU @cpu.
++ *
++ * CPUID data is gathered on the target CPU: either by temporarily binding
++ * to it with xen_set_physical_cpu_affinity(), or, when that call returns a
++ * positive value, through smp_call_function_single(). CPUs without a
++ * thermal sensor and CPUs whose core already has a device are skipped.
++ *
++ * Returns 0 when a device was added or the CPU was deliberately skipped,
++ * otherwise the error of the failing step.
++ */
++static int coretemp_device_add(unsigned int cpu)
++{
++      int err;
++      struct cpu_info info;
++      struct platform_device *pdev;
++      struct pdev_entry *pdev_entry;
++
++      info.pdev_entry = kzalloc(sizeof(*pdev_entry), GFP_KERNEL);
++      if (!info.pdev_entry)
++              return -ENOMEM;
++
++      err = xen_set_physical_cpu_affinity(cpu);
++      if (!err) {
++              get_cpuid_info(&info);
++              WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
++      } else if (err > 0) {
++              static bool warned;
++
++              if (!warned) {
++                      warned = true;
++                      /*
++                       * pr_warn() applies pr_fmt(), so the message gets
++                       * the usual "<module>: " prefix instead of being
++                       * glued directly onto the driver name.
++                       */
++                      pr_warn("Cannot set physical CPU affinity"
++                              " (assuming use of dom0_vcpus_pin)\n");
++              }
++              err = smp_call_function_single(cpu, get_cpuid_info, &info, 1);
++      }
++      if (err)
++              goto exit_entry_free;
++
++      /*
++       * CPUID.06H.EAX[0] indicates whether the CPU has thermal
++       * sensors. We check this bit only, all the early CPUs
++       * without thermal sensors will be filtered out.
++       */
++      if (!(info.cpuid_6_eax & 0x1)) {
++              pr_info("CPU (model=0x%x) has no thermal sensor\n",
++                      info.pdev_entry->x86_model);
++              goto exit_entry_free;   /* err is 0 here: not an error */
++      }
++
++      err = xen_get_topology_info(cpu, &info.pdev_entry->cpu_core_id,
++                                  &info.pdev_entry->phys_proc_id, NULL);
++      if (err)
++              goto exit_entry_free;
++
++      mutex_lock(&pdev_list_mutex);
++
++      /* Skip second HT entry of each core */
++      list_for_each_entry(pdev_entry, &pdev_list, list) {
++              if (info.pdev_entry->phys_proc_id == pdev_entry->phys_proc_id &&
++                  info.pdev_entry->cpu_core_id == pdev_entry->cpu_core_id) {
++                      err = 0;        /* Not an error */
++                      goto exit;
++              }
++      }
++
++      pdev = platform_device_alloc(DRVNAME, cpu);
++      if (!pdev) {
++              err = -ENOMEM;
++              pr_err("Device allocation failed\n");
++              goto exit;
++      }
++
++      /* the probe callback picks the entry up via platform_get_drvdata() */
++      pdev_entry = info.pdev_entry;
++      platform_set_drvdata(pdev, pdev_entry);
++      pdev_entry->pdev = pdev;
++
++      err = platform_device_add(pdev);
++      if (err) {
++              pr_err("Device addition failed (%d)\n", err);
++              goto exit_device_put;
++      }
++
++      list_add_tail(&pdev_entry->list, &pdev_list);
++      mutex_unlock(&pdev_list_mutex);
++
++      return 0;
++
++exit_device_put:
++      platform_device_put(pdev);
++exit:
++      mutex_unlock(&pdev_list_mutex);
++exit_entry_free:
++      kfree(info.pdev_entry);
++      return err;
++}
++
++/*
++ * Tear down the sensor device registered for physical CPU @cpu, if there
++ * is one, then try to register a replacement device on another CPU that
++ * reports the same package and core id.
++ */
++static void coretemp_device_remove(unsigned int cpu)
++{
++      struct pdev_entry *p;
++      unsigned int i;
++
++      mutex_lock(&pdev_list_mutex);
++      list_for_each_entry(p, &pdev_list, list) {
++              if (p->pdev->id != cpu)
++                      continue;
++
++              platform_device_unregister(p->pdev);
++              list_del(&p->list);
++              /* dropped here: coretemp_device_add() takes the mutex itself */
++              mutex_unlock(&pdev_list_mutex);
++              /*
++               * No upper bound of its own: the scan ends when a
++               * replacement was added or xen_get_topology_info() returns
++               * an error other than -ENOENT.
++               */
++              for (i = 0; ; ++i) {
++                      u32 cpu_core_id, phys_proc_id;
++                      int err;
++
++                      if (i == cpu)
++                              continue;
++                      err = xen_get_topology_info(i, &cpu_core_id,
++                                                  &phys_proc_id, NULL);
++                      if (err == -ENOENT)
++                              continue;
++                      if (err)
++                              break;
++                      if (phys_proc_id != p->phys_proc_id ||
++                          cpu_core_id != p->cpu_core_id)
++                              continue;
++                      if (!coretemp_device_add(i))
++                              break;
++              }
++              /* the list walk is abandoned, so freeing p here is safe */
++              kfree(p);
++              return;
++      }
++      mutex_unlock(&pdev_list_mutex);
++}
++
++/*
++ * Physical-CPU notifier callback: add a sensor device when a CPU comes
++ * online and remove it when the CPU is dead. @hcpu carries the CPU number.
++ * Other actions are ignored; the result of the add is not propagated.
++ */
++static int coretemp_cpu_callback(struct notifier_block *nfb,
++                               unsigned long action, void *hcpu)
++{
++      const unsigned int cpu = (unsigned long) hcpu;
++
++      if (action == CPU_ONLINE)
++              coretemp_device_add(cpu);
++      else if (action == CPU_DEAD)
++              coretemp_device_remove(cpu);
++
++      return NOTIFY_OK;
++}
++
++/* Registered with register_pcpu_notifier() in coretemp_init(). */
++static struct notifier_block coretemp_cpu_notifier = {
++      .notifier_call = coretemp_cpu_callback,
++};
++
++static int __init coretemp_init(void)
++{
++      int err = -ENODEV;
++
++      if (!is_initial_xendomain())
++              goto exit;
++
++      /* quick check if we run Intel */
++      if (cpu_data(0).x86_vendor != X86_VENDOR_INTEL)
++              goto exit;
++
++      err = platform_driver_register(&coretemp_driver);
++      if (err)
++              goto exit;
++
++      err = register_pcpu_notifier(&coretemp_cpu_notifier);
++      if (err)
++              goto exit_driver_unreg;
++
++#ifndef CONFIG_ACPI_HOTPLUG_CPU
++      if (list_empty(&pdev_list)) {
++              unregister_pcpu_notifier(&coretemp_cpu_notifier);
++              err = -ENODEV;
++              goto exit_driver_unreg;
++      }
++#endif
++
++      return 0;
++
++exit_driver_unreg:
++      platform_driver_unregister(&coretemp_driver);
++exit:
++      return err;
++}
++
++/*
++ * Module exit: stop CPU notifications first, then unregister and free
++ * every sensor entry on pdev_list, and finally unregister the driver.
++ */
++static void __exit coretemp_exit(void)
++{
++      struct pdev_entry *p, *n;
++
++      unregister_pcpu_notifier(&coretemp_cpu_notifier);
++      mutex_lock(&pdev_list_mutex);
++      /* _safe variant: entries are deleted and freed while walking */
++      list_for_each_entry_safe(p, n, &pdev_list, list) {
++              platform_device_unregister(p->pdev);
++              list_del(&p->list);
++              kfree(p);
++      }
++      mutex_unlock(&pdev_list_mutex);
++      platform_driver_unregister(&coretemp_driver);
++}
++
++MODULE_AUTHOR("Rudolf Marek <r.marek@assembler.cz>");
++MODULE_DESCRIPTION("Intel Core temperature monitor");
++MODULE_LICENSE("GPL");
++
++module_init(coretemp_init)
++module_exit(coretemp_exit)
diff --cc drivers/hwmon/pkgtemp-xen.c

index 0000000,0000000..35ef584

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/hwmon/pkgtemp-xen.c
@@@ -1,0 -1,0 +1,453 @@@
++/*
++ * pkgtemp.c - Linux kernel module for processor package hardware monitoring
++ *
++ * Copyright (C) 2010 Fenghua Yu <fenghua.yu@intel.com>
++ *
++ * Inspired from many hwmon drivers especially coretemp.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; version 2 of the License.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
++ * 02110-1301 USA.
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/hwmon.h>
++#include <linux/sysfs.h>
++#include <linux/hwmon-sysfs.h>
++#include <linux/err.h>
++#include <linux/mutex.h>
++#include <linux/list.h>
++#include <linux/platform_device.h>
++#include <asm/msr.h>
++#include <xen/pcpu.h>
++#include "../xen/core/domctl.h"
++
++#define DRVNAME       "pkgtemp"
++#define pkgtemp_data pdev_entry
++
++/* Attribute index values that show_temp() and show_name() dispatch on. */
++enum { SHOW_TEMP, SHOW_TJMAX, SHOW_TTARGET, SHOW_LABEL, SHOW_NAME };
++
++/*
++ * Functions declaration
++ */
++
++static struct pkgtemp_data *pkgtemp_update_device(struct device *dev);
++
++/*
++ * Per-package sensor state. An entry is allocated in pkgtemp_device_add(),
++ * set as the driver data of its platform device and linked into pdev_list.
++ * Also referred to as "struct pkgtemp_data" via the pkgtemp_data macro.
++ */
++struct pdev_entry {
++      struct list_head list;          /* link in pdev_list */
++      struct platform_device *pdev;   /* pdev->id is the physical CPU number */
++      struct device *hwmon_dev;       /* result of hwmon_device_register() */
++      struct mutex update_lock;       /* taken by pkgtemp_update_device() */
++      const char *name;               /* set to "pkgtemp" in probe */
++      u32 phys_proc_id;               /* from xen_get_topology_info() */
++      char valid;             /* zero until following fields are valid */
++      unsigned long last_updated;     /* in jiffies */
++      int temp;                       /* tjmax minus the MSR readout * 1000 */
++      int tjmax;                      /* defaults to 100000 in get_tjmax() */
++      int ttarget;                    /* shown as temp1_max */
++      u8 alarm;                       /* bit 5 of the package THERM_STATUS MSR */
++};
++
++/*
++ * Sysfs stuff
++ */
++
++/*
++ * sysfs show callback shared by the "name" and "temp1_label" attributes.
++ * Prints the driver name when the attribute index is SHOW_NAME and the
++ * "physical id <n>" label otherwise. Returns the number of bytes written.
++ */
++static ssize_t show_name(struct device *dev, struct device_attribute
++                        *devattr, char *buf)
++{
++      struct pkgtemp_data *data = dev_get_drvdata(dev);
++      struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr);
++
++      return attr->index == SHOW_NAME
++              ? sprintf(buf, "%s\n", data->name)
++              : sprintf(buf, "physical id %d\n", data->phys_proc_id);
++}
++
++/*
++ * sysfs show callback for temp1_crit_alarm: refreshes the cached reading
++ * and prints the alarm flag. The flag is only read, never cleared.
++ */
++static ssize_t show_alarm(struct device *dev, struct device_attribute
++                        *devattr, char *buf)
++{
++      const u8 alarm = pkgtemp_update_device(dev)->alarm;
++
++      return sprintf(buf, "%d\n", alarm);
++}
++
++/*
++ * sysfs show callback for the temperature attributes. SHOW_TEMP prints
++ * the current reading, or fails with -EAGAIN while no valid reading is
++ * cached; SHOW_TJMAX prints tjmax; any other index prints ttarget.
++ */
++static ssize_t show_temp(struct device *dev,
++                       struct device_attribute *devattr, char *buf)
++{
++      struct pkgtemp_data *data = pkgtemp_update_device(dev);
++      const int which = to_sensor_dev_attr(devattr)->index;
++
++      if (which == SHOW_TJMAX)
++              return sprintf(buf, "%d\n", data->tjmax);
++      if (which != SHOW_TEMP)
++              return sprintf(buf, "%d\n", data->ttarget);
++      if (!data->valid)
++              return -EAGAIN;
++
++      return sprintf(buf, "%d\n", data->temp);
++}
++
++/*
++ * Read-only sysfs attributes. The last macro argument is the SHOW_*
++ * selector that show_temp()/show_name() dispatch on.
++ */
++static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, show_temp, NULL, SHOW_TEMP);
++static SENSOR_DEVICE_ATTR(temp1_crit, S_IRUGO, show_temp, NULL, SHOW_TJMAX);
++static SENSOR_DEVICE_ATTR(temp1_max, S_IRUGO, show_temp, NULL, SHOW_TTARGET);
++static DEVICE_ATTR(temp1_crit_alarm, S_IRUGO, show_alarm, NULL);
++static SENSOR_DEVICE_ATTR(temp1_label, S_IRUGO, show_name, NULL, SHOW_LABEL);
++static SENSOR_DEVICE_ATTR(name, S_IRUGO, show_name, NULL, SHOW_NAME);
++
++/*
++ * Attributes created for every device. temp1_max is not listed:
++ * pkgtemp_probe() creates it separately, and only when
++ * IA32_TEMPERATURE_TARGET could be read.
++ */
++static struct attribute *pkgtemp_attributes[] = {
++      &sensor_dev_attr_name.dev_attr.attr,
++      &sensor_dev_attr_temp1_label.dev_attr.attr,
++      &dev_attr_temp1_crit_alarm.attr,
++      &sensor_dev_attr_temp1_input.dev_attr.attr,
++      &sensor_dev_attr_temp1_crit.dev_attr.attr,
++      NULL
++};
++
++/* Group handed to sysfs_create_group()/sysfs_remove_group(). */
++static const struct attribute_group pkgtemp_group = {
++      .attrs = pkgtemp_attributes,
++};
++
++/*
++ * Refresh the cached package sensor state of @dev and return it.
++ *
++ * IA32_PACKAGE_THERM_STATUS is re-read on the physical CPU the device
++ * belongs to when no valid reading is cached or the cached one is older
++ * than HZ jiffies. On a failed read the data stays invalid, so show_temp()
++ * reports -EAGAIN.
++ *
++ * The failure message reports the error code: the register variables may
++ * never have been written when the read failed, so printing them would
++ * read indeterminate values.
++ */
++static struct pkgtemp_data *pkgtemp_update_device(struct device *dev)
++{
++      struct pkgtemp_data *data = dev_get_drvdata(dev);
++      int err;
++
++      mutex_lock(&data->update_lock);
++
++      if (!data->valid || time_after(jiffies, data->last_updated + HZ)) {
++              u32 eax = 0, edx = 0;
++
++              data->valid = 0;
++              err = rdmsr_safe_on_pcpu(data->pdev->id,
++                                       MSR_IA32_PACKAGE_THERM_STATUS,
++                                       &eax, &edx);
++              if (err >= 0) {
++                      data->alarm = (eax >> 5) & 1;
++                      data->temp = data->tjmax - (((eax >> 16)
++                                                      & 0x7f) * 1000);
++                      data->valid = 1;
++              } else {
++                      dev_dbg(dev,
++                              "Unable to read PACKAGE_THERM_STATUS MSR (%d)\n",
++                              err);
++              }
++
++              data->last_updated = jiffies;
++      }
++
++      mutex_unlock(&data->update_lock);
++      return data;
++}
++
++/*
++ * Determine TjMax for the package sensor on physical CPU @cpu.
++ *
++ * Returns bits 23:16 of IA32_TEMPERATURE_TARGET times 1000 when the MSR is
++ * readable and the value is above 80 and below 120. In every other case
++ * a warning is logged through @dev and 100000 is returned.
++ */
++static int get_tjmax(int cpu, struct device *dev)
++{
++      u32 lo, hi;
++
++      /* IA32_TEMPERATURE_TARGET contains the TjMax value */
++      if (rdmsr_safe_on_pcpu(cpu, MSR_IA32_TEMPERATURE_TARGET,
++                             &lo, &hi) >= 0) {
++              u32 tj = (lo >> 16) & 0xff;
++
++              if (tj > 80 && tj < 120) {
++                      dev_info(dev, "TjMax is %d C.\n", tj);
++                      return tj * 1000;
++              }
++      }
++
++      dev_warn(dev, "Unable to read TjMax from CPU.\n");
++      return 100000;
++}
++
++/*
++ * Platform driver probe for one package sensor device.
++ *
++ * The driver data was attached to @pdev by pkgtemp_device_add(). This
++ * checks that IA32_PACKAGE_THERM_STATUS is readable on the physical CPU
++ * pdev->id, determines TjMax and creates the sysfs attributes and the
++ * hwmon class device.
++ *
++ * Returns 0 on success, otherwise the error of the failing step.
++ */
++static int pkgtemp_probe(struct platform_device *pdev)
++{
++      struct pkgtemp_data *data = platform_get_drvdata(pdev);
++      int err;
++      u32 eax, edx;
++
++      data->name = "pkgtemp";
++      mutex_init(&data->update_lock);
++
++      /* test if we can access the THERM_STATUS MSR */
++      err = rdmsr_safe_on_pcpu(pdev->id, MSR_IA32_PACKAGE_THERM_STATUS,
++                               &eax, &edx);
++      if (err < 0) {
++              dev_err(&pdev->dev,
++                      "Unable to access THERM_STATUS MSR, giving up\n");
++              return err;
++      }
++
++      data->tjmax = get_tjmax(pdev->id, &pdev->dev);
++
++      /* second read of the same MSR, this time for bits 15:8 */
++      err = rdmsr_safe_on_pcpu(pdev->id, MSR_IA32_TEMPERATURE_TARGET,
++                               &eax, &edx);
++      if (err < 0) {
++              /* not fatal: the device just gets no temp1_max */
++              dev_warn(&pdev->dev, "Unable to read"
++                              " IA32_TEMPERATURE_TARGET MSR\n");
++      } else {
++              data->ttarget = data->tjmax - (((eax >> 8) & 0xff) * 1000);
++              err = device_create_file(&pdev->dev,
++                              &sensor_dev_attr_temp1_max.dev_attr);
++              if (err)
++                      return err;
++      }
++
++      err = sysfs_create_group(&pdev->dev.kobj, &pkgtemp_group);
++      if (err)
++              goto exit_dev;
++
++      data->hwmon_dev = hwmon_device_register(&pdev->dev);
++      if (IS_ERR(data->hwmon_dev)) {
++              err = PTR_ERR(data->hwmon_dev);
++              dev_err(&pdev->dev, "Class registration failed (%d)\n",
++                      err);
++              goto exit_class;
++      }
++
++      return 0;
++
++      /* unwind in reverse order of creation */
++exit_class:
++      sysfs_remove_group(&pdev->dev.kobj, &pkgtemp_group);
++exit_dev:
++      /* called even on paths where temp1_max was never created */
++      device_remove_file(&pdev->dev, &sensor_dev_attr_temp1_max.dev_attr);
++      return err;
++}
++
++/*
++ * Platform driver remove: undo pkgtemp_probe() by unregistering the hwmon
++ * class device and removing the sysfs attributes. Always returns 0.
++ */
++static int pkgtemp_remove(struct platform_device *pdev)
++{
++      struct device *dev = &pdev->dev;
++      struct pkgtemp_data *priv = platform_get_drvdata(pdev);
++
++      hwmon_device_unregister(priv->hwmon_dev);
++      sysfs_remove_group(&dev->kobj, &pkgtemp_group);
++      device_remove_file(dev, &sensor_dev_attr_temp1_max.dev_attr);
++
++      return 0;
++}
++
++/*
++ * Platform driver; its name matches the DRVNAME that pkgtemp_device_add()
++ * passes to platform_device_alloc().
++ */
++static struct platform_driver pkgtemp_driver = {
++      .driver = {
++              .owner = THIS_MODULE,
++              .name = DRVNAME,
++      },
++      .probe = pkgtemp_probe,
++      .remove = pkgtemp_remove,
++};
++
++/* All registered sensor entries; pdev_list_mutex guards list changes. */
++static LIST_HEAD(pdev_list);
++static DEFINE_MUTEX(pdev_list_mutex);
++
++/* Result block for get_cpuid_info(), filled in on the target CPU. */
++struct cpu_info {
++      u32 cpuid_6_eax;        /* CPUID leaf 6 EAX, or 0 if no such leaf */
++};
++
++/*
++ * Store CPUID leaf 6 EAX of the executing CPU in the struct cpu_info
++ * passed as @arg, or 0 when CPUID reports a maximum leaf below 6. Takes a
++ * void * so it can also be run through smp_call_function_single().
++ */
++static void get_cpuid_info(void *arg)
++{
++      struct cpu_info *info = arg;
++      u32 leaf6_eax = 0;
++
++      if (cpuid_eax(0) >= 6)
++              leaf6_eax = cpuid_eax(6);
++
++      info->cpuid_6_eax = leaf6_eax;
++}
++
++/*
++ * Register a pkgtemp platform device for physical CPU @cpu.
++ *
++ * CPUID leaf 6 is queried on the target CPU: either by temporarily
++ * binding to it with xen_set_physical_cpu_affinity(), or, when that call
++ * returns a positive value, through smp_call_function_single(). CPUs
++ * whose CPUID.06H.EAX bit 6 is clear and CPUs whose package already has a
++ * device are skipped.
++ *
++ * Returns 0 when a device was added or the CPU was deliberately skipped,
++ * otherwise the error of the failing step.
++ */
++static int pkgtemp_device_add(unsigned int cpu)
++{
++      int err;
++      struct cpu_info info;
++      struct platform_device *pdev;
++      struct pdev_entry *pdev_entry, *entry;
++
++      err = xen_set_physical_cpu_affinity(cpu);
++      if (!err) {
++              get_cpuid_info(&info);
++              WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
++      } else if (err > 0) {
++              static bool warned;
++
++              if (!warned) {
++                      warned = true;
++                      /*
++                       * pr_warn() applies pr_fmt(), so the message gets
++                       * the usual "<module>: " prefix instead of being
++                       * glued directly onto the driver name.
++                       */
++                      pr_warn("Cannot set physical CPU affinity"
++                              " (assuming use of dom0_vcpus_pin)\n");
++              }
++              err = smp_call_function_single(cpu, get_cpuid_info, &info, 1);
++      }
++      if (err)
++              return err;
++
++      /* bit 6 of CPUID.06H.EAX clear: nothing to register, not an error */
++      if (!(info.cpuid_6_eax & 0x40))
++              return 0;
++
++      pdev_entry = kzalloc(sizeof(*pdev_entry), GFP_KERNEL);
++      if (!pdev_entry)
++              return -ENOMEM;
++
++      err = xen_get_topology_info(cpu, NULL,
++                                  &pdev_entry->phys_proc_id, NULL);
++      if (err)
++              goto exit_entry_free;
++
++      mutex_lock(&pdev_list_mutex);
++
++      /* Only keep the first entry in each package */
++      list_for_each_entry(entry, &pdev_list, list) {
++              if (entry->phys_proc_id == pdev_entry->phys_proc_id) {
++                      err = 0;        /* Not an error */
++                      goto exit;
++              }
++      }
++
++      pdev = platform_device_alloc(DRVNAME, cpu);
++      if (!pdev) {
++              err = -ENOMEM;
++              pr_err("Device allocation failed\n");
++              goto exit;
++      }
++
++      /* the probe callback picks the entry up via platform_get_drvdata() */
++      platform_set_drvdata(pdev, pdev_entry);
++      pdev_entry->pdev = pdev;
++
++      err = platform_device_add(pdev);
++      if (err) {
++              pr_err("Device addition failed (%d)\n", err);
++              goto exit_device_put;
++      }
++
++      list_add_tail(&pdev_entry->list, &pdev_list);
++      mutex_unlock(&pdev_list_mutex);
++
++      return 0;
++
++exit_device_put:
++      platform_device_put(pdev);
++exit:
++      mutex_unlock(&pdev_list_mutex);
++exit_entry_free:
++      kfree(pdev_entry);
++      return err;
++}
++
++/*
++ * Tear down the sensor device registered for physical CPU @cpu, if there
++ * is one, then try to register a replacement device on another CPU that
++ * reports the same package id.
++ */
++static void pkgtemp_device_remove(unsigned int cpu)
++{
++      struct pdev_entry *p;
++      unsigned int i;
++
++      mutex_lock(&pdev_list_mutex);
++      list_for_each_entry(p, &pdev_list, list) {
++              if (p->pdev->id != cpu)
++                      continue;
++
++              platform_device_unregister(p->pdev);
++              list_del(&p->list);
++              /* dropped here: pkgtemp_device_add() takes the mutex itself */
++              mutex_unlock(&pdev_list_mutex);
++              /*
++               * No upper bound of its own: the scan ends when a
++               * replacement was added or xen_get_topology_info() returns
++               * an error other than -ENOENT.
++               */
++              for (i = 0; ; ++i) {
++                      u32 phys_proc_id;
++                      int err;
++
++                      if (i == cpu)
++                              continue;
++                      err = xen_get_topology_info(i, NULL, &phys_proc_id,
++                                                  NULL);
++                      if (err == -ENOENT)
++                              continue;
++                      if (err)
++                              break;
++                      if (phys_proc_id != p->phys_proc_id)
++                              continue;
++                      if (!pkgtemp_device_add(i))
++                              break;
++              }
++              /* the list walk is abandoned, so freeing p here is safe */
++              kfree(p);
++              return;
++      }
++      mutex_unlock(&pdev_list_mutex);
++}
++
++/*
++ * Physical-CPU notifier callback: add a sensor device when a CPU comes
++ * online and remove it when the CPU is dead. @hcpu carries the CPU number.
++ * Other actions are ignored; the result of the add is not propagated.
++ */
++static int pkgtemp_cpu_callback(struct notifier_block *nfb,
++                              unsigned long action, void *hcpu)
++{
++      const unsigned int cpu = (unsigned long) hcpu;
++
++      if (action == CPU_ONLINE)
++              pkgtemp_device_add(cpu);
++      else if (action == CPU_DEAD)
++              pkgtemp_device_remove(cpu);
++
++      return NOTIFY_OK;
++}
++
++/* Registered with register_pcpu_notifier() in pkgtemp_init(). */
++static struct notifier_block pkgtemp_cpu_notifier = {
++      .notifier_call = pkgtemp_cpu_callback,
++};
++
++static int __init pkgtemp_init(void)
++{
++      int err = -ENODEV;
++
++      if (!is_initial_xendomain())
++              goto exit;
++
++      /* quick check if we run Intel */
++      if (cpu_data(0).x86_vendor != X86_VENDOR_INTEL)
++              goto exit;
++
++      err = platform_driver_register(&pkgtemp_driver);
++      if (err)
++              goto exit;
++
++      err = register_pcpu_notifier(&pkgtemp_cpu_notifier);
++      if (err)
++              goto exit_driver_unreg;
++
++#ifndef CONFIG_ACPI_HOTPLUG_CPU
++      if (list_empty(&pdev_list)) {
++              unregister_pcpu_notifier(&pkgtemp_cpu_notifier);
++              err = -ENODEV;
++              goto exit_driver_unreg;
++      }
++#endif
++
++      return 0;
++
++exit_driver_unreg:
++      platform_driver_unregister(&pkgtemp_driver);
++exit:
++      return err;
++}
++
++/*
++ * Module exit: stop CPU notifications first, then unregister and free
++ * every sensor entry on pdev_list, and finally unregister the driver.
++ */
++static void __exit pkgtemp_exit(void)
++{
++      struct pdev_entry *p, *n;
++
++      unregister_pcpu_notifier(&pkgtemp_cpu_notifier);
++      mutex_lock(&pdev_list_mutex);
++      /* _safe variant: entries are deleted and freed while walking */
++      list_for_each_entry_safe(p, n, &pdev_list, list) {
++              platform_device_unregister(p->pdev);
++              list_del(&p->list);
++              kfree(p);
++      }
++      mutex_unlock(&pdev_list_mutex);
++      platform_driver_unregister(&pkgtemp_driver);
++}
++
++MODULE_AUTHOR("Fenghua Yu <fenghua.yu@intel.com>");
++MODULE_DESCRIPTION("Intel processor package temperature monitor");
++MODULE_LICENSE("GPL");
++
++module_init(pkgtemp_init)
++module_exit(pkgtemp_exit)
diff --cc drivers/hwmon/via-cputemp-xen.c

index 0000000,0000000..1608158

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/hwmon/via-cputemp-xen.c
@@@ -1,0 -1,0 +1,356 @@@
++/*
++ * via-cputemp.c - Driver for VIA CPU core temperature monitoring
++ * Copyright (C) 2009 VIA Technologies, Inc.
++ *
++ * based on existing coretemp.c, which is
++ *
++ * Copyright (C) 2007 Rudolf Marek <r.marek@assembler.cz>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; version 2 of the License.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
++ * 02110-1301 USA.
++ */
++
++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/hwmon.h>
++#include <linux/sysfs.h>
++#include <linux/hwmon-sysfs.h>
++#include <linux/err.h>
++#include <linux/mutex.h>
++#include <linux/list.h>
++#include <linux/platform_device.h>
++#include <asm/msr.h>
++#include <xen/pcpu.h>
++#include "../xen/core/domctl.h"
++
++#define DRVNAME       "via_cputemp"
++
++enum { SHOW_TEMP, SHOW_LABEL, SHOW_NAME };
++
++/*
++ * Functions declaration
++ */
++
++struct pdev_entry {
++      struct list_head list;
++      struct platform_device *pdev;
++      struct device *hwmon_dev;
++      const char *name;
++      u8 x86_model;
++      u32 msr;
++};
++#define via_cputemp_data pdev_entry
++
++/*
++ * Sysfs stuff
++ */
++
++/*
++ * sysfs show handler shared by the "name" and "temp1_label" attributes.
++ * SHOW_NAME prints the driver data's name; any other index prints
++ * "Core <platform device id>".
++ */
++static ssize_t show_name(struct device *dev, struct device_attribute
++                        *devattr, char *buf)
++{
++      struct via_cputemp_data *data = dev_get_drvdata(dev);
++
++      if (to_sensor_dev_attr(devattr)->index != SHOW_NAME)
++              return sprintf(buf, "Core %d\n", data->pdev->id);       /* label */
++
++      return sprintf(buf, "%s\n", data->name);
++}
++
++/*
++ * sysfs show handler for temp1_input: read the temperature MSR on the
++ * physical CPU identified by the platform device id and print the low
++ * 24 bits of EAX multiplied by 1000.  Returns -EAGAIN if the MSR read
++ * reports a negative result.
++ */
++static ssize_t show_temp(struct device *dev,
++                       struct device_attribute *devattr, char *buf)
++{
++      struct via_cputemp_data *data = dev_get_drvdata(dev);
++      unsigned long temp;
++      u32 lo, hi;
++
++      if (rdmsr_safe_on_pcpu(data->pdev->id, data->msr, &lo, &hi) < 0)
++              return -EAGAIN;
++
++      temp = (unsigned long)lo & 0xffffff;
++      return sprintf(buf, "%lu\n", temp * 1000);
++}
++
++static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, show_temp, NULL,
++                        SHOW_TEMP);
++static SENSOR_DEVICE_ATTR(temp1_label, S_IRUGO, show_name, NULL, SHOW_LABEL);
++static SENSOR_DEVICE_ATTR(name, S_IRUGO, show_name, NULL, SHOW_NAME);
++
++static struct attribute *via_cputemp_attributes[] = {
++      &sensor_dev_attr_name.dev_attr.attr,
++      &sensor_dev_attr_temp1_label.dev_attr.attr,
++      &sensor_dev_attr_temp1_input.dev_attr.attr,
++      NULL
++};
++
++static const struct attribute_group via_cputemp_group = {
++      .attrs = via_cputemp_attributes,
++};
++
++/*
++ * Platform driver probe: pick the temperature MSR for the CPU model stored
++ * in the device's driver data, verify the MSR is readable on the physical
++ * CPU, then create the sysfs attribute group and register the hwmon class
++ * device.
++ *
++ * Returns 0 on success, -ENODEV for a model without a known MSR, or the
++ * error from the MSR read, sysfs_create_group() or hwmon_device_register().
++ */
++static int via_cputemp_probe(struct platform_device *pdev)
++{
++      struct via_cputemp_data *data = platform_get_drvdata(pdev);
++      int err;
++      u32 eax, edx;
++
++      data->name = "via_cputemp";
++
++      switch (data->x86_model) {
++      case 0xA:
++              /* C7 A */
++      case 0xD:
++              /* C7 D */
++              data->msr = 0x1169;
++              break;
++      case 0xF:
++              /* Nano */
++              data->msr = 0x1423;
++              break;
++      default:
++              return -ENODEV;
++      }
++
++      /*
++       * Test if we can access the TEMPERATURE MSR.  A negative result is
++       * the failure case (show_temp() uses the same check); bailing out on
++       * a non-negative result would reject every working CPU and return
++       * "success" without the sysfs group or hwmon device being set up.
++       */
++      err = rdmsr_safe_on_pcpu(pdev->id, data->msr, &eax, &edx);
++      if (err < 0) {
++              dev_err(&pdev->dev,
++                      "Unable to access TEMPERATURE MSR, giving up\n");
++              return err;
++      }
++
++      err = sysfs_create_group(&pdev->dev.kobj, &via_cputemp_group);
++      if (err)
++              return err;
++
++      data->hwmon_dev = hwmon_device_register(&pdev->dev);
++      if (IS_ERR(data->hwmon_dev)) {
++              err = PTR_ERR(data->hwmon_dev);
++              dev_err(&pdev->dev, "Class registration failed (%d)\n",
++                      err);
++              goto exit_remove;
++      }
++
++      return 0;
++
++exit_remove:
++      /* Undo the attribute group created above. */
++      sysfs_remove_group(&pdev->dev.kobj, &via_cputemp_group);
++      return err;
++}
++
++/*
++ * Platform driver remove: undo probe in reverse order - unregister the
++ * hwmon class device, then remove the sysfs attribute group.
++ */
++static int via_cputemp_remove(struct platform_device *pdev)
++{
++      struct via_cputemp_data *drvdata = platform_get_drvdata(pdev);
++      struct kobject *kobj = &pdev->dev.kobj;
++
++      hwmon_device_unregister(drvdata->hwmon_dev);
++      sysfs_remove_group(kobj, &via_cputemp_group);
++
++      return 0;
++}
++
++static struct platform_driver via_cputemp_driver = {
++      .driver = {
++              .owner = THIS_MODULE,
++              .name = DRVNAME,
++      },
++      .probe = via_cputemp_probe,
++      .remove = via_cputemp_remove,
++};
++
++static LIST_HEAD(pdev_list);
++static DEFINE_MUTEX(pdev_list_mutex);
++
++struct cpu_info {
++      struct pdev_entry *pdev_entry;
++      u8 x86;
++};
++
++/*
++ * Decode CPUID leaf 1 EAX on the executing CPU.  @arg is a struct cpu_info:
++ * its x86 field receives bits 11:8 plus bits 27:20, and the referenced
++ * pdev_entry's x86_model receives bits 7:4 combined with bits 19:16 as the
++ * high nibble.
++ */
++static void get_cpuid_info(void *arg)
++{
++      struct cpu_info *info = arg;
++      u32 eax = cpuid_eax(1);
++      u32 family = (eax >> 8) & 0xf;
++      u32 ext_family = (eax >> 20) & 0xff;
++      u32 model = (eax >> 4) & 0xf;
++      u32 ext_model = (eax >> 12) & 0xf0;
++
++      info->x86 = family + ext_family;
++      info->pdev_entry->x86_model = model | ext_model;
++}
++
++/*
++ * Create and register the platform device for physical CPU @cpu.
++ *
++ * CPUID is read on the target CPU either by temporarily setting physical
++ * CPU affinity or, when that returns a positive value, through
++ * smp_call_function_single().  CPUs whose family is not 6 or whose model
++ * is outside 0x0a..0x0f are skipped: the entry is freed and 0 is returned.
++ *
++ * Returns 0 on success or when the CPU is skipped, -ENOMEM on allocation
++ * failure, or the error from the affinity/CPUID step or
++ * platform_device_add().
++ */
++static int via_cputemp_device_add(unsigned int cpu)
++{
++      int err;
++      struct cpu_info info;
++      struct platform_device *pdev;
++      struct pdev_entry *pdev_entry;
++
++      pdev_entry = kzalloc(sizeof(*pdev_entry), GFP_KERNEL);
++      if (!pdev_entry)
++              return -ENOMEM;
++
++      info.pdev_entry = pdev_entry;
++      err = xen_set_physical_cpu_affinity(cpu);
++      if (!err) {
++              get_cpuid_info(&info);
++              WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
++      } else if (err > 0) {
++              static bool warned;
++
++              if (!warned) {
++                      warned = true;
++                      /* ": " separator matches the other DRVNAME messages. */
++                      printk(KERN_WARNING DRVNAME
++                             ": Cannot set physical CPU affinity"
++                             " (assuming use of dom0_vcpus_pin)\n");
++              }
++              err = smp_call_function_single(cpu, get_cpuid_info, &info, 1);
++      }
++      if (err)
++              goto exit_entry_free;
++
++      /* From here on err is 0, so the "skip" exits below return 0. */
++      if (info.x86 != 6)
++              goto exit_entry_free;
++
++      if (pdev_entry->x86_model < 0x0a)
++              goto exit_entry_free;
++
++      if (pdev_entry->x86_model > 0x0f) {
++              pr_warn("Unknown CPU model 0x%x\n", pdev_entry->x86_model);
++              goto exit_entry_free;
++      }
++
++      pdev = platform_device_alloc(DRVNAME, cpu);
++      if (!pdev) {
++              err = -ENOMEM;
++              pr_err("Device allocation failed\n");
++              goto exit_entry_free;
++      }
++
++      /* Probe reads the model from the driver data, so set it before add. */
++      platform_set_drvdata(pdev, pdev_entry);
++      pdev_entry->pdev = pdev;
++
++      err = platform_device_add(pdev);
++      if (err) {
++              pr_err("Device addition failed (%d)\n", err);
++              goto exit_device_put;
++      }
++
++      mutex_lock(&pdev_list_mutex);
++      list_add_tail(&pdev_entry->list, &pdev_list);
++      mutex_unlock(&pdev_list_mutex);
++
++      return 0;
++
++exit_device_put:
++      platform_device_put(pdev);
++exit_entry_free:
++      kfree(pdev_entry);
++      return err;
++}
++
++/*
++ * Unregister and free the platform device whose id equals @cpu, if one is
++ * on pdev_list.  The list is walked and modified under pdev_list_mutex; the
++ * entry itself is freed after the mutex is released.
++ */
++static void via_cputemp_device_remove(unsigned int cpu)
++{
++      struct pdev_entry *entry, *found = NULL;
++
++      mutex_lock(&pdev_list_mutex);
++      list_for_each_entry(entry, &pdev_list, list) {
++              if (entry->pdev->id != cpu)
++                      continue;
++
++              platform_device_unregister(entry->pdev);
++              list_del(&entry->list);
++              found = entry;
++              break;
++      }
++      mutex_unlock(&pdev_list_mutex);
++
++      /* kfree(NULL) is a no-op when no entry matched. */
++      kfree(found);
++}
++
++/*
++ * Physical-CPU notifier callback: add a device on CPU_ONLINE, remove it on
++ * CPU_DEAD, ignore every other action.  Always returns NOTIFY_OK; the
++ * result of via_cputemp_device_add() is not propagated.
++ */
++static int via_cputemp_cpu_callback(struct notifier_block *nfb,
++                               unsigned long action, void *hcpu)
++{
++      unsigned int cpu = (unsigned long) hcpu;
++
++      if (action == CPU_ONLINE)
++              via_cputemp_device_add(cpu);
++      else if (action == CPU_DEAD)
++              via_cputemp_device_remove(cpu);
++
++      return NOTIFY_OK;
++}
++
++static struct notifier_block via_cputemp_cpu_notifier = {
++      .notifier_call = via_cputemp_cpu_callback,
++};
++
++/*
++ * Module init: register the platform driver and the physical-CPU notifier.
++ * Fails with -ENODEV unless running as the initial Xen domain on a CPU
++ * whose vendor is X86_VENDOR_CENTAUR; without CONFIG_ACPI_HOTPLUG_CPU it
++ * also fails with -ENODEV when pdev_list is still empty after the notifier
++ * was registered.
++ */
++static int __init via_cputemp_init(void)
++{
++      int err;
++
++      if (!is_initial_xendomain())
++              return -ENODEV;
++
++      if (cpu_data(0).x86_vendor != X86_VENDOR_CENTAUR) {
++              printk(KERN_DEBUG DRVNAME ": Not a VIA CPU\n");
++              return -ENODEV;
++      }
++
++      err = platform_driver_register(&via_cputemp_driver);
++      if (err)
++              return err;
++
++      err = register_pcpu_notifier(&via_cputemp_cpu_notifier);
++      if (err)
++              goto exit_driver_unreg;
++
++#ifndef CONFIG_ACPI_HOTPLUG_CPU
++      /* Nothing to monitor and no hotplug to add devices later. */
++      if (list_empty(&pdev_list)) {
++              unregister_pcpu_notifier(&via_cputemp_cpu_notifier);
++              err = -ENODEV;
++              goto exit_driver_unreg;
++      }
++#endif
++
++      return 0;
++
++exit_driver_unreg:
++      platform_driver_unregister(&via_cputemp_driver);
++      return err;
++}
++
++/*
++ * Module exit: drop the pcpu notifier first, then unregister and free every
++ * device still on pdev_list (under pdev_list_mutex), and finally unregister
++ * the platform driver.
++ */
++static void __exit via_cputemp_exit(void)
++{
++      struct pdev_entry *entry, *next;
++
++      unregister_pcpu_notifier(&via_cputemp_cpu_notifier);
++
++      mutex_lock(&pdev_list_mutex);
++      list_for_each_entry_safe(entry, next, &pdev_list, list) {
++              platform_device_unregister(entry->pdev);
++              list_del(&entry->list);
++              kfree(entry);
++      }
++      mutex_unlock(&pdev_list_mutex);
++
++      platform_driver_unregister(&via_cputemp_driver);
++}
++
++MODULE_AUTHOR("Harald Welte <HaraldWelte@viatech.com>");
++MODULE_DESCRIPTION("VIA CPU temperature monitor");
++MODULE_LICENSE("GPL");
++
++module_init(via_cputemp_init)
++module_exit(via_cputemp_exit)
diff --cc drivers/ide/ide-lib.c

index e386a32,e386a32..b639ac2
--- 1/drivers/ide/ide-lib.c
--- 2/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@@ -18,6 -18,6 +18,7 @@@ void ide_toggle_bounce(ide_drive_t *dri
   {
         u64 addr = BLK_BOUNCE_HIGH;     /* dma64_addr_t */
   
++#ifndef CONFIG_XEN
         if (!PCI_DMA_BUS_IS_PHYS) {
                 addr = BLK_BOUNCE_ANY;
         } else if (on && drive->media == ide_disk) {
@@@ -26,6 -26,6 +27,16 @@@
                 if (dev && dev->dma_mask)
                         addr = *dev->dma_mask;
         }
++#else
++      if (on && drive->media == ide_disk) {
++              struct device *dev = drive->hwif->dev;
++
++              if (!PCI_DMA_BUS_IS_PHYS)
++                      addr = BLK_BOUNCE_ANY;
++              else if (dev && dev->dma_mask)
++                      addr = *dev->dma_mask;
++      }
++#endif
   
         if (drive->queue)
                 blk_queue_bounce_limit(drive->queue, addr);
diff --cc drivers/idle/Kconfig

index 8489eb5,8489eb5..9d643c1
--- 1/drivers/idle/Kconfig
--- 2/drivers/idle/Kconfig
+++ b/drivers/idle/Kconfig
@@@ -10,7 -10,7 +10,7 @@@ config INTEL_IDL
           processors intel_idle does not support.
   
   menu "Memory power savings"
--depends on X86_64
++depends on X86_64 && !XEN
   
   config I7300_IDLE_IOAT_CHANNEL
         bool
diff --cc drivers/input/misc/xen-kbdfront.c

index 62bae99,62bae99..0c14d51
--- 1/drivers/input/misc/xen-kbdfront.c
--- 2/drivers/input/misc/xen-kbdfront.c
+++ b/drivers/input/misc/xen-kbdfront.c
@@@ -363,7 -363,7 +363,6 @@@ static const struct xenbus_device_id xe
   
   static struct xenbus_driver xenkbd_driver = {
         .name = "vkbd",
--      .owner = THIS_MODULE,
         .ids = xenkbd_ids,
         .probe = xenkbd_probe,
         .remove = xenkbd_remove,
diff --cc drivers/input/touchscreen/Kconfig
Simple merge
diff --cc drivers/input/touchscreen/Makefile
Simple merge
diff --cc drivers/isdn/mISDN/socket.c
Simple merge
diff --cc drivers/md/dm-mpath.c
Simple merge
diff --cc drivers/md/dm-raid45.c

index 5312a16,0000000..3a1a10d

mode 100644,000000..100644
--- 1/drivers/md/dm-raid45.c
--- /dev/null
+++ b/drivers/md/dm-raid45.c
@@@ -1,4691 -1,0 +1,4696 @@@
+ +/*
+ + * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
+ + *
+ + * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
+ + *
+ + * This file is released under the GPL.
+ + *
+ + *
+ + * Linux 2.6 Device Mapper RAID4 and RAID5 target.
+ + *
+ + * Tested-by: Intel; Marcin.Labun@intel.com, krzysztof.wojcik@intel.com
+ + *
+ + *
+ + * Supports the following ATARAID vendor solutions (and SNIA DDF):
+ + *
+ + *    Adaptec HostRAID ASR
+ + *    SNIA DDF1
+ + *    Highpoint 37x
+ + *    Highpoint 45x
+ + *    Intel IMSM
+ + *    Jmicron ATARAID
+ + *    LSI Logic MegaRAID
+ + *    NVidia RAID
+ + *    Promise FastTrack
+ + *    Silicon Image Medley
+ + *    VIA Software RAID
+ + *
+ + * via the dmraid application.
+ + *
+ + *
+ + * Features:
+ + *
+ + *    o RAID4 with dedicated and selectable parity device
+ + *    o RAID5 with rotating parity (left+right, symmetric+asymmetric)
+ + *    o recovery of out of sync device for initial
+ + *      RAID set creation or after dead drive replacement
+ + *    o run time optimization of xor algorithm used to calculate parity
+ + *
+ + *
+ + * Thanks to MD for:
+ + *    o the raid address calculation algorithm
+ + *    o the base of the biovec <-> page list copier.
+ + *
+ + *
+ + * Uses region hash to keep track of how many writes are in flight to
+ + * regions in order to use dirty log to keep state of regions to recover:
+ + *
+ + *    o clean regions (those which are synchronized
+ + *    and don't have write io in flight)
+ + *    o dirty regions (those with write io in flight)
+ + *
+ + *
+ + * On startup, any dirty regions are migrated to the
+ + * 'nosync' state and are subject to recovery by the daemon.
+ + *
+ + * See raid_ctr() for table definition.
+ + *
+ + * ANALYZEME: recovery bandwidth
+ + */
+ +
+ +static const char *version = "v0.2597k";
+ +
+ +#include "dm.h"
+ +#include "dm-memcache.h"
+ +#include "dm-raid45.h"
+ +
+ +#include <linux/kernel.h>
+ +#include <linux/vmalloc.h>
+ +#include <linux/raid/xor.h>
+ +#include <linux/slab.h>
+ +
+ +#include <linux/bio.h>
+ +#include <linux/dm-io.h>
+ +#include <linux/dm-dirty-log.h>
+ +#include <linux/dm-region-hash.h>
+ +
+ +
+ +/*
+ + * Configurable parameters
+ + */
+ +
+ +/* Minimum/maximum and default # of selectable stripes. */
+ +#define       STRIPES_MIN             8
+ +#define       STRIPES_MAX             16384
+ +#define       STRIPES_DEFAULT         80
+ +
+ +/* Minimum, maximum and default chunk size in sectors if not set in constructor. */
+ +#define       CHUNK_SIZE_MIN          8
+ +#define       CHUNK_SIZE_MAX          16384
+ +#define       CHUNK_SIZE_DEFAULT      64
+ +
+ +/* Default io size in sectors if not set in constructor. */
+ +#define       IO_SIZE_MIN             CHUNK_SIZE_MIN
+ +#define       IO_SIZE_DEFAULT         IO_SIZE_MIN
+ +
+ +/* Recover io size default in sectors. */
+ +#define       RECOVER_IO_SIZE_MIN             64
+ +#define       RECOVER_IO_SIZE_DEFAULT         256
+ +
+ +/* Default, minimum and maximum percentage of recover io bandwidth. */
+ +#define       BANDWIDTH_DEFAULT       10
+ +#define       BANDWIDTH_MIN           1
+ +#define       BANDWIDTH_MAX           100
+ +
+ +/* # of parallel recovered regions */
+ +#define RECOVERY_STRIPES_MIN  1
+ +#define RECOVERY_STRIPES_MAX  64
+ +#define RECOVERY_STRIPES_DEFAULT      RECOVERY_STRIPES_MIN
+ +/*
+ + * END Configurable parameters
+ + */
+ +
+ +#define       TARGET  "dm-raid45"
+ +#define       DAEMON  "kraid45d"
+ +#define       DM_MSG_PREFIX   TARGET
+ +
+ +#define       SECTORS_PER_PAGE        (PAGE_SIZE >> SECTOR_SHIFT)
+ +
+ +/* Amount/size for __xor(). */
+ +#define       XOR_SIZE        PAGE_SIZE
+ +
+ +/* Ticks to run xor_speed() test for. */
+ +#define       XOR_SPEED_TICKS 5
+ +
+ +/*
+ + * Check value in range (inclusive on both ends).  Every argument is
+ + * parenthesized so expressions such as "x & 7" are compared as a whole.
+ + * Note that @i is evaluated twice.
+ + */
+ +#define       range_ok(i, min, max)   ((i) >= (min) && (i) <= (max))
+ +
+ +/* Structure access macros. */
+ +/* Derive raid_set from stripe_cache pointer. */
+ +#define       RS(x)   container_of(x, struct raid_set, sc)
+ +
+ +/* Page reference. */
+ +#define PAGE(stripe, p)  ((stripe)->obj[p].pl->page)
+ +
+ +/* Stripe chunk reference. */
+ +#define CHUNK(stripe, p) ((stripe)->chunk + p)
+ +
+ +/* Bio list reference. */
+ +#define       BL(stripe, p, rw)       (stripe->chunk[p].bl + rw)
+ +#define       BL_CHUNK(chunk, rw)     (chunk->bl + rw)
+ +
+ +/* Page list reference. */
+ +#define       PL(stripe, p)           (stripe->obj[p].pl)
+ +/* END: structure access macros. */
+ +
+ +/* Factor out to dm-bio-list.h */
+ +/*
+ + * Insert @bio at the front of @bl.  If the list has no tail yet, @bio
+ + * becomes the tail as well.
+ + */
+ +static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
+ +{
+ +      struct bio *old_head = bl->head;
+ +
+ +      bio->bi_next = old_head;
+ +      bl->head = bio;
+ +
+ +      if (bl->tail == NULL)
+ +              bl->tail = bio;
+ +}
+ +
+ +/* Factor out to dm.h */
+ +/*
+ + * Set the target's error string and return @ret (TI_ERR returns -EINVAL)
+ + * from the calling function.  Both macros rely on a variable named "ti"
+ + * being in scope at the call site.
+ + * NOTE(review): the expansion already ends in ';', so a call followed by
+ + * ';' yields an extra empty statement and cannot be used as the body of
+ + * an if that has an else branch.
+ + */
+ +#define TI_ERR_RET(str, ret) \
+ +      do { ti->error = str; return ret; } while (0);
+ +#define TI_ERR(str)     TI_ERR_RET(str, -EINVAL)
+ +
+ +/* Macro to define access IO flags access inline functions. */
+ +#define       BITOPS(name, what, var, flag) \
+ +static inline int TestClear ## name ## what(struct var *v) \
+ +{ return test_and_clear_bit(flag, &v->io.flags); } \
+ +static inline int TestSet ## name ## what(struct var *v) \
+ +{ return test_and_set_bit(flag, &v->io.flags); } \
+ +static inline void Clear ## name ## what(struct var *v) \
+ +{ clear_bit(flag, &v->io.flags); } \
+ +static inline void Set ## name ## what(struct var *v) \
+ +{ set_bit(flag, &v->io.flags); } \
+ +static inline int name ## what(struct var *v) \
+ +{ return test_bit(flag, &v->io.flags); }
+ +
+ +/*-----------------------------------------------------------------
+ + * Stripe cache
+ + *
+ + * Cache for all reads and writes to raid sets (operational or degraded)
+ + *
+ + * We need to run all data to and from a RAID set through this cache,
+ + * because parity chunks need to get calculated from data chunks
+ + * or, in the degraded/resynchronization case, missing chunks need
+ + * to be reconstructed using the other chunks of the stripe.
+ + *---------------------------------------------------------------*/
+ +/* Unique kmem cache name suffix # counter. */
+ +static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
+ +
+ +/* A chunk within a stripe (holds bios hanging off). */
+ +/* IO status flags for chunks of a stripe. */
+ +enum chunk_flags {
+ +      CHUNK_DIRTY,            /* Pages of chunk dirty; need writing. */
+ +      CHUNK_ERROR,            /* IO error on any chunk page. */
+ +      CHUNK_IO,               /* Allow/prohibit IO on chunk pages. */
+ +      CHUNK_LOCKED,           /* Chunk pages locked during IO. */
+ +      CHUNK_MUST_IO,          /* Chunk must io. */
+ +      CHUNK_UNLOCK,           /* Enforce chunk unlock. */
+ +      CHUNK_UPTODATE,         /* Chunk pages are uptodate. */
+ +};
+ +
+ +enum bl_type {
+ +      WRITE_QUEUED = WRITE + 1,
+ +      WRITE_MERGED,
+ +      NR_BL_TYPES,    /* Must be last one! */
+ +};
+ +struct stripe_chunk {
+ +      atomic_t cnt;           /* Reference count. */
+ +      struct stripe *stripe;  /* Backpointer to stripe for endio(). */
+ +      /* Bio lists for reads, writes, and writes merged. */
+ +      struct bio_list bl[NR_BL_TYPES];
+ +      struct {
+ +              unsigned long flags; /* IO status flags. */
+ +      } io;
+ +};
+ +
+ +/* Define chunk bit operations. */
+ +BITOPS(Chunk, Dirty,   stripe_chunk, CHUNK_DIRTY)
+ +BITOPS(Chunk, Error,   stripe_chunk, CHUNK_ERROR)
+ +BITOPS(Chunk, Io,      stripe_chunk, CHUNK_IO)
+ +BITOPS(Chunk, Locked,  stripe_chunk, CHUNK_LOCKED)
+ +BITOPS(Chunk, MustIo,  stripe_chunk, CHUNK_MUST_IO)
+ +BITOPS(Chunk, Unlock,  stripe_chunk, CHUNK_UNLOCK)
+ +BITOPS(Chunk, Uptodate,        stripe_chunk, CHUNK_UPTODATE)
+ +
+ +/*
+ + * Stripe linked list indexes. Keep order, because the stripe
+ + * and the stripe cache rely on the first 3!
+ + */
+ +enum list_types {
+ +      LIST_FLUSH,     /* Stripes to flush for io. */
+ +      LIST_ENDIO,     /* Stripes to endio. */
+ +      LIST_LRU,       /* Least recently used stripes. */
+ +      SC_NR_LISTS,    /* # of lists in stripe cache. */
+ +      LIST_HASH = SC_NR_LISTS,        /* Hashed stripes. */
+ +      LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
+ +      STRIPE_NR_LISTS,/* To size array in struct stripe. */
+ +};
+ +
+ +/* Addressing region recovery. */
+ +struct recover_addr {
+ +      struct dm_region *reg;  /* Actual region to recover. */
+ +      sector_t pos;   /* Position within region to recover. */
+ +      sector_t end;   /* End of region to recover. */
+ +};
+ +
+ +/* A stripe: the io object to handle all reads and writes to a RAID set. */
+ +struct stripe {
+ +      atomic_t cnt;                   /* Reference count. */
+ +      struct stripe_cache *sc;        /* Backpointer to stripe cache. */
+ +
+ +      /*
+ +       * 4 linked lists:
+ +       *   o io list to flush io
+ +       *   o endio list
+ +       *   o LRU list to put stripes w/o reference count on
+ +       *   o stripe cache hash
+ +       */
+ +      struct list_head lists[STRIPE_NR_LISTS];
+ +
+ +      sector_t key;    /* Hash key. */
+ +      region_t region; /* Region stripe is mapped to. */
+ +
+ +      struct {
+ +              unsigned long flags;    /* Stripe state flags (see below). */
+ +
+ +              /*
+ +               * Pending ios in flight:
+ +               *
+ +               * used to control move of stripe to endio list
+ +               */
+ +              atomic_t pending;
+ +
+ +              /* Sectors to read and write for multi page stripe sets. */
+ +              unsigned size;
+ +      } io;
+ +
+ +      /* Address region recovery. */
+ +      struct recover_addr *recover;
+ +
+ +      /* Lock on stripe (Future: for clustering). */
+ +      void *lock;
+ +
+ +      struct {
+ +              unsigned short parity;  /* Parity chunk index. */
+ +              short recover;          /* Recovery chunk index. */
+ +      } idx;
+ +
+ +      /*
+ +       * This stripe's memory cache object (dm-mem-cache);
+ +       * i.e. the io chunk pages.
+ +       */
+ +      struct dm_mem_cache_object *obj;
+ +
+ +      /* Array of stripe sets (dynamically allocated). */
+ +      struct stripe_chunk chunk[0];
+ +};
+ +
+ +/* States stripes can be in (flags field). */
+ +enum stripe_states {
+ +      STRIPE_ERROR,           /* io error on stripe. */
+ +      STRIPE_MERGED,          /* Writes got merged to be written. */
+ +      STRIPE_RBW,             /* Read-before-write stripe. */
+ +      STRIPE_RECONSTRUCT,     /* Reconstruct of a missing chunk required. */
+ +      STRIPE_RECONSTRUCTED,   /* Reconstructed of a missing chunk. */
+ +      STRIPE_RECOVER,         /* Stripe used for RAID set recovery. */
+ +};
+ +
+ +/* Define stripe bit operations. */
+ +BITOPS(Stripe, Error,       stripe, STRIPE_ERROR)
+ +BITOPS(Stripe, Merged,        stripe, STRIPE_MERGED)
+ +BITOPS(Stripe, RBW,         stripe, STRIPE_RBW)
+ +BITOPS(Stripe, Reconstruct,   stripe, STRIPE_RECONSTRUCT)
+ +BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED)
+ +BITOPS(Stripe, Recover,             stripe, STRIPE_RECOVER)
+ +
+ +/* A stripe hash. */
+ +struct stripe_hash {
+ +      struct list_head *hash;
+ +      unsigned buckets;
+ +      unsigned mask;
+ +      unsigned prime;
+ +      unsigned shift;
+ +};
+ +
+ +enum sc_lock_types {
+ +      LOCK_ENDIO,     /* Protect endio list. */
+ +      NR_LOCKS,       /* To size array in struct stripe_cache. */
+ +};
+ +
+ +/* A stripe cache. */
+ +struct stripe_cache {
+ +      /* Stripe hash. */
+ +      struct stripe_hash hash;
+ +
+ +      spinlock_t locks[NR_LOCKS];     /* Locks to protect lists. */
+ +
+ +      /* Stripes with io to flush, stripes to endio and LRU lists. */
+ +      struct list_head lists[SC_NR_LISTS];
+ +
+ +      /* Slab cache to allocate stripes from. */
+ +      struct {
+ +              struct kmem_cache *cache;       /* Cache itself. */
+ +              char name[32];  /* Unique name. */
+ +      } kc;
+ +
+ +      struct dm_io_client *dm_io_client; /* dm-io client resource context. */
+ +
+ +      /* dm-mem-cache client resource context. */
+ +      struct dm_mem_cache_client *mem_cache_client;
+ +
+ +      int stripes_parm;           /* # stripes parameter from constructor. */
+ +      atomic_t stripes;           /* actual # of stripes in cache. */
+ +      atomic_t stripes_to_set;    /* # of stripes to resize cache to. */
+ +      atomic_t stripes_last;      /* last # of stripes in cache. */
+ +      atomic_t active_stripes;    /* actual # of active stripes in cache. */
+ +
+ +      /* REMOVEME: */
+ +      atomic_t active_stripes_max; /* actual # of active stripes in cache. */
+ +};
+ +
+ +/* Flag specs for raid_dev */ ;
+ +enum raid_dev_flags {
+ +      DEV_FAILED,     /* Device failed. */
+ +      DEV_IO_QUEUED,  /* Io got queued to device. */
+ +};
+ +
+ +/* The raid device in a set. */
+ +struct raid_dev {
+ +      struct dm_dev *dev;
+ +      sector_t start;         /* Offset to map to. */
+ +      struct {        /* Using struct to be able to BITOPS(). */
+ +              unsigned long flags;    /* raid_dev_flags. */
+ +      } io;
+ +};
+ +
+ +BITOPS(Dev, Failed,   raid_dev, DEV_FAILED)
+ +BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)
+ +
+ +/* Flags spec for raid_set. */
+ +enum raid_set_flags {
+ +      RS_CHECK_OVERWRITE,     /* Check for chunk overwrites. */
+ +      RS_DEAD,                /* RAID set inoperational. */
+ +      RS_DEAD_ENDIO_MESSAGE,  /* RAID set dead endio one-off message. */
+ +      RS_DEGRADED,            /* Io errors on RAID device. */
+ +      RS_DEVEL_STATS,         /* REMOVEME: display status information. */
+ +      RS_ENFORCE_PARITY_CREATION,/* Enforce parity creation. */
+ +      RS_PROHIBIT_WRITES,     /* Prohibit writes on device failure. */
+ +      RS_RECOVER,             /* Do recovery. */
+ +      RS_RECOVERY_BANDWIDTH,  /* Allow recovery bandwidth (delayed bios). */
+ +      RS_SC_BUSY,             /* Stripe cache busy -> send an event. */
+ +      RS_SUSPEND,             /* Suspend RAID set. */
+ +};
+ +
+ +/* REMOVEME: devel stats counters. */
+ +enum stats_types {
+ +      S_BIOS_READ,
+ +      S_BIOS_ADDED_READ,
+ +      S_BIOS_ENDIO_READ,
+ +      S_BIOS_WRITE,
+ +      S_BIOS_ADDED_WRITE,
+ +      S_BIOS_ENDIO_WRITE,
+ +      S_CAN_MERGE,
+ +      S_CANT_MERGE,
+ +      S_CONGESTED,
+ +      S_DM_IO_READ,
+ +      S_DM_IO_WRITE,
+ +      S_BANDWIDTH,
+ +      S_BARRIER,
+ +      S_BIO_COPY_PL_NEXT,
+ +      S_DEGRADED,
+ +      S_DELAYED_BIOS,
+ +      S_FLUSHS,
+ +      S_HITS_1ST,
+ +      S_IOS_POST,
+ +      S_INSCACHE,
+ +      S_MAX_LOOKUP,
+ +      S_CHUNK_LOCKED,
+ +      S_NO_BANDWIDTH,
+ +      S_NOT_CONGESTED,
+ +      S_NO_RW,
+ +      S_NOSYNC,
+ +      S_OVERWRITE,
+ +      S_PROHIBITCHUNKIO,
+ +      S_RECONSTRUCT_EI,
+ +      S_RECONSTRUCT_DEV,
+ +      S_RECONSTRUCT_SET,
+ +      S_RECONSTRUCTED,
+ +      S_REQUEUE,
+ +      S_STRIPE_ERROR,
+ +      S_SUM_DELAYED_BIOS,
+ +      S_XORS,
+ +      S_NR_STATS,     /* # of stats counters. Must be last! */
+ +};
+ +
+ +/* Status type -> string mappings. */
+ +struct stats_map {
+ +      const enum stats_types type;
+ +      const char *str;
+ +};
+ +
+ +static struct stats_map stats_map[] = {
+ +      { S_BIOS_READ, "r=" },
+ +      { S_BIOS_ADDED_READ, "/" },
+ +      { S_BIOS_ENDIO_READ, "/" },
+ +      { S_BIOS_WRITE, " w=" },
+ +      { S_BIOS_ADDED_WRITE, "/" },
+ +      { S_BIOS_ENDIO_WRITE, "/" },
+ +      { S_DM_IO_READ, " rc=" },
+ +      { S_DM_IO_WRITE, " wc=" },
+ +      { S_BANDWIDTH, "\nbw=" },
+ +      { S_NO_BANDWIDTH, " no_bw=" },
+ +      { S_BARRIER, "\nbarrier=" },
+ +      { S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
+ +      { S_CAN_MERGE, "\nmerge=" },
+ +      { S_CANT_MERGE, "/no_merge=" },
+ +      { S_CHUNK_LOCKED, "\nchunk_locked=" },
+ +      { S_CONGESTED, "\ncgst=" },
+ +      { S_NOT_CONGESTED, "/not_cgst=" },
+ +      { S_DEGRADED, "\ndegraded=" },
+ +      { S_DELAYED_BIOS, "\ndel_bios=" },
+ +      { S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
+ +      { S_FLUSHS, "\nflushs=" },
+ +      { S_HITS_1ST, "\nhits_1st=" },
+ +      { S_IOS_POST, " ios_post=" },
+ +      { S_INSCACHE, " inscache=" },
+ +      { S_MAX_LOOKUP, " maxlookup=" },
+ +      { S_NO_RW, "\nno_rw=" },
+ +      { S_NOSYNC, " nosync=" },
+ +      { S_OVERWRITE, " ovr=" },
+ +      { S_PROHIBITCHUNKIO, " prhbt_io=" },
+ +      { S_RECONSTRUCT_EI, "\nrec_ei=" },
+ +      { S_RECONSTRUCT_DEV, " rec_dev=" },
+ +      { S_RECONSTRUCT_SET, " rec_set=" },
+ +      { S_RECONSTRUCTED, " rec=" },
+ +      { S_REQUEUE, " requeue=" },
+ +      { S_STRIPE_ERROR, " stripe_err=" },
+ +      { S_XORS, " xors=" },
+ +};
+ +
+ +/*
+ + * A RAID set.
+ + */
+ +#define       dm_rh_client    dm_region_hash
+ +enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
+ +typedef void (*xor_function_t)(unsigned count, unsigned long **data);
+ +struct raid_set {
+ +      struct dm_target *ti;   /* Target pointer. */
+ +
+ +      struct {
+ +              unsigned long flags;    /* State flags. */
+ +              struct mutex in_lock;   /* Protects central input list below. */
+ +              struct mutex xor_lock;  /* Protects xor algorithm set. */
+ +              struct bio_list in;     /* Pending ios (central input list). */
+ +              struct bio_list work;   /* ios work set. */
+ +              wait_queue_head_t suspendq;     /* suspend synchronization. */
+ +              atomic_t in_process;    /* counter of queued bios (suspendq). */
+ +              atomic_t in_process_max;/* counter of queued bios max. */
+ +
+ +              /* io work. */
+ +              struct workqueue_struct *wq;
+ +              struct delayed_work dws_do_raid;        /* For main worker. */
+ +              struct work_struct ws_do_table_event;   /* For event worker. */
+ +      } io;
+ +
+ +      /* Stripe locking abstraction. */
+ +      struct dm_raid45_locking_type *locking;
+ +
+ +      struct stripe_cache sc; /* Stripe cache for this set. */
+ +
+ +      /* Xor optimization. */
+ +      struct {
+ +              struct xor_func *f;
+ +              unsigned chunks;
+ +              unsigned speed;
+ +      } xor;
+ +
+ +      /* Recovery parameters. */
+ +      struct recover {
+ +              struct dm_dirty_log *dl;        /* Dirty log. */
+ +              struct dm_rh_client *rh;        /* Region hash. */
+ +
+ +              struct dm_io_client *dm_io_client; /* recovery dm-io client. */
+ +              /* dm-mem-cache client resource context for recovery stripes. */
+ +              struct dm_mem_cache_client *mem_cache_client;
+ +
+ +              struct list_head stripes;       /* List of recovery stripes. */
+ +
+ +              region_t nr_regions;
+ +              region_t nr_regions_to_recover;
+ +              region_t nr_regions_recovered;
+ +              unsigned long start_jiffies;
+ +              unsigned long end_jiffies;
+ +
+ +              unsigned bandwidth;      /* Recovery bandwidth [%]. */
+ +              unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
+ +              unsigned bandwidth_parm; /*  " constructor parm. */
+ +              unsigned io_size;        /* recovery io size <= region size. */
+ +              unsigned io_size_parm;   /* recovery io size ctr parameter. */
+ +              unsigned recovery;       /* Recovery allowed/prohibited. */
+ +              unsigned recovery_stripes; /* # of parallel recovery stripes. */
+ +
+ +              /* recovery io throttling. */
+ +              atomic_t io_count[IO_NR_COUNT]; /* counter recover/regular io.*/
+ +              unsigned long last_jiffies;
+ +      } recover;
+ +
+ +      /* RAID set parameters. */
+ +      struct {
+ +              struct raid_type *raid_type;    /* RAID type (eg, RAID4). */
+ +              unsigned raid_parms;    /* # variable raid parameters. */
+ +
+ +              unsigned chunk_size;    /* Sectors per chunk. */
+ +              unsigned chunk_size_parm;
+ +              unsigned chunk_shift;   /* rsector chunk size shift. */
+ +
+ +              unsigned io_size;       /* Sectors per io. */
+ +              unsigned io_size_parm;
+ +              unsigned io_mask;       /* Mask for bio_copy_page_list(). */
+ +              unsigned io_inv_mask;   /* Mask for raid_address(). */
+ +
+ +              sector_t sectors_per_dev;       /* Sectors per device. */
+ +
+ +              atomic_t failed_devs;           /* Amount of devices failed. */
+ +
+ +              /* Index of device to initialize. */
+ +              int dev_to_init;
+ +              int dev_to_init_parm;
+ +
+ +              /* Raid devices dynamically allocated. */
+ +              unsigned raid_devs;     /* # of RAID devices below. */
+ +              unsigned data_devs;     /* # of RAID data devices. */
+ +
+ +              int ei;         /* index of failed RAID device. */
+ +
+ +              /* Index of dedicated parity device (i.e. RAID4). */
+ +              int pi;
+ +              int pi_parm;    /* constructor parm for status output. */
+ +      } set;
+ +
+ +      /* REMOVEME: devel stats counters. */
+ +      atomic_t stats[S_NR_STATS];
+ +
+ +      /* Dynamically allocated temporary pointers for xor(). */
+ +      unsigned long **data;
+ +
+ +      /* Dynamically allocated RAID devices. Alignment? */
+ +      struct raid_dev dev[0];
+ +};
+ +
+ +/*
+ + * Define RAID set bit operations.
+ + *
+ + * One BITOPS() invocation per RS_* flag. Accessors used further down
+ + * (e.g. RSSuspend(), SetRSProhibitWrites(), TestSetRSDead()) are
+ + * presumably generated by these invocations -- the macro body lives
+ + * earlier in the file (TODO confirm). The macro is undefined afterwards
+ + * so no further accessors can be generated below this point.
+ + */
+ +BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
+ +BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
+ +BITOPS(RS, Dead, raid_set, RS_DEAD)
+ +BITOPS(RS, DeadEndioMessage, raid_set, RS_DEAD_ENDIO_MESSAGE)
+ +BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
+ +BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
+ +BITOPS(RS, EnforceParityCreation, raid_set, RS_ENFORCE_PARITY_CREATION)
+ +BITOPS(RS, ProhibitWrites, raid_set, RS_PROHIBIT_WRITES)
+ +BITOPS(RS, Recover, raid_set, RS_RECOVER)
+ +BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
+ +BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
+ +#undef BITOPS
+ +
+ +/*-----------------------------------------------------------------
+ + * Raid-4/5 set structures.
+ + *---------------------------------------------------------------*/
+ +/* RAID level definitions (stored in struct raid_type's level member). */
+ +enum raid_level {
+ +      raid4,
+ +      raid5,
+ +};
+ +
+ +/*
+ + * Symmetric/Asymmetric, Left/Right parity rotating algorithms.
+ + *
+ + * 'none' is the value used for the dedicated parity layout.
+ + */
+ +enum raid_algorithm {
+ +      none,
+ +      left_asym,
+ +      right_asym,
+ +      left_sym,
+ +      right_sym,
+ +};
+ +
+ +/* Static description of one supported RAID type. */
+ +struct raid_type {
+ +      const char *name;               /* Short type name (e.g. "raid5_la"). */
+ +      const char *descr;              /* Descriptor text for logging. */
+ +      const unsigned parity_devs;     /* # of parity devices. */
+ +      const unsigned minimal_devs;    /* minimal # of devices in set. */
+ +      const enum raid_level level;            /* RAID level. */
+ +      const enum raid_algorithm algorithm;    /* RAID algorithm. */
+ +};
+ +
+ +/*
+ + * Supported raid types and properties.
+ + *
+ + * Columns follow struct raid_type member order:
+ + * name, descr, parity_devs, minimal_devs, level, algorithm.
+ + */
+ +static struct raid_type raid_types[] = {
+ +      {"raid4",    "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
+ +      {"raid5_la", "RAID5 (left asymmetric)",       1, 3, raid5, left_asym},
+ +      {"raid5_ra", "RAID5 (right asymmetric)",      1, 3, raid5, right_asym},
+ +      {"raid5_ls", "RAID5 (left symmetric)",        1, 3, raid5, left_sym},
+ +      {"raid5_rs", "RAID5 (right symmetric)",       1, 3, raid5, right_sym},
+ +};
+ +
+ +/*
+ + * Address as calculated by raid_address().
+ + *
+ + * The key is what the stripe hash is indexed with; di/pi select the
+ + * data and parity device of the set for that address.
+ + */
+ +struct raid_address {
+ +      sector_t key;           /* Hash key (address of stripe % chunk_size). */
+ +      unsigned di, pi;        /* Data and parity disks index. */
+ +};
+ +
+ +/* REMOVEME: zero every development statistics counter of a RAID set. */
+ +static void stats_reset(struct raid_set *rs)
+ +{
+ +      unsigned i;
+ +
+ +      for (i = 0; i < S_NR_STATS; i++)
+ +              atomic_set(&rs->stats[i], 0);
+ +}
+ +
+ +/*----------------------------------------------------------------
+ + * RAID set management routines.
+ + *--------------------------------------------------------------*/
+ +/*
+ + * Begin small helper functions.
+ + */
+ +/*
+ + * No-op wakeup callback.
+ + *
+ + * Original note: no need to be called from region hash indirectly at
+ + * dm_rh_dec(). Presumably handed to the region hash where no worker
+ + * wakeup is wanted -- TODO confirm at the registration site.
+ + */
+ +static void wake_dummy(void *context) {}
+ +
+ +/* Return # of io references currently held (rs->io.in_process). */
+ +static int io_ref(struct raid_set *rs)
+ +{
+ +      return atomic_read(&rs->io.in_process);
+ +}
+ +
+ +/*
+ + * Take an io reference.
+ + *
+ + * Also tracks the highest reference count seen so far in
+ + * rs->io.in_process_max (read and update are two separate atomic
+ + * operations, so the maximum is best effort only).
+ + */
+ +static void io_get(struct raid_set *rs)
+ +{
+ +      int in_flight = atomic_inc_return(&rs->io.in_process);
+ +
+ +      /* REMOVEME: max. */
+ +      if (in_flight > atomic_read(&rs->io.in_process_max))
+ +              atomic_set(&rs->io.in_process_max, in_flight);
+ +}
+ +
+ +/*
+ + * Drop an io reference.
+ + *
+ + * Wakes waiters on rs->io.suspendq once the last reference is gone;
+ + * a count that dropped below zero is a bug.
+ + */
+ +static void io_put(struct raid_set *rs)
+ +{
+ +      /* Intel: rebuild data corrupter? */
+ +      if (!atomic_dec_and_test(&rs->io.in_process)) {
+ +              BUG_ON(io_ref(rs) < 0);
+ +              return;
+ +      }
+ +
+ +      wake_up(&rs->io.suspendq);
+ +}
+ +
+ +/*
+ + * Wait until all io has been processed.
+ + *
+ + * Sleeps on rs->io.suspendq until io_ref() reads zero; io_put() wakes
+ + * that queue when it drops the last reference.
+ + */
+ +static void wait_ios(struct raid_set *rs)
+ +{
+ +      wait_event(rs->io.suspendq, !io_ref(rs));
+ +}
+ +
+ +/*
+ + * Queue (optionally delayed) io work.
+ + *
+ + * @delay is handed to queue_delayed_work() unchanged.
+ + */
+ +static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
+ +{
+ +      queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay);
+ +}
+ +
+ +/*
+ + * Queue io work immediately (called from region hash too).
+ + *
+ + * @context is the struct raid_set; the void * signature lets this
+ + * function serve as a generic wakeup callback. The work item embedded
+ + * in the delayed work is queued directly, i.e. without a delay.
+ + */
+ +static void wake_do_raid(void *context)
+ +{
+ +      struct raid_set *rs = context;
+ +
+ +      queue_work(rs->io.wq, &rs->io.dws_do_raid.work);
+ +}
+ +
+ +/*
+ + * Calculate device sector offset.
+ + *
+ + * Takes the bio's start sector and divides it by the number of data
+ + * devices; sector_div() updates its first argument, which is returned.
+ + */
+ +static sector_t _sector(struct raid_set *rs, struct bio *bio)
+ +{
+ +      sector_t sector = bio->bi_sector;
+ +
+ +      sector_div(sector, rs->set.data_devs);
+ +      return sector;
+ +}
+ +
+ +/* Return # of active stripes in stripe cache (sc->active_stripes). */
+ +static int sc_active(struct stripe_cache *sc)
+ +{
+ +      return atomic_read(&sc->active_stripes);
+ +}
+ +
+ +/*
+ + * Stripe cache busy indicator.
+ + *
+ + * The cache counts as busy once the number of active stripes exceeds
+ + * the total number of stripes minus half of STRIPES_MIN.
+ + */
+ +static int sc_busy(struct raid_set *rs)
+ +{
+ +      return sc_active(&rs->sc) >
+ +             atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2);
+ +}
+ +
+ +/* Set chunks states. */
+ +enum chunk_dirty_type { CLEAN, DIRTY, ERROR };
+ +
+ +/*
+ + * Put a chunk into the CLEAN, DIRTY or ERROR state.
+ + *
+ + * ERROR only flags the chunk and its stripe as failed. CLEAN and DIRTY
+ + * adjust the dirty flag and then mark the chunk uptodate, allow io on
+ + * it and clear a previous error. Any other type is a bug.
+ + */
+ +static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type)
+ +{
+ +      if (type == ERROR) {
+ +              SetChunkError(chunk);
+ +              SetStripeError(chunk->stripe);
+ +              return;
+ +      }
+ +
+ +      if (type == CLEAN)
+ +              ClearChunkDirty(chunk);
+ +      else if (type == DIRTY)
+ +              SetChunkDirty(chunk);
+ +      else
+ +              BUG();
+ +
+ +      SetChunkUptodate(chunk);
+ +      SetChunkIo(chunk);
+ +      ClearChunkError(chunk);
+ +}
+ +
+ +/*
+ + * Return region state for a sector.
+ + *
+ + * Maps @sector to its region in the recovery region hash and returns 1
+ + * if any of the bits in @state is set for that region, 0 otherwise.
+ + */
+ +static int region_state(struct raid_set *rs, sector_t sector,
+ +                      enum dm_rh_region_states state)
+ +{
+ +      struct dm_rh_client *rh = rs->recover.rh;
+ +      region_t region = dm_rh_sector_to_region(rh, sector);
+ +
+ +      /* NOTE(review): 3rd argument presumably means "may block" -- confirm. */
+ +      return !!(dm_rh_get_state(rh, region, 1) & state);
+ +}
+ +
+ +/*
+ + * Return true in case a chunk should be read/written
+ + *
+ + * Conditions to read/write:
+ + *    o chunk not uptodate
+ + *    o chunk dirty
+ + *
+ + * Conditions to avoid io:
+ + *    o io already ongoing on chunk
+ + *    o io explicitly prohibited
+ + *
+ + * Not a pure predicate: a positive answer sets the MustIo flag and the
+ + * next call consumes it again, so the order and number of calls matter.
+ + */
+ +static int chunk_io(struct stripe_chunk *chunk)
+ +{
+ +      /* 2nd run optimization (flag set below on first run). */
+ +      if (TestClearChunkMustIo(chunk))
+ +              return 1;
+ +
+ +      /* Avoid io if prohibited or a locked chunk. */
+ +      if (!ChunkIo(chunk) || ChunkLocked(chunk))
+ +              return 0;
+ +
+ +      if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) {
+ +              SetChunkMustIo(chunk); /* 2nd run optimization. */
+ +              return 1;
+ +      }
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Call @f_io on each chunk of @stripe needing io unless its device failed.
+ + *
+ + * Returns the number of chunks @f_io has been called for.
+ + */
+ +static unsigned for_each_io_dev(struct stripe *stripe,
+ +                              void (*f_io)(struct stripe *stripe, unsigned p))
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      unsigned dev, called = 0;
+ +
+ +      for (dev = 0; dev < rs->set.raid_devs; dev++) {
+ +              /* chunk_io() changes chunk flags, hence it is tested first. */
+ +              if (!chunk_io(CHUNK(stripe, dev)) || DevFailed(rs->dev + dev))
+ +                      continue;
+ +
+ +              f_io(stripe, dev);
+ +              called++;
+ +      }
+ +
+ +      return called;
+ +}
+ +
+ +/*
+ + * Index of device to calculate parity on.
+ + *
+ + * Either the parity device index *or* the selected
+ + * device to init after a spare replacement.
+ + *
+ + * *sync is set to true if the stripe's region is neither NOSYNC nor
+ + * RECOVERING. Returns -1 for a RAID5 recovery stripe which has no
+ + * particular device to reconstruct.
+ + */
+ +static int dev_for_parity(struct stripe *stripe, int *sync)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      int unsynced = region_state(rs, stripe->key,
+ +                                  DM_RH_NOSYNC | DM_RH_RECOVERING);
+ +
+ +      *sync = !unsynced;
+ +
+ +      /* Reconstruct a particular device ?. */
+ +      if (unsynced && rs->set.dev_to_init > -1)
+ +              return rs->set.dev_to_init;
+ +
+ +      /* Dedicated parity device. */
+ +      if (rs->set.raid_type->level == raid4)
+ +              return rs->set.pi;
+ +
+ +      return StripeRecover(stripe) ? -1 : stripe->idx.parity;
+ +}
+ +
+ +/*
+ + * RAID set congested function.
+ + *
+ + * The set is congested if the stripe cache is busy, the set is
+ + * suspended, writes are prohibited, or any component device reports
+ + * congestion for @bdi_bits. @congested_data is the struct raid_set.
+ + */
+ +static int rs_congested(void *congested_data, int bdi_bits)
+ +{
+ +      struct raid_set *rs = congested_data;
+ +      int r = 1;
+ +
+ +      if (!sc_busy(rs) && !RSSuspend(rs) && !RSProhibitWrites(rs)) {
+ +              unsigned p = rs->set.raid_devs;
+ +
+ +              /* Stop at the first overloaded component device. */
+ +              for (r = 0; !r && p--; ) {
+ +                      struct request_queue *q =
+ +                              bdev_get_queue(rs->dev[p].dev->bdev);
+ +
+ +                      r |= bdi_congested(&q->backing_dev_info, bdi_bits);
+ +              }
+ +      }
+ +
+ +      /* REMOVEME: statistics. */
+ +      atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
+ +      return r;
+ +}
+ +
+ +/*
+ + * RAID device degrade check.
+ + *
+ + * Marks device @p of @rs as failed (no-op if it already was), logs once
+ + * when more devices failed than the RAID type has parity devices (set
+ + * dead) and once when the set gets degraded, then prohibits writes and
+ + * schedules the table event worker.
+ + *
+ + * @stripe may be NULL, in which case the error is reported as faked.
+ + */
+ +static void rs_check_degrade_dev(struct raid_set *rs,
+ +                               struct stripe *stripe, unsigned p)
+ +{
+ +      char buf[BDEVNAME_SIZE];
+ +
+ +      if (TestSetDevFailed(rs->dev + p))
+ +              return;
+ +
+ +      /* Throw an event in case of member device errors. */
+ +      if ((atomic_inc_return(&rs->set.failed_devs) >
+ +           rs->set.raid_type->parity_devs) &&
+ +           !TestSetRSDead(rs)) {
+ +              /*
+ +               * Display RAID set dead message once.
+ +               *
+ +               * Uses its own index so that the parameter p, which is
+ +               * still needed below, isn't shadowed.
+ +               */
+ +              unsigned d;
+ +
+ +              DMERR("FATAL: too many devices failed -> RAID set broken");
+ +              for (d = 0; d < rs->set.raid_devs; d++) {
+ +                      if (DevFailed(rs->dev + d))
+ +                              DMERR("device /dev/%s failed",
+ +                                    bdevname(rs->dev[d].dev->bdev, buf));
+ +              }
+ +      }
+ +
+ +      /* Only log the first member error. */
+ +      if (!TestSetRSDegraded(rs)) {
+ +              /* Store index for recovery. */
+ +              rs->set.ei = p;
+ +              DMERR("CRITICAL: %sio error on device /dev/%s "
+ +                    "in region=%llu; DEGRADING RAID set\n",
+ +                    stripe ? "" : "FAKED ",
+ +                    bdevname(rs->dev[p].dev->bdev, buf),
+ +                    (unsigned long long) (stripe ? stripe->key : 0));
+ +              DMERR("further device error messages suppressed");
+ +      }
+ +
+ +      /* Prohibit further writes to allow for userspace to update metadata. */
+ +      SetRSProhibitWrites(rs);
+ +      schedule_work(&rs->io.ws_do_table_event);
+ +}
+ +
+ +/*
+ + * RAID set degrade check.
+ + *
+ + * Runs the device degrade check for every chunk of @stripe
+ + * which has its error flag set.
+ + */
+ +static void rs_check_degrade(struct stripe *stripe)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      unsigned p = rs->set.raid_devs;
+ +
+ +      while (p--) {
+ +              if (ChunkError(CHUNK(stripe, p)))
+ +                      rs_check_degrade_dev(rs, stripe, p);
+ +      }
+ +}
+ +
+ +/*
+ + * Lookup a RAID device by its block device number.
+ + *
+ + * Compares the bd_dev of @dev_lookup against the devices already
+ + * stored in @rs.
+ + *
+ + * Returns the index of the matching device or -ENODEV if there is none.
+ + */
+ +static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup)
+ +{
+ +      unsigned p;
+ +      struct raid_dev *dev;
+ +
+ +      /*
+ +       * Must be an incremental loop, because the device array
+ +       * can have empty slots still on calls from raid_ctr()
+ +       *
+ +       * The bounds check has to come before dev->dev is read;
+ +       * otherwise a completely filled array gets accessed one
+ +       * element past its end.
+ +       */
+ +      for (dev = rs->dev, p = 0;
+ +           p < rs->set.raid_devs && dev->dev;
+ +           dev++, p++) {
+ +              if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev)
+ +                      return p;
+ +      }
+ +
+ +      return -ENODEV;
+ +}
+ +/*
+ + * End small helper functions.
+ + */
+ +
+ +/*
+ + * Stripe hash functions
+ + */
+ +/* Initialize/destroy stripe hash. */
+ +/*
+ + * Set up a stripe hash for @stripes stripes.
+ + *
+ + * Allocates roundup_pow_of_two(stripes / 2) list head buckets and
+ + * derives the mask, shift and multiplier used by hash_fn().
+ + *
+ + * Returns 0 on success or -ENOMEM if the buckets can't be allocated.
+ + */
+ +static int hash_init(struct stripe_hash *hash, unsigned stripes)
+ +{
+ +      unsigned buckets = roundup_pow_of_two(stripes >> 1);
+ +      static const unsigned hash_primes[] = {
+ +              /*
+ +               * Table of primes for hash_fn/table size optimization.
+ +               *
+ +               * NOTE(review): 27 is not a prime (29 intended?); kept
+ +               * as is in order not to change the hash distribution.
+ +               */
+ +              1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
+ +              1543, 3079, 6151, 12289, 24593, 49157, 98317,
+ +      };
+ +
+ +      /* Allocate stripe hash buckets. */
+ +      hash->hash = vmalloc(buckets * sizeof(*hash->hash));
+ +      if (!hash->hash)
+ +              return -ENOMEM;
+ +
+ +      hash->buckets = buckets;
+ +      hash->mask = buckets - 1;
+ +      hash->shift = ffs(buckets);
+ +
+ +      /*
+ +       * Clamp to the last table entry; '>=' because a shift equal
+ +       * to the table size is already one past the last valid index.
+ +       */
+ +      if (hash->shift >= ARRAY_SIZE(hash_primes))
+ +              hash->shift = ARRAY_SIZE(hash_primes) - 1;
+ +
+ +      BUG_ON(hash->shift < 2);
+ +      hash->prime = hash_primes[hash->shift];
+ +
+ +      /* Initialize buckets. */
+ +      while (buckets--)
+ +              INIT_LIST_HEAD(hash->hash + buckets);
+ +      return 0;
+ +}
+ +
+ +/* Free the buckets of a stripe hash; safe to call on an empty hash. */
+ +static void hash_exit(struct stripe_hash *hash)
+ +{
+ +      if (!hash->hash)
+ +              return;
+ +
+ +      vfree(hash->hash);
+ +      hash->hash = NULL;
+ +}
+ +
+ +/*
+ + * Map a stripe key to a bucket index: multiply by the multiplier
+ + * selected in hash_init(), shift right and mask to the bucket count.
+ + */
+ +static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
+ +{
+ +      return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
+ +}
+ +
+ +/* Return the bucket (list head) @key hashes to. */
+ +static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
+ +{
+ +      return hash->hash + hash_fn(hash, key);
+ +}
+ +
+ +/* Insert an entry into a hash (at the head of the bucket for its key). */
+ +static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe)
+ +{
+ +      list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
+ +}
+ +
+ +/*
+ + * Lookup an entry in the stripe hash.
+ + *
+ + * Returns the stripe with the given @key or NULL if it isn't hashed.
+ + */
+ +static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key)
+ +{
+ +      unsigned depth = 0;
+ +      struct stripe *s;
+ +      struct list_head *head = hash_bucket(&sc->hash, key);
+ +
+ +      list_for_each_entry(s, head, lists[LIST_HASH]) {
+ +              depth++;
+ +
+ +              if (s->key != key)
+ +                      continue;
+ +
+ +              /* REMOVEME: statistics (longest bucket walk so far). */
+ +              if (depth > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
+ +                      atomic_set(RS(sc)->stats + S_MAX_LOOKUP, depth);
+ +
+ +              return s;
+ +      }
+ +
+ +      return NULL;
+ +}
+ +
+ +/*
+ + * Resize the stripe cache hash on size changes.
+ + *
+ + * A resize is indicated when the number of stripes differs from the
+ + * number recorded at the last resize. A new hash is built first, so
+ + * the old one stays intact if the allocation fails.
+ + *
+ + * Returns 0 on success (or nothing to do), else hash_init()'s error.
+ + */
+ +static int sc_hash_resize(struct stripe_cache *sc)
+ +{
+ +      /* Resize indicated ? */
+ +      if (atomic_read(&sc->stripes) != atomic_read(&sc->stripes_last)) {
+ +              int r;
+ +              struct stripe_hash hash;
+ +
+ +              r = hash_init(&hash, atomic_read(&sc->stripes));
+ +              if (r)
+ +                      return r;
+ +
+ +              if (sc->hash.hash) {
+ +                      unsigned b = sc->hash.buckets;
+ +                      struct list_head *pos, *tmp;
+ +
+ +                      /*
+ +                       * Walk old buckets and insert into new.
+ +                       *
+ +                       * Entries are re-linked without being unlinked
+ +                       * from the old bucket first; the _safe iterator
+ +                       * keeps the walk valid and the old buckets are
+ +                       * freed right below.
+ +                       */
+ +                      while (b--) {
+ +                              list_for_each_safe(pos, tmp, sc->hash.hash + b)
+ +                                  stripe_insert(&hash,
+ +                                                list_entry(pos, struct stripe,
+ +                                                           lists[LIST_HASH]));
+ +                      }
+ +
+ +              }
+ +
+ +              /* Replace the old hash by the new one and record the size. */
+ +              hash_exit(&sc->hash);
+ +              memcpy(&sc->hash, &hash, sizeof(sc->hash));
+ +              atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
+ +      }
+ +
+ +      return 0;
+ +}
+ +/* End hash stripe hash function. */
+ +
+ +/* List add, delete, push and pop functions. */
+ +/* Add stripe to flush list. */
+ +#define       DEL_LIST(lh) \
+ +      if (!list_empty(lh)) \
+ +              list_del_init(lh);
+ +
+ +/* Delete stripe from hash (no-op if it isn't hashed). */
+ +static void stripe_hash_del(struct stripe *stripe)
+ +{
+ +      DEL_LIST(stripe->lists + LIST_HASH);
+ +}
+ +
+ +/* Return stripe reference count (stripe->cnt). */
+ +static inline int stripe_ref(struct stripe *stripe)
+ +{
+ +      return atomic_read(&stripe->cnt);
+ +}
+ +
+ +/*
+ + * Add stripe to the tail of the flush list, unless it is being
+ + * reconstructed or is on the list already.
+ + */
+ +static void stripe_flush_add(struct stripe *stripe)
+ +{
+ +      struct list_head *entry = stripe->lists + LIST_FLUSH;
+ +
+ +      if (StripeReconstruct(stripe) || !list_empty(entry))
+ +              return;
+ +
+ +      list_add_tail(entry, stripe->sc->lists + LIST_FLUSH);
+ +}
+ +
+ +/*
+ + * Add stripe to LRU (inactive) list.
+ + *
+ + * Recovery stripes are never put on the LRU list; a stripe which is on
+ + * the list already stays where it is.
+ + *
+ + * Need lock, because of concurrent access from message interface.
+ + * NOTE(review): no lock is taken in this function -- presumably the
+ + * caller is expected to serialize; confirm.
+ + */
+ +static void stripe_lru_add(struct stripe *stripe)
+ +{
+ +      if (!StripeRecover(stripe)) {
+ +              struct list_head *lh = stripe->lists + LIST_LRU;
+ +
+ +              if (list_empty(lh))
+ +                      list_add_tail(lh, stripe->sc->lists + LIST_LRU);
+ +      }
+ +}
+ +
+ +/*
+ + * Pop the first stripe off the stripe cache list with index 'list'.
+ + *
+ + * Relies on two variables of the expanding function: 'sc' (the stripe
+ + * cache) and 'stripe', which receives the popped stripe or NULL if the
+ + * list is empty.
+ + *
+ + * NOTE(review): the expansion ends in ';' after 'while (0)', so the
+ + * macro can't be the only statement of an unbraced if with an else;
+ + * dropping that ';' requires every call site to supply its own.
+ + */
+ +#define POP_LIST(list) \
+ +      do { \
+ +              if (list_empty(sc->lists + (list))) \
+ +                      stripe = NULL; \
+ +              else { \
+ +                      stripe = list_first_entry(sc->lists + (list), \
+ +                                                struct stripe, \
+ +                                                lists[(list)]); \
+ +                      list_del_init(stripe->lists + (list)); \
+ +              } \
+ +      } while (0);
+ +
+ +/* Pop an available stripe off the LRU list; NULL if the list is empty. */
+ +static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
+ +{
+ +      struct stripe *stripe;
+ +
+ +      POP_LIST(LIST_LRU);
+ +      return stripe;
+ +}
+ +
+ +/*
+ + * Pop an available stripe off the io list, i.e. the list
+ + * indexed by LIST_FLUSH; NULL if that list is empty.
+ + */
+ +static struct stripe *stripe_io_pop(struct stripe_cache *sc)
+ +{
+ +      struct stripe *stripe;
+ +
+ +      POP_LIST(LIST_FLUSH);
+ +      return stripe;
+ +}
+ +
+ +/*
+ + * Push a stripe safely onto the endio list to be handled by do_endios().
+ + *
+ + * The list is protected by the LOCK_ENDIO spinlock, taken with the
+ + * irqsave variant here. A stripe which is on the list already isn't
+ + * added twice. The worker is woken in either case.
+ + */
+ +static void stripe_endio_push(struct stripe *stripe)
+ +{
+ +      unsigned long flags;
+ +      struct stripe_cache *sc = stripe->sc;
+ +      struct list_head *stripe_list = stripe->lists + LIST_ENDIO,
+ +                       *sc_list = sc->lists + LIST_ENDIO;
+ +      spinlock_t *lock = sc->locks + LOCK_ENDIO;
+ +
+ +      /* This runs in parallel with do_endios(). */
+ +      spin_lock_irqsave(lock, flags);
+ +      if (list_empty(stripe_list))
+ +              list_add_tail(stripe_list, sc_list);
+ +      spin_unlock_irqrestore(lock, flags);
+ +
+ +      wake_do_raid(RS(sc)); /* Wake myself. */
+ +}
+ +
+ +/*
+ + * Pop a stripe safely off the endio list.
+ + *
+ + * Takes the LOCK_ENDIO spinlock with interrupts disabled around the
+ + * list access. Returns the popped stripe or NULL if the list is empty.
+ + */
+ +static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
+ +{
+ +      struct stripe *stripe;
+ +      spinlock_t *lock = sc->locks + LOCK_ENDIO;
+ +
+ +      /* This runs in parallel with endio(). */
+ +      spin_lock_irq(lock);
+ +      /* Terminated like the other POP_LIST() users; don't rely on the macro. */
+ +      POP_LIST(LIST_ENDIO);
+ +      spin_unlock_irq(lock);
+ +      return stripe;
+ +}
+ +#undef POP_LIST
+ +
+ +/*
+ + * Stripe cache locking functions
+ + */
+ +/*
+ + * Dummy lock function for single host RAID4+5.
+ + *
+ + * Always succeeds: returns a non-NULL dummy handle (its own address),
+ + * which stripe_lock() treats as success.
+ + */
+ +static void *no_lock(sector_t key, enum dm_lock_type type)
+ +{
+ +      return &no_lock;
+ +}
+ +
+ +/* Dummy unlock function for single host RAID4+5; nothing to release. */
+ +static void no_unlock(void *lock_handle)
+ +{
+ +}
+ +
+ +/* No locking (for single host RAID 4+5): wires up the dummy functions. */
+ +static struct dm_raid45_locking_type locking_none = {
+ +      .lock = no_lock,
+ +      .unlock = no_unlock,
+ +};
+ +
+ +/*
+ + * Lock a stripe (for clustering).
+ + *
+ + * Requests a shared lock for READ and an exclusive one otherwise via
+ + * the RAID set's locking type and stores the handle in stripe->lock.
+ + *
+ + * Returns 0 on success or -EPERM if no lock handle was returned.
+ + */
+ +static int
+ +stripe_lock(struct stripe *stripe, int rw, sector_t key)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +
+ +      stripe->lock = rs->locking->lock(key, rw == READ ? DM_RAID45_SHARED :
+ +                                                         DM_RAID45_EX);
+ +      if (!stripe->lock)
+ +              return -EPERM;
+ +
+ +      return 0;
+ +}
+ +
+ +/* Unlock a stripe (for clustering) and forget its lock handle. */
+ +static void stripe_unlock(struct stripe *stripe)
+ +{
+ +      RS(stripe->sc)->locking->unlock(stripe->lock);
+ +      stripe->lock = NULL;
+ +}
+ +
+ +/* Test io pending on stripe: returns the pending io count. */
+ +static int stripe_io_ref(struct stripe *stripe)
+ +{
+ +      return atomic_read(&stripe->io.pending);
+ +}
+ +
+ +/*
+ + * Take an io reference on a stripe.
+ + *
+ + * The first pending io turns the stripe into an active one.
+ + */
+ +static void stripe_io_get(struct stripe *stripe)
+ +{
+ +      if (atomic_inc_return(&stripe->io.pending) == 1)
+ +              /*
+ +               * REMOVEME: statistics
+ +               * NOTE(review): sc_busy() evaluates this counter as
+ +               * well, so it isn't statistics only.
+ +               */
+ +              atomic_inc(&stripe->sc->active_stripes);
+ +      else
+ +              BUG_ON(stripe_io_ref(stripe) < 0);
+ +}
+ +
+ +/*
+ + * Drop an io reference on a stripe.
+ + *
+ + * When the last pending io is gone, a regular stripe is pushed onto
+ + * the endio list (which wakes the worker) while for a recovery stripe
+ + * only the worker is woken; the stripe no longer counts as active.
+ + */
+ +static void stripe_io_put(struct stripe *stripe)
+ +{
+ +      if (atomic_dec_and_test(&stripe->io.pending)) {
+ +              if (unlikely(StripeRecover(stripe)))
+ +                      /* Don't put recovery stripe on endio list. */
+ +                      wake_do_raid(RS(stripe->sc));
+ +              else
+ +                      /* Add regular stripe to endio list and wake daemon. */
+ +                      stripe_endio_push(stripe);
+ +
+ +              /* REMOVEME: statistics */
+ +              atomic_dec(&stripe->sc->active_stripes);
+ +      } else
+ +              BUG_ON(stripe_io_ref(stripe) < 0);
+ +}
+ +
+ +/* Take stripe reference out. */
+ +static int stripe_get(struct stripe *stripe)
+ +{
+ +      int r;
+ +      struct list_head *lh = stripe->lists + LIST_LRU;
+ +
+ +      /* Delete stripe from LRU (inactive) list if on. */
+ +      DEL_LIST(lh);
+ +      BUG_ON(stripe_ref(stripe) < 0);
+ +
+ +      /* Lock stripe on first reference */
+ +      r = (atomic_inc_return(&stripe->cnt) == 1) ?
+ +          stripe_lock(stripe, WRITE, stripe->key) : 0;
+ +
+ +      return r;
+ +}
+ +#undef DEL_LIST
+ +
+ +/* Return references on a chunk (chunk->cnt). */
+ +static int chunk_ref(struct stripe_chunk *chunk)
+ +{
+ +      return atomic_read(&chunk->cnt);
+ +}
+ +
+ +/* Take out reference on a chunk; returns the new reference count. */
+ +static int chunk_get(struct stripe_chunk *chunk)
+ +{
+ +      return atomic_inc_return(&chunk->cnt);
+ +}
+ +
+ +/* Drop reference on a chunk; a count falling below zero is a bug. */
+ +static void chunk_put(struct stripe_chunk *chunk)
+ +{
+ +      BUG_ON(atomic_dec_return(&chunk->cnt) < 0);
+ +}
+ +
+ +/*
+ + * Drop reference on a stripe.
+ + *
+ + * Unlocks the stripe when the last reference is dropped; no io may be
+ + * pending on it any more at that point.
+ + *
+ + * NOTE(review): this comment used to say the stripe is moved to the
+ + * list of LRU stripes if zero. That doesn't happen in this function;
+ + * presumably callers add it via stripe_lru_add() -- confirm.
+ + */
+ +static void stripe_put(struct stripe *stripe)
+ +{
+ +      if (atomic_dec_and_test(&stripe->cnt)) {
+ +              BUG_ON(stripe_io_ref(stripe));
+ +              stripe_unlock(stripe);
+ +      } else
+ +              BUG_ON(stripe_ref(stripe) < 0);
+ +}
+ +
+ +/*
+ + * Helper needed by for_each_io_dev().
+ + *
+ + * Takes the three references held per chunk io; @p is unused and only
+ + * present to match the callback signature.
+ + */
+ +static void stripe_get_references(struct stripe *stripe, unsigned p)
+ +{
+ +
+ +      /*
+ +       * Another one to reference the stripe in
+ +       * order to protect vs. LRU list moves.
+ +       */
+ +      io_get(RS(stripe->sc)); /* Global io references. */
+ +      /* NOTE(review): a locking error returned by stripe_get() is ignored. */
+ +      stripe_get(stripe);
+ +      stripe_io_get(stripe);  /* One for each chunk io. */
+ +}
+ +
+ +/*
+ + * Helper for endio() to put all taken references, i.e. the
+ + * counterpart of stripe_get_references() in reverse order.
+ + */
+ +static void stripe_put_references(struct stripe *stripe)
+ +{
+ +      stripe_io_put(stripe);  /* One for each chunk io. */
+ +      stripe_put(stripe);
+ +      io_put(RS(stripe->sc));
+ +}
+ +
+ +/*
+ + * Stripe cache functions.
+ + */
+ +/*
+ + * Invalidate a chunk (i.e. its pages) of a stripe by
+ + * clearing all of the chunk's io flags.
+ + *
+ + * I only keep state for the whole chunk.
+ + */
+ +static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk)
+ +{
+ +      chunk->io.flags = 0;
+ +}
+ +
+ +/* Invalidate all chunks of a stripe, one per RAID device. */
+ +static void
+ +stripe_chunks_invalidate(struct stripe *stripe)
+ +{
+ +      unsigned dev, devs = RS(stripe->sc)->set.raid_devs;
+ +
+ +      for (dev = 0; dev < devs; dev++)
+ +              stripe_chunk_invalidate(CHUNK(stripe, dev));
+ +}
+ +
+ +/*
+ + * Prepare stripe for (re)use: clear the stripe's io flags, reset the
+ + * parity and recover indexes to -1 (none) and invalidate all chunks.
+ + */
+ +static void stripe_invalidate(struct stripe *stripe)
+ +{
+ +      stripe->io.flags = 0;
+ +      stripe->idx.parity = stripe->idx.recover = -1;
+ +      stripe_chunks_invalidate(stripe);
+ +}
+ +
+ +/*
+ + * Allow io on all chunks of a stripe.
+ + * If not set, IO will not occur; i.e. it's prohibited.
+ + *
+ + * Actual IO submission for allowed chunks depends
+ + * on their !uptodate or dirty state.
+ + */
+ +static void stripe_allow_io(struct stripe *stripe)
+ +{
+ +      unsigned dev, devs = RS(stripe->sc)->set.raid_devs;
+ +
+ +      for (dev = 0; dev < devs; dev++)
+ +              SetChunkIo(CHUNK(stripe, dev));
+ +}
+ +
+ +/*
+ + * Initialize a stripe.
+ + *
+ + * Resets the reference counts, bio lists and list heads of the stripe
+ + * and its chunks, links it to stripe cache @sc, takes over the set's
+ + * io size and leaves the stripe invalidated.
+ + */
+ +static void stripe_init(struct stripe_cache *sc, struct stripe *stripe)
+ +{
+ +      unsigned i, p = RS(sc)->set.raid_devs;
+ +
+ +      /* Work all io chunks. */
+ +      while (p--) {
+ +              struct stripe_chunk *chunk = CHUNK(stripe, p);
+ +
+ +              atomic_set(&chunk->cnt, 0);
+ +              chunk->stripe = stripe;
+ +              i = ARRAY_SIZE(chunk->bl);
+ +              while (i--)
+ +                      bio_list_init(chunk->bl + i);
+ +      }
+ +
+ +      /* Has to be set before stripe_invalidate(), which looks up the set. */
+ +      stripe->sc = sc;
+ +
+ +      i = ARRAY_SIZE(stripe->lists);
+ +      while (i--)
+ +              INIT_LIST_HEAD(stripe->lists + i);
+ +
+ +      stripe->io.size = RS(sc)->set.io_size;
+ +      atomic_set(&stripe->cnt, 0);
+ +      atomic_set(&stripe->io.pending, 0);
+ +      stripe_invalidate(stripe);
+ +}
+ +
+ +/* Number of pages per chunk: @sectors / SECTORS_PER_PAGE, rounded up. */
+ +static inline unsigned chunk_pages(unsigned sectors)
+ +{
+ +      return dm_div_up(sectors, SECTORS_PER_PAGE);
+ +}
+ +
+ +/* Number of pages per stripe: pages per chunk times # of RAID devices. */
+ +static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
+ +{
+ +      return chunk_pages(io_size) * rs->set.raid_devs;
+ +}
+ +
+ +/*
+ + * Initialize part of page_list (recovery).
+ + *
+ + * Zeroes the pages of chunk @p which cover @count sectors beginning at
+ + * sector offset @start. Always clears whole pages and stops early if
+ + * the page list ends.
+ + */
+ +static void stripe_zero_pl_part(struct stripe *stripe, int p,
+ +                              unsigned start, unsigned count)
+ +{
+ +      unsigned first = start / SECTORS_PER_PAGE;
+ +      unsigned todo = chunk_pages(count);
+ +      /* Get offset into the page_list. */
+ +      struct page_list *pl = pl_elem(PL(stripe, p), first);
+ +
+ +      BUG_ON(!pl);
+ +      for (; pl && todo; pl = pl->next, todo--) {
+ +              BUG_ON(!pl->page);
+ +              memset(page_address(pl->page), 0, PAGE_SIZE);
+ +      }
+ +}
+ +
+ +/*
+ + * Initialize parity chunk of stripe: zero the whole chunk @p.
+ + * A negative index (no such chunk) is silently ignored.
+ + */
+ +static void stripe_zero_chunk(struct stripe *stripe, int p)
+ +{
+ +      if (p > -1)
+ +              stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
+ +}
+ +
+ +/*
+ + * Return dynamic stripe structure size: the stripe
+ + * header plus one stripe_chunk per RAID device.
+ + */
+ +static size_t stripe_size(struct raid_set *rs)
+ +{
+ +      return sizeof(struct stripe) +
+ +                    rs->set.raid_devs * sizeof(struct stripe_chunk);
+ +}
+ +
+ +/* Allocate a stripe and its memory object. */
+ +/* XXX adjust to cope with stripe cache and recovery stripe caches. */
+ +enum grow { SC_GROW, SC_KEEP };
+ +
+ +/*
+ + * With SC_GROW the dm-mem-cache @mc is grown by one object before the
+ + * stripe's memory object is allocated from it (and shrunk again should
+ + * that allocation fail); with SC_KEEP its size is left alone.
+ + *
+ + * Returns the initialized stripe or NULL on any allocation failure.
+ + */
+ +static struct stripe *stripe_alloc(struct stripe_cache *sc,
+ +                                 struct dm_mem_cache_client *mc,
+ +                                 enum grow grow)
+ +{
+ +      struct stripe *stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
+ +
+ +      if (!stripe)
+ +              return NULL;
+ +
+ +      /* Grow the dm-mem-cache by one object. */
+ +      if (grow == SC_GROW && dm_mem_cache_grow(mc, 1))
+ +              goto err_free;
+ +
+ +      stripe->obj = dm_mem_cache_alloc(mc);
+ +      if (IS_ERR(stripe->obj))
+ +              goto err_shrink;
+ +
+ +      stripe_init(sc, stripe);
+ +      return stripe;
+ +
+ +err_shrink:
+ +      if (grow == SC_GROW)
+ +              dm_mem_cache_shrink(mc, 1);
+ +err_free:
+ +      kmem_cache_free(sc->kc.cache, stripe);
+ +      return NULL;
+ +}
+ +
+ +/*
+ + * Free a stripes memory object, shrink the
+ + * memory cache and free the stripe itself.
+ + *
+ + * The cache is always shrunk by one object here, no matter
+ + * whether the stripe had been allocated with SC_GROW or SC_KEEP.
+ + */
+ +static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
+ +{
+ +      dm_mem_cache_free(mc, stripe->obj);
+ +      dm_mem_cache_shrink(mc, 1);
+ +      kmem_cache_free(stripe->sc->kc.cache, stripe);
+ +}
+ +
+ +/*
+ + * Free the recovery stripe(s).
+ + *
+ + * Releases all stripes on the recovery list together with their
+ + * recover context, then destroys the recovery dm-mem-cache and dm-io
+ + * clients. Does nothing if there is no recovery mem cache client.
+ + */
+ +static void stripe_recover_free(struct raid_set *rs)
+ +{
+ +      struct recover *rec = &rs->recover;
+ +      struct dm_mem_cache_client *mc = rec->mem_cache_client;
+ +
+ +      rec->mem_cache_client = NULL;
+ +      if (!mc)
+ +              return;
+ +
+ +      while (!list_empty(&rec->stripes)) {
+ +              struct stripe *stripe =
+ +                      list_first_entry(&rec->stripes, struct stripe,
+ +                                       lists[LIST_RECOVER]);
+ +
+ +              list_del(stripe->lists + LIST_RECOVER);
+ +              kfree(stripe->recover);
+ +              stripe_free(stripe, mc);
+ +      }
+ +
+ +      dm_mem_cache_client_destroy(mc);
+ +      dm_io_client_destroy(rec->dm_io_client);
+ +      rec->dm_io_client = NULL;
+ +}
+ +
+ +/*
+ + * Grow stripe cache by @stripes stripes.
+ + *
+ + * Each new stripe goes onto the LRU list. Stripes allocated before a
+ + * failure are kept.
+ + *
+ + * Returns -ENOMEM if a stripe can't be allocated (the hash is not
+ + * resized then), else the result of resizing the stripe hash.
+ + */
+ +static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
+ +{
+ +      /* Try to allocate this many (additional) stripes. */
+ +      while (stripes--) {
+ +              struct stripe *stripe =
+ +                      stripe_alloc(sc, sc->mem_cache_client, grow);
+ +
+ +              if (unlikely(!stripe))
+ +                      return -ENOMEM;
+ +
+ +              stripe_lru_add(stripe);
+ +              atomic_inc(&sc->stripes);
+ +      }
+ +
+ +      return sc_hash_resize(sc);
+ +}
+ +
+ +/*
+ + * Shrink stripe cache by up to @stripes stripes.
+ + *
+ + * Only unused stripes, i.e. those on the LRU list, can be freed.
+ + *
+ + * Returns -ENOENT if the LRU list ran empty before @stripes stripes
+ + * were freed (stripes freed so far stay freed and the hash isn't
+ + * resized), else the result of the hash resize or 0 if no stripes
+ + * are left.
+ + */
+ +static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
+ +{
+ +      int r = 0;
+ +
+ +      /* Try to get unused stripe from LRU list. */
+ +      while (stripes--) {
+ +              struct stripe *stripe;
+ +
+ +              stripe = stripe_lru_pop(sc);
+ +              if (stripe) {
+ +                      /* An LRU stripe may never have ios pending! */
+ +                      BUG_ON(stripe_io_ref(stripe));
+ +                      BUG_ON(stripe_ref(stripe));
+ +                      atomic_dec(&sc->stripes);
+ +                      /* Remove from hash if on before deletion. */
+ +                      stripe_hash_del(stripe);
+ +                      stripe_free(stripe, sc->mem_cache_client);
+ +              } else {
+ +                      r = -ENOENT;
+ +                      break;
+ +              }
+ +      }
+ +
+ +      /* Check if stats are still sane. */
+ +      if (atomic_read(&sc->active_stripes_max) >
+ +          atomic_read(&sc->stripes))
+ +              atomic_set(&sc->active_stripes_max, 0);
+ +
+ +      if (r)
+ +              return r;
+ +
+ +      /* No hash resize for an empty cache. */
+ +      return atomic_read(&sc->stripes) ? sc_hash_resize(sc) : 0;
+ +}
+ +
+ +/*
+ + * Create stripe cache and recovery.
+ + *
+ + * Initializes the lists, locks and counters of the stripe cache, creates
+ + * the kmem cache, the memory cache clients and the dm-io clients for both
+ + * regular and recovery stripes, allocates the recovery stripes and finally
+ + * grows the cache to @stripes stripes.
+ + *
+ + * Returns 0 on success or a negative errno.  Nothing allocated before a
+ + * failure is released in here.
+ + * NOTE(review): presumably the caller runs sc_exit() on error - confirm.
+ + */
+ +static int sc_init(struct raid_set *rs, unsigned stripes)
+ +{
+ +      int r;  /* Signed: carries the negative errno from sc_grow(). */
+ +      unsigned i, rstripes;
+ +      struct stripe_cache *sc = &rs->sc;
+ +      struct stripe *stripe;
+ +      struct recover *rec = &rs->recover;
+ +      struct mapped_device *md;
+ +      struct gendisk *disk;
+ +
+ +      /* Initialize lists and locks. */
+ +      i = ARRAY_SIZE(sc->lists);
+ +      while (i--)
+ +              INIT_LIST_HEAD(sc->lists + i);
+ +
+ +      INIT_LIST_HEAD(&rec->stripes);
+ +
+ +      /* Initialize endio and LRU list locks. */
+ +      i = NR_LOCKS;
+ +      while (i--)
+ +              spin_lock_init(sc->locks + i);
+ +
+ +      /* Initialize atomic variables. */
+ +      atomic_set(&sc->stripes, 0);
+ +      atomic_set(&sc->stripes_to_set, 0);
+ +      atomic_set(&sc->active_stripes, 0);
+ +      atomic_set(&sc->active_stripes_max, 0); /* REMOVEME: statistics. */
+ +
+ +      /*
+ +       * We need a runtime unique # to suffix the kmem cache name
+ +       * because we'll have one for each active RAID set.
+ +       */
+ +      md = dm_table_get_md(rs->ti->table);
+ +      disk = dm_disk(md);
+ +      snprintf(sc->kc.name, sizeof(sc->kc.name), "%s-%d.%d", TARGET,
+ +               disk->first_minor, atomic_inc_return(&_stripe_sc_nr));
+ +      sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
+ +                                       0, 0, NULL);
+ +      if (!sc->kc.cache)
+ +              return -ENOMEM;
+ +
+ +      /* Create memory cache client context for RAID stripe cache. */
+ +      sc->mem_cache_client =
+ +              dm_mem_cache_client_create(stripes, rs->set.raid_devs,
+ +                                         chunk_pages(rs->set.io_size));
+ +      if (IS_ERR(sc->mem_cache_client))
+ +              return PTR_ERR(sc->mem_cache_client);
+ +
+ +      /* Create memory cache client context for RAID recovery stripe(s). */
+ +      rstripes = rec->recovery_stripes;
+ +      rec->mem_cache_client =
+ +              dm_mem_cache_client_create(rstripes, rs->set.raid_devs,
+ +                                         chunk_pages(rec->io_size));
+ +      if (IS_ERR(rec->mem_cache_client))
+ +              return PTR_ERR(rec->mem_cache_client);
+ +
+ +      /* Create dm-io client context for IO stripes. */
-       sc->dm_io_client = dm_io_client_create();
++      sc->dm_io_client =
++              dm_io_client_create((stripes > 32 ? 32 : stripes) *
++                                  rs->set.raid_devs *
++                                  chunk_pages(rs->set.io_size));
+ +      if (IS_ERR(sc->dm_io_client))
+ +              return PTR_ERR(sc->dm_io_client);
+ +
+ +      /* FIXME: intermingled with stripe cache initialization. */
+ +      /* Create dm-io client context for recovery stripes. */
-       rec->dm_io_client = dm_io_client_create();
++      rec->dm_io_client =
++              dm_io_client_create(rstripes * rs->set.raid_devs *
++                                  chunk_pages(rec->io_size));
+ +      if (IS_ERR(rec->dm_io_client))
+ +              return PTR_ERR(rec->dm_io_client);
+ +
+ +      /* Allocate stripes for set recovery. */
+ +      while (rstripes--) {
+ +              stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
+ +              if (!stripe)
+ +                      return -ENOMEM;
+ +
+ +              stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL);
+ +              if (!stripe->recover) {
+ +                      stripe_free(stripe, rec->mem_cache_client);
+ +                      return -ENOMEM;
+ +              }
+ +
+ +              SetStripeRecover(stripe);
+ +              stripe->io.size = rec->io_size;
+ +              list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes);
+ +              /* Don't add recovery stripes to LRU list! */
+ +      }
+ +
+ +      /*
+ +       * Allocate the stripe objects from the
+ +       * cache and add them to the LRU list.
+ +       */
+ +      r = sc_grow(sc, stripes, SC_KEEP);
+ +      if (!r)
+ +              atomic_set(&sc->stripes_last, stripes);
+ +
+ +      return r;
+ +}
+ +
+ +/*
+ + * Destroy the stripe cache.
+ + *
+ + * Frees the recovery stripes, shrinks the cache to zero stripes (BUGs if
+ + * that fails), destroys the kmem cache and both clients and finally tears
+ + * down the hash.  Does nothing when the kmem cache was never created.
+ + */
+ +static void sc_exit(struct stripe_cache *sc)
+ +{
+ +      struct raid_set *rs = RS(sc);
+ +
+ +      if (!sc->kc.cache)
+ +              return;
+ +
+ +      stripe_recover_free(rs);
+ +      BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
+ +      kmem_cache_destroy(sc->kc.cache);
+ +      sc->kc.cache = NULL;
+ +
+ +      /* Either client pointer may be unset or hold an error pointer. */
+ +      if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client))
+ +              dm_mem_cache_client_destroy(sc->mem_cache_client);
+ +
+ +      if (sc->dm_io_client && !IS_ERR(sc->dm_io_client))
+ +              dm_io_client_destroy(sc->dm_io_client);
+ +
+ +      hash_exit(&sc->hash);
+ +}
+ +
+ +/*
+ + * Calculate RAID address
+ + *
+ + * Delivers tuple with the index of the data disk holding the chunk
+ + * in the set, the parity disks index and the start of the stripe
+ + * within the address space of the set (used as the stripe cache hash key).
+ + *
+ + * The results are stored in @addr (di, pi and key), which is also returned.
+ + */
+ +/* thx MD. */
+ +static struct raid_address *raid_address(struct raid_set *rs, sector_t sector,
+ +                                       struct raid_address *addr)
+ +{
+ +      sector_t stripe, tmp;
+ +
+ +      /*
+ +       * chunk_number = sector / chunk_size
+ +       * stripe_number = chunk_number / data_devs
+ +       * di = chunk_number % data_devs
+ +       *
+ +       * sector_div() leaves the quotient (the stripe number) in
+ +       * 'stripe' and returns the remainder.
+ +       */
+ +      stripe = sector >> rs->set.chunk_shift;
+ +      addr->di = sector_div(stripe, rs->set.data_devs);
+ +
+ +      switch (rs->set.raid_type->level) {
+ +      case raid4:
+ +              addr->pi = rs->set.pi;
+ +              goto check_shift_di; /* Shares the di adjustment below. */
+ +      case raid5:
+ +              tmp = stripe;
+ +              addr->pi = sector_div(tmp, rs->set.raid_devs);
+ +
+ +              switch (rs->set.raid_type->algorithm) {
+ +              case left_asym:         /* Left asymmetric. */
+ +                      addr->pi = rs->set.data_devs - addr->pi; /* Fall through. */
+ +              case right_asym:        /* Right asymmetric. */
+ +check_shift_di:
+ +                      if (addr->di >= addr->pi)
+ +                              addr->di++;
+ +                      break;
+ +              case left_sym:          /* Left symmetric. */
+ +                      addr->pi = rs->set.data_devs - addr->pi; /* Fall through. */
+ +              case right_sym:         /* Right symmetric. */
+ +                      addr->di = (addr->pi + addr->di + 1) %
+ +                                 rs->set.raid_devs;
+ +                      break;
+ +              case none: /* Can't happen: RAID4 algorithm placeholder. */
+ +                      BUG();
+ +              }
+ +      }
+ +
+ +      /*
+ +       * Start offset of the stripes chunk on any single device of the RAID
+ +       * set, adjusted in case io size differs from chunk size.
+ +       */
+ +      addr->key = (stripe << rs->set.chunk_shift) +
+ +                  (sector & rs->set.io_inv_mask);
+ +      return addr;
+ +}
+ +
+ +/*
+ + * Copy data across between stripe pages and bio vectors.
+ + *
+ + * Pay attention to data alignment in stripe and bio pages.
+ + *
+ + * @rw == READ copies from the stripe pages into the bio segments,
+ + * any other value copies from the bio segments into the stripe pages.
+ + * @pl is the head of the chunk's page list; the start page and the
+ + * offset into it are derived from bio->bi_sector.
+ + */
+ +static void bio_copy_page_list(int rw, struct stripe *stripe,
+ +                             struct page_list *pl, struct bio *bio)
+ +{
+ +      unsigned i, page_offset;
+ +      void *page_addr;
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      struct bio_vec *bv;
+ +
+ +      /* Get start page in page list for this sector. */
+ +      i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
+ +      pl = pl_elem(pl, i);
+ +      BUG_ON(!pl);
+ +      BUG_ON(!pl->page);
+ +
+ +      page_addr = page_address(pl->page);
+ +      page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
+ +
+ +      /* Walk all segments and copy data across between bio_vecs and pages. */
+ +      bio_for_each_segment(bv, bio, i) {
+ +              int len = bv->bv_len, size;
+ +              unsigned bio_offset = 0;
+ +              void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
+ +redo:
+ +              /* Copy at most up to the end of the current stripe page. */
+ +              size = (page_offset + len > PAGE_SIZE) ?
+ +                     PAGE_SIZE - page_offset : len;
+ +
+ +              if (rw == READ)
+ +                      memcpy(bio_addr + bio_offset,
+ +                             page_addr + page_offset, size);
+ +              else
+ +                      memcpy(page_addr + page_offset,
+ +                             bio_addr + bio_offset, size);
+ +
+ +              page_offset += size;
+ +              if (page_offset == PAGE_SIZE) {
+ +                      /*
+ +                       * We reached the end of the chunk page ->
+ +                       * need to refer to the next one to copy more data.
+ +                       */
+ +                      len -= size;
+ +                      if (len) {
+ +                              /* Get next page. */
+ +                              pl = pl->next;
+ +                              BUG_ON(!pl);
+ +                              BUG_ON(!pl->page);
+ +                              page_addr = page_address(pl->page);
+ +                              page_offset = 0;
+ +                              bio_offset += size;
+ +                              /* REMOVEME: statistics. */
+ +                              atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
+ +                              goto redo; /* Copy the rest of this segment. */
+ +                      }
+ +              }
+ +
+ +              __bio_kunmap_atomic(bio_addr, KM_USER0);
+ +      }
+ +}
+ +
+ +/*
+ + * Xor optimization macros.
+ + *
+ + * They generate unrolled xor functions _xor<chunks>_<xors_per_run>()
+ + * for 2 - 8 chunks and 8/16/32/64 xors per loop run, plus dispatchers
+ + * xor_8() - xor_64() selecting the function by number of chunks.
+ + */
+ +/* Xor data pointer declaration and initialization macros. */
+ +#define DECLARE_2     unsigned long *d0 = data[0], *d1 = data[1]
+ +#define DECLARE_3     DECLARE_2, *d2 = data[2]
+ +#define DECLARE_4     DECLARE_3, *d3 = data[3]
+ +#define DECLARE_5     DECLARE_4, *d4 = data[4]
+ +#define DECLARE_6     DECLARE_5, *d5 = data[5]
+ +#define DECLARE_7     DECLARE_6, *d6 = data[6]
+ +#define DECLARE_8     DECLARE_7, *d7 = data[7]
+ +
+ +/* Xor unroll macros: xor word n of all chunks into chunk 0 (d0). */
+ +#define D2(n) d0[n] = d0[n] ^ d1[n]
+ +#define D3(n) D2(n) ^ d2[n]
+ +#define D4(n) D3(n) ^ d3[n]
+ +#define D5(n) D4(n) ^ d4[n]
+ +#define D6(n) D5(n) ^ d5[n]
+ +#define D7(n) D6(n) ^ d6[n]
+ +#define D8(n) D7(n) ^ d7[n]
+ +
+ +#define       X_2(macro, offset)      macro(offset); macro(offset + 1);
+ +#define       X_4(macro, offset)      X_2(macro, offset); X_2(macro, offset + 2);
+ +#define       X_8(macro, offset)      X_4(macro, offset); X_4(macro, offset + 4);
+ +#define       X_16(macro, offset)     X_8(macro, offset); X_8(macro, offset + 8);
+ +#define       X_32(macro, offset)     X_16(macro, offset); X_16(macro, offset + 16);
+ +#define       X_64(macro, offset)     X_32(macro, offset); X_32(macro, offset + 32);
+ +
+ +/* Define a _xor<chunks>_<xors_per_run>() function. */
+ +#define       _XOR(chunks, xors_per_run) \
+ +static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
+ +{ \
+ +      unsigned end = XOR_SIZE / sizeof(data[0]), i; \
+ +      DECLARE_ ## chunks; \
+ +\
+ +      for (i = 0; i < end; i += xors_per_run) { \
+ +              X_ ## xors_per_run(D ## chunks, i); \
+ +      } \
+ +}
+ +
+ +/* Define xor functions for 2 - 8 chunks and xors per run. */
+ +#define       MAKE_XOR_PER_RUN(xors_per_run) \
+ +      _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
+ +      _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
+ +      _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
+ +      _XOR(8, xors_per_run);
+ +
+ +MAKE_XOR_PER_RUN(8)   /* Define _xor*_8() functions. */
+ +MAKE_XOR_PER_RUN(16)  /* Define _xor*_16() functions. */
+ +MAKE_XOR_PER_RUN(32)  /* Define _xor*_32() functions. */
+ +MAKE_XOR_PER_RUN(64)  /* Define _xor*_64() functions. */
+ +
+ +#define MAKE_XOR(xors_per_run) \
+ +struct { \
+ +      void (*f)(unsigned long **); \
+ +} static xor_funcs ## xors_per_run[] = { \
+ +      { NULL }, /* NULL pointers to optimize indexing in xor(). */ \
+ +      { NULL }, \
+ +      { _xor2_ ## xors_per_run }, \
+ +      { _xor3_ ## xors_per_run }, \
+ +      { _xor4_ ## xors_per_run }, \
+ +      { _xor5_ ## xors_per_run }, \
+ +      { _xor6_ ## xors_per_run }, \
+ +      { _xor7_ ## xors_per_run }, \
+ +      { _xor8_ ## xors_per_run }, \
+ +}; \
+ +\
+ +static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
+ +{ \
+ +      /* Call respective function for amount of chunks. */ \
+ +      xor_funcs ## xors_per_run[n].f(data); \
+ +}
+ +
+ +/* Define xor_8() - xor_64() functions. */
+ +MAKE_XOR(8)
+ +MAKE_XOR(16)
+ +MAKE_XOR(32)
+ +MAKE_XOR(64)
+ +/*
+ + * END xor optimization macros.
+ + */
+ +
+ +/*
+ + * Maximum number of chunks, which can be xor'ed in one go.
+ + *
+ + * Derived from the size of the xor_funcs8[] dispatch table, whose
+ + * highest index is the largest supported number of chunks.
+ + */
+ +#define       XOR_CHUNKS_MAX  (ARRAY_SIZE(xor_funcs8) - 1)
+ +
+ +/*
+ + * Adapter to use the xor_blocks() library function as an xor function:
+ + * data[0] is the destination, the remaining @n - 1 entries are sources.
+ + */
+ +static void xor_blocks_wrapper(unsigned n, unsigned long **data)
+ +{
+ +      void *dest;
+ +      void **srcs;
+ +
+ +      BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1);
+ +
+ +      dest = (void *) data[0];
+ +      srcs = (void **) data + 1;
+ +      xor_blocks(n - 1, XOR_SIZE, dest, srcs);
+ +}
+ +
+ +/* Table of the available xor functions together with their names. */
+ +static struct xor_func {
+ +      xor_function_t f;
+ +      const char *name;
+ +} xor_funcs[] = {
+ +      { .f = xor_64,             .name = "xor_64" },
+ +      { .f = xor_32,             .name = "xor_32" },
+ +      { .f = xor_16,             .name = "xor_16" },
+ +      { .f = xor_8,              .name = "xor_8" },
+ +      { .f = xor_blocks_wrapper, .name = "xor_blocks" },
+ +};
+ +
+ +/*
+ + * Tell whether a chunk takes part in an xor run.
+ + *
+ + * Only uptodate chunks qualify, and only if
+ + *
+ + * o writes are queued to the chunk
+ + * o writes have been merged into the chunk
+ + * o the stripe is being reconstructed
+ + * o the stripe is a recovery stripe
+ + *
+ + * Returns 1 if the chunk has to be xored, 0 otherwise.
+ + */
+ +static inline int chunk_must_xor(struct stripe_chunk *chunk)
+ +{
+ +      int queued, merged;
+ +
+ +      if (!ChunkUptodate(chunk))
+ +              return 0;
+ +
+ +      queued = !bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED));
+ +      merged = !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED));
+ +
+ +      /* Writes may be queued or merged, but never both at once. */
+ +      BUG_ON(queued && merged);
+ +
+ +      if (queued || merged)
+ +              return 1;
+ +
+ +      return StripeReconstruct(chunk->stripe) ||
+ +             StripeRecover(chunk->stripe);
+ +}
+ +
+ +/*
+ + * Xor chunk pages together (parity calculation/reconstruction).
+ + *
+ + * This indexes into the chunks of a stripe and their pages.
+ + *
+ + * All chunks will be xored into the indexed (@pi)
+ + * chunk in maximum groups of xor.chunks.
+ + *
+ + * @sector selects the page within each chunk's page list.
+ + */
+ +static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      unsigned max_chunks = rs->xor.chunks, n = 1,
+ +               o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */
+ +               p = rs->set.raid_devs;
+ +      unsigned long **d = rs->data;
+ +      xor_function_t xor_f = rs->xor.f->f;
+ +
+ +      BUG_ON(sector > stripe->io.size);
+ +
+ +      /* Address of the page to xor into (always slot 0 of d[]). */
+ +      d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
+ +
+ +      while (p--) {
+ +              /* Preset pointers to data pages. */
+ +              if (p != pi && chunk_must_xor(CHUNK(stripe, p)))
+ +                      d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
+ +
+ +              /* If max chunks -> xor and restart collecting at slot 1. */
+ +              if (n == max_chunks) {
+ +                      mutex_lock(&rs->io.xor_lock);
+ +                      xor_f(n, d);
+ +                      mutex_unlock(&rs->io.xor_lock);
+ +                      n = 1;
+ +              }
+ +      }
+ +
+ +      /* If any chunks are left over -> xor them as well. */
+ +      if (n > 1) {
+ +              mutex_lock(&rs->io.xor_lock);
+ +              xor_f(n, d);
+ +              mutex_unlock(&rs->io.xor_lock);
+ +      }
+ +}
+ +
+ +/*
+ + * Common xor loop through all stripe page lists.
+ + *
+ + * Xors @count sectors starting at sector offset @off of all qualifying
+ + * chunks into chunk @pi, one page at a time, then marks chunk @pi clean.
+ + *
+ + * The loop has to run up to @off + @count: bounding it by @count alone
+ + * skips the xor completely for any call with @off >= @count, which is
+ + * what parity_xor() issues for all but the first part of a stripe that
+ + * is larger than the chunk size.
+ + */
+ +static void common_xor(struct stripe *stripe, sector_t count,
+ +                     unsigned off, unsigned pi)
+ +{
+ +      unsigned sector, end = off + count;
+ +
+ +      BUG_ON(!count);
+ +      for (sector = off; sector < end; sector += SECTORS_PER_PAGE)
+ +              xor(stripe, pi, sector);
+ +
+ +      /* Set parity page uptodate and clean. */
+ +      chunk_set(CHUNK(stripe, pi), CLEAN);
+ +      atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
+ +}
+ +
+ +/*
+ + * Calculate parity sectors on intact stripes.
+ + *
+ + * Need to calculate raid address for recover stripe, because its
+ + * chunk sizes differs and is typically larger than io chunk size.
+ + *
+ + * The stripe is processed in parts of at most one chunk size; the
+ + * parity chunk is marked dirty after each part.
+ + */
+ +static void parity_xor(struct stripe *stripe)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      int size_differs = stripe->io.size != rs->set.io_size;
+ +      unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size,
+ +               xor_size = chunk_size > io_size ? io_size : chunk_size;
+ +      sector_t off;
+ +
+ +      /* This can be the recover stripe with a larger io size. */
+ +      for (off = 0; off < io_size; off += xor_size) {
+ +              /*
+ +               * Recover stripe is likely bigger than regular io
+ +               * ones and has no precalculated parity disk index ->
+ +               * need to calculate RAID address.
+ +               */
+ +              if (unlikely(size_differs)) {
+ +                      struct raid_address addr;
+ +
+ +                      raid_address(rs, (stripe->key + off) *
+ +                                       rs->set.data_devs, &addr);
+ +                      stripe->idx.parity = addr.pi;
+ +                      stripe_zero_pl_part(stripe, addr.pi, off, xor_size);
+ +              }
+ +
+ +              common_xor(stripe, xor_size, off, stripe->idx.parity);
+ +              chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY);
+ +      }
+ +}
+ +
+ +/*
+ + * Reconstruct the missing chunk (stripe->idx.recover) of a stripe by
+ + * zeroing it and xoring all other chunks into it.
+ + *
+ + * BUGs unless a chunk to recover is set and all others are uptodate.
+ + */
+ +static void stripe_reconstruct(struct stripe *stripe)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      int recover_idx = stripe->idx.recover, dev;
+ +
+ +      BUG_ON(recover_idx < 0);
+ +
+ +      /* Every chunk but the one being rebuilt has to be uptodate. */
+ +      for (dev = rs->set.raid_devs - 1; dev >= 0; dev--) {
+ +              if (dev == recover_idx)
+ +                      continue;
+ +
+ +              BUG_ON(!ChunkUptodate(CHUNK(stripe, dev)));
+ +      }
+ +
+ +      /* REMOVEME: statistics. */
+ +      if (RSDegraded(rs))
+ +              atomic_inc(rs->stats + S_RECONSTRUCT_EI);
+ +      else
+ +              atomic_inc(rs->stats + S_RECONSTRUCT_DEV);
+ +
+ +      /* Start from an all-zero chunk, then xor the others into it. */
+ +      stripe_zero_chunk(stripe, recover_idx);
+ +      common_xor(stripe, stripe->io.size, 0, recover_idx);
+ +}
+ +
+ +/*
+ + * Recovery io throttling
+ + */
+ +/*
+ + * Conditionally reset io counters.
+ + *
+ + * Resets both io counters and restarts the measuring interval once more
+ + * than HZ jiffies have passed since the last reset.  Returns 1 if the
+ + * counters got reset, 0 otherwise.
+ + *
+ + * The elapsed time is taken as an unsigned difference, which stays
+ + * correct across a jiffies wrap.  Comparing against last_jiffies + HZ
+ + * breaks when that sum itself wraps and triggers a premature reset.
+ + */
+ +static int recover_io_reset(struct raid_set *rs)
+ +{
+ +      unsigned long j = jiffies;
+ +
+ +      if (j - rs->recover.last_jiffies > HZ) {
+ +              atomic_set(rs->recover.io_count + IO_WORK, 0);
+ +              atomic_set(rs->recover.io_count + IO_RECOVER, 0);
+ +              rs->recover.last_jiffies = j;
+ +              return 1;
+ +      }
+ +
+ +      return 0;
+ +}
+ +
+ +/* Count an io as recovery io or as regular work io for this stripe. */
+ +static void recover_io_count(struct stripe *stripe)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      unsigned which = IO_WORK;
+ +
+ +      if (StripeRecover(stripe))
+ +              which = IO_RECOVER;
+ +
+ +      atomic_inc(rs->recover.io_count + which);
+ +}
+ +
+ +/*
+ + * Try getting a stripe either from the hash or from the LRU list.
+ + *
+ + * Returns the stripe for @addr with a reference taken via stripe_get(),
+ + * or NULL if no stripe is available or stripe_get() failed.
+ + */
+ +static struct stripe *stripe_find(struct raid_set *rs,
+ +                                struct raid_address *addr)
+ +{
+ +      int r;
+ +      struct stripe_cache *sc = &rs->sc;
+ +      struct stripe *stripe;
+ +
+ +      /* Try stripe from hash. */
+ +      stripe = stripe_lookup(sc, addr->key);
+ +      if (stripe) {
+ +              r = stripe_get(stripe);
+ +              if (r)
+ +                      goto get_lock_failed;
+ +
+ +              atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
+ +      } else {
+ +              /* Not in hash -> try to get an LRU stripe. */
+ +              stripe = stripe_lru_pop(sc);
+ +              if (stripe) {
+ +                      /*
+ +                       * An LRU stripe may not be referenced
+ +                       * and may never have ios pending!
+ +                       */
+ +                      BUG_ON(stripe_ref(stripe));
+ +                      BUG_ON(stripe_io_ref(stripe));
+ +
+ +                      /* Remove from hash if on before reuse. */
+ +                      stripe_hash_del(stripe);
+ +
+ +                      /* Invalidate before reinserting with changed key. */
+ +                      stripe_invalidate(stripe);
+ +
+ +                      /* Retarget the recycled stripe at the new address. */
+ +                      stripe->key = addr->key;
+ +                      stripe->region = dm_rh_sector_to_region(rs->recover.rh,
+ +                                                              addr->key);
+ +                      stripe->idx.parity = addr->pi;
+ +                      r = stripe_get(stripe);
+ +                      if (r)
+ +                              goto get_lock_failed;
+ +
+ +                      /* Insert stripe into the stripe hash. */
+ +                      stripe_insert(&sc->hash, stripe);
+ +                      /* REMOVEME: statistics. */
+ +                      atomic_inc(rs->stats + S_INSCACHE);
+ +              }
+ +      }
+ +
+ +      return stripe; /* NULL if the LRU list had no stripe either. */
+ +
+ +get_lock_failed:
+ +      stripe_put(stripe);
+ +      return NULL;
+ +}
+ +
+ +/*
+ + * Process end io
+ + *
+ + * I need to do it here because I can't in interrupt
+ + */
+ +/*
+ + * End io all bios on a bio list.
+ + *
+ + * Pops every bio off @bl and completes it with @error.  For writes the
+ + * pending count of the stripe's region is dropped; for successful reads
+ + * the data is copied from the page list of chunk @p into the bio first.
+ + * One chunk, stripe and io reference is put per bio.
+ + */
+ +static void bio_list_endio(struct stripe *stripe, struct bio_list *bl,
+ +                         int p, int error)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      struct bio *bio;
+ +      struct page_list *pl = PL(stripe, p);
+ +      struct stripe_chunk *chunk = CHUNK(stripe, p);
+ +
+ +      /* Update region counters. */
+ +      while ((bio = bio_list_pop(bl))) {
+ +              /*
+ +               * Sample the direction up front: the bio must not be
+ +               * touched any more once bio_endio() got called on it.
+ +               */
+ +              int write = (bio_data_dir(bio) == WRITE);
+ +
+ +              if (write)
+ +                      /* Drop io pending count for any writes. */
+ +                      dm_rh_dec(rs->recover.rh, stripe->region);
+ +              else if (!error)
+ +                      /* Copy data across. */
+ +                      bio_copy_page_list(READ, stripe, pl, bio);
+ +
+ +              bio_endio(bio, error);
+ +
+ +              /* REMOVEME: statistics. */
+ +              atomic_inc(rs->stats + (write ? S_BIOS_ENDIO_WRITE :
+ +                                              S_BIOS_ENDIO_READ));
+ +
+ +              chunk_put(chunk);
+ +              stripe_put(stripe);
+ +              io_put(rs);     /* Wake any suspend waiters on last bio. */
+ +      }
+ +}
+ +
+ +/*
+ + * End io all reads/writes on a stripe copying
+ + * read data across from stripe to bios and
+ + * decrementing region counters for writes.
+ + *
+ + * Processing of ios depending on state:
+ + * o no chunk error -> endio ok
+ + * o degraded:
+ + *   - chunk error and read -> ignore to be requeued
+ + *   - chunk error and write -> endio ok
+ + * o dead (more than parity_devs failed) and chunk_error-> endio failed
+ + *
+ + * @rw selects the per-chunk bio list to be processed.
+ + */
+ +static void stripe_endio(int rw, struct stripe *stripe)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      unsigned p = rs->set.raid_devs;
+ +      int write = (rw != READ);
+ +
+ +      while (p--) {
+ +              struct stripe_chunk *chunk = CHUNK(stripe, p);
+ +              struct bio_list *bl;
+ +
+ +              BUG_ON(ChunkLocked(chunk));
+ +
+ +              bl = BL_CHUNK(chunk, rw);
+ +              if (bio_list_empty(bl))
+ +                      continue;
+ +
+ +              if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) {
+ +                      /* RAID set dead. */
+ +                      if (unlikely(RSDead(rs)))
+ +                              bio_list_endio(stripe, bl, p, -EIO);
+ +                      /* RAID set degraded. */
+ +                      else if (write)
+ +                              bio_list_endio(stripe, bl, p, 0);
+ +                      /* Reads stay on their list in this case. */
+ +              } else {
+ +                      BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk));
+ +                      bio_list_endio(stripe, bl, p, 0);
+ +              }
+ +      }
+ +}
+ +
+ +/*
+ + * Fail all ios hanging off all bio lists of a stripe with -EIO.
+ + *
+ + * Afterwards the stripe must neither have io nor stripe references left.
+ + */
+ +static void stripe_fail_io(struct stripe *stripe)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      unsigned p;
+ +
+ +      for (p = rs->set.raid_devs; p-- > 0; ) {
+ +              struct stripe_chunk *chunk = CHUNK(stripe, p);
+ +              int i;
+ +
+ +              /* Walk every bio list of this chunk. */
+ +              for (i = ARRAY_SIZE(chunk->bl); i-- > 0; ) {
+ +                      struct bio_list *bl = chunk->bl + i;
+ +
+ +                      if (bio_list_empty(bl))
+ +                              continue;
+ +
+ +                      bio_list_endio(stripe, bl, p, -EIO);
+ +              }
+ +      }
+ +
+ +      /* All bios are gone, hence no references may be left. */
+ +      BUG_ON(stripe_io_ref(stripe));
+ +      BUG_ON(stripe_ref(stripe));
+ +}
+ +
+ +/* Clear the locked flag on all chunks that are flagged for unlock. */
+ +static void stripe_chunks_unlock(struct stripe *stripe)
+ +{
+ +      unsigned p;
+ +
+ +      for (p = RS(stripe->sc)->set.raid_devs; p-- > 0; ) {
+ +              struct stripe_chunk *chunk = CHUNK(stripe, p);
+ +
+ +              if (!TestClearChunkUnlock(chunk))
+ +                      continue;
+ +
+ +              ClearChunkLocked(chunk);
+ +      }
+ +}
+ +
+ +/*
+ + * Queue reads and writes to a stripe by hanging
+ + * their bios off the stripesets read/write lists.
+ + *
+ + * Returns 1 if a write got queued, 0 for a queued read or if the
+ + * bio had to be put on the @reject list because no stripe was found.
+ + */
+ +static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
+ +                          struct bio_list *reject)
+ +{
+ +      struct raid_address addr;
+ +      struct stripe *stripe;
+ +
+ +      stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
+ +      if (stripe) {
+ +              int r = 0, rw = bio_data_dir(bio);
+ +
+ +              /* Distinguish reads and writes. */
+ +              bio_list_add(BL(stripe, addr.di, rw), bio);
+ +
+ +              if (rw == READ)
+ +                      /* REMOVEME: statistics. */
+ +                      atomic_inc(rs->stats + S_BIOS_ADDED_READ);
+ +              else {
+ +                      /* Increment pending write count on region. */
+ +                      dm_rh_inc(rs->recover.rh, stripe->region);
+ +                      r = 1;
+ +
+ +                      /* REMOVEME: statistics. */
+ +                      atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
+ +              }
+ +
+ +              /*
+ +               * Put on io (flush) list in case of
+ +               * initial bio queued to chunk.
+ +               */
+ +              if (chunk_get(CHUNK(stripe, addr.di)) == 1)
+ +                      stripe_flush_add(stripe);
+ +
+ +              return r;
+ +      }
+ +
+ +      /* Got no stripe from cache or failed to lock it -> reject bio. */
+ +      bio_list_add(reject, bio);
+ +      atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Handle all stripes by handing them to the daemon, because we can't
+ + * map their chunk pages to copy the data in interrupt context.
+ + *
+ + * We don't want to handle them here either, while interrupts are disabled.
+ + */
+ +
+ +/*
+ + * Read/write endio function for dm-io (interrupt context).
+ + *
+ + * @context is the stripe chunk the io was submitted for; a non-zero
+ + * @error flags the chunk with an error, otherwise it is set clean.
+ + */
+ +static void endio(unsigned long error, void *context)
+ +{
+ +      struct stripe_chunk *chunk = context;
+ +
+ +      if (unlikely(error)) {
+ +              chunk_set(chunk, ERROR);
+ +              /* REMOVEME: statistics. */
+ +              atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
+ +      } else
+ +              chunk_set(chunk, CLEAN);
+ +
+ +      /*
+ +       * For recovery stripes, I need to reset the locked flag
+ +       * here, because those aren't processed in do_endios().
+ +       */
+ +      if (unlikely(StripeRecover(chunk->stripe)))
+ +              ClearChunkLocked(chunk);
+ +      else
+ +              SetChunkUnlock(chunk);
+ +
+ +      /* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
+ +      stripe_put_references(chunk->stripe);
+ +}
+ +
+ +/*
+ + * Read/Write a chunk asynchronously.
+ + *
+ + * Submits one dm-io request for chunk @p of @stripe: a write if the
+ + * chunk is dirty, a read otherwise.  Completion is signalled to endio()
+ + * with the chunk as its context.
+ + */
+ +static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
+ +{
+ +      struct stripe_cache *sc = stripe->sc;
+ +      struct raid_set *rs = RS(sc);
+ +      struct dm_mem_cache_object *obj = stripe->obj + p;
+ +      struct page_list *pl = obj->pl;
+ +      struct stripe_chunk *chunk = CHUNK(stripe, p);
+ +      struct raid_dev *dev = rs->dev + p;
+ +      struct dm_io_region io = {
+ +              .bdev = dev->dev->bdev,
+ +              .sector = stripe->key,
+ +              .count = stripe->io.size,
+ +      };
+ +      struct dm_io_request control = {
+ +              .bi_rw = ChunkDirty(chunk) ? WRITE : READ,
+ +              .mem = {
+ +                      .type = DM_IO_PAGE_LIST,
+ +                      .ptr.pl = pl,
+ +                      .offset = 0,
+ +              },
+ +              .notify = {
+ +                      .fn = endio,
+ +                      .context = chunk,
+ +              },
+ +              .client = StripeRecover(stripe) ? rs->recover.dm_io_client :
+ +                                                sc->dm_io_client,
+ +      };
+ +
+ +      /* Only uptodate+dirty (write) or neither (read) chunks get here. */
+ +      BUG_ON(ChunkLocked(chunk));
+ +      BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
+ +      BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));
+ +
+ +      /*
+ +       * Don't rw past end of device, which can happen, because
+ +       * typically sectors_per_dev isn't divisible by io_size.
+ +       */
+ +      if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
+ +              io.count = rs->set.sectors_per_dev - io.sector;
+ +
+ +      BUG_ON(!io.count);
+ +      io.sector += dev->start;        /* Add <offset>. */
+ +      if (RSRecover(rs))
+ +              recover_io_count(stripe);       /* Recovery io accounting. */
+ +
+ +      /* REMOVEME: statistics. */
+ +      atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
+ +                                                  S_DM_IO_READ));
+ +      SetChunkLocked(chunk);
+ +      SetDevIoQueued(dev);
+ +      BUG_ON(dm_io(&control, 1, &io, NULL)); /* Submission must not fail. */
+ +}
+ +
+ +/*
+ + * Write dirty or read not uptodate page lists of a stripe.
+ + *
+ + * Returns the result of for_each_io_dev(stripe, stripe_get_references);
+ + * io is only submitted if that is non-zero.
+ + */
+ +static int stripe_chunks_rw(struct stripe *stripe)
+ +{
+ +      int r;
+ +      struct raid_set *rs = RS(stripe->sc);
+ +
+ +      /*
+ +       * Increment the pending count on the stripe
+ +       * first, so that we don't race in endio().
+ +       *
+ +       * An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
+ +       *
+ +       * o not uptodate
+ +       * o dirtied by writes merged
+ +       * o dirtied by parity calculations
+ +       */
+ +      r = for_each_io_dev(stripe, stripe_get_references);
+ +      if (r) {
+ +              /* Io needed: chunks are either not uptodate or dirty. */
+ +              int max;        /* REMOVEME: statistics. */
+ +              struct stripe_cache *sc = &rs->sc;
+ +
+ +              /* Submit actual io. */
+ +              for_each_io_dev(stripe, stripe_chunk_rw);
+ +
+ +              /* REMOVEME: statistics */
+ +              max = sc_active(sc);
+ +              if (atomic_read(&sc->active_stripes_max) < max)
+ +                      atomic_set(&sc->active_stripes_max, max);
+ +
+ +              atomic_inc(rs->stats + S_FLUSHS);
+ +              /* END REMOVEME: statistics */
+ +      }
+ +
+ +      return r;
+ +}
+ +
+ +/*
+ + * Merge in all queued writes: copy the data of the bios into the chunk
+ + * pages, move the bios to the WRITE_MERGED list and dirty the chunk.
+ + */
+ +static void stripe_merge_writes(struct stripe *stripe)
+ +{
+ +      unsigned p;
+ +
+ +      for (p = RS(stripe->sc)->set.raid_devs; p-- > 0; ) {
+ +              struct stripe_chunk *chunk = CHUNK(stripe, p);
+ +              struct bio_list *queued = BL_CHUNK(chunk, WRITE_QUEUED);
+ +              struct page_list *pl;
+ +              struct bio *bio;
+ +
+ +              if (bio_list_empty(queued))
+ +                      continue;
+ +
+ +              pl = stripe->obj[p].pl;
+ +
+ +              /*
+ +               * No lock is held on the lists, because
+ +               * it is just us accessing them anyway.
+ +               */
+ +              bio_list_for_each(bio, queued)
+ +                      bio_copy_page_list(WRITE, stripe, pl, bio);
+ +
+ +              bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), queued);
+ +              bio_list_init(queued);
+ +              chunk_set(chunk, DIRTY);
+ +      }
+ +}
+ +
+ +/*
+ + * Queue all writes of a stripe to get merged.
+ + *
+ + * Moves the bios of each chunk's WRITE list to its WRITE_QUEUED
+ + * list and allows io on any chunk that had writes.
+ + *
+ + * Returns 1 if any writes got queued, else 0.
+ + */
+ +static int stripe_queue_writes(struct stripe *stripe)
+ +{
+ +      int queued = 0;
+ +      unsigned idx;
+ +
+ +      for (idx = RS(stripe->sc)->set.raid_devs; idx--; ) {
+ +              struct stripe_chunk *chunk = CHUNK(stripe, idx);
+ +              struct bio_list *pending = BL_CHUNK(chunk, WRITE);
+ +
+ +              /* Skip chunks without new writes. */
+ +              if (bio_list_empty(pending))
+ +                      continue;
+ +
+ +              bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), pending);
+ +              bio_list_init(pending);
+ +              SetChunkIo(chunk);
+ +              queued = 1;
+ +      }
+ +
+ +      return queued;
+ +}
+ +
+ +
+ +/*
+ + * Check, if a chunk gets completely overwritten.
+ + *
+ + * Sums up the sectors of all bios on the WRITE_QUEUED list of chunk @p
+ + * and returns true if the sum equals the io size of the set.
+ + * More sectors than the io size is a bug.
+ + */
+ +static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      struct bio_list *queued = BL(stripe, p, WRITE_QUEUED);
+ +      struct bio *bio;
+ +      unsigned total = 0;
+ +
+ +      bio_list_for_each(bio, queued)
+ +              total += bio_sectors(bio);
+ +
+ +      BUG_ON(total > rs->set.io_size);
+ +      return total == rs->set.io_size;
+ +}
+ +
+ +/*
+ + * Avoid io on broken/reconstructed drive in order to
+ + * reconstruct data on endio.
+ + *
+ + * (*1*) We set StripeReconstruct() in here, so that _do_endios()
+ + *     will trigger a reconstruct call before resetting it.
+ + *
+ + * @pr is the index of the chunk to prohibit io on; it is
+ + * remembered in stripe->idx.recover. Always returns -EPERM.
+ + */
+ +static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +
+ +      /*
+ +       * Io stays allowed on all chunks but the indexed one,
+ +       * because we're either degraded or prohibit it
+ +       * on that one for later reconstruction.
+ +       *
+ +       * Invalidating includes ClearChunkIo(), ClearChunkUptodate().
+ +       */
+ +      stripe_chunk_invalidate(CHUNK(stripe, pr));
+ +
+ +      /* Remember which chunk to reconstruct and flag the stripe. */
+ +      stripe->idx.recover = pr;
+ +      SetStripeReconstruct(stripe);
+ +
+ +      /* REMOVEME: statistics. */
+ +      atomic_inc(rs->stats + S_PROHIBITCHUNKIO);
+ +      return -EPERM;
+ +}
+ +
+ +/*
+ + * Chunk locked/uptodate and device failed tests.
+ + *
+ + * Returns the chunk at index @p if it is neither locked, in error,
+ + * on a failed device nor uptodate; NULL otherwise.
+ + * *@chunks_uptodate is incremented for an uptodate chunk.
+ + */
+ +static struct stripe_chunk *
+ +stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
+ +{
+ +      struct stripe_chunk *candidate = CHUNK(stripe, p);
+ +      struct raid_set *rs = RS(stripe->sc);
+ +
+ +      /* Can't access active chunks. */
+ +      if (ChunkLocked(candidate)) {
+ +              /* REMOVEME: statistics. */
+ +              atomic_inc(rs->stats + S_CHUNK_LOCKED);
+ +              return NULL;
+ +      }
+ +
+ +      /* Only chunks on working devices are of any further interest. */
+ +      if (!ChunkError(candidate) && !DevFailed(rs->dev + p)) {
+ +              /* Not uptodate -> hand it to the caller. */
+ +              if (!ChunkUptodate(candidate))
+ +                      return candidate;
+ +
+ +              /* Uptodate chunks just get counted. */
+ +              ++*chunks_uptodate;
+ +      }
+ +
+ +      return NULL;
+ +}
+ +
+ +/*
+ + * Degraded/reconstruction mode.
+ + *
+ + * Check stripe state to figure which chunks don't need IO.
+ + *
+ + * Returns:
+ + *   0      - io allowed on all chunks (set dead, stripe already
+ + *            reconstructed or parity in sync)
+ + *   -EBUSY - StripeReconstruct() is already set on the stripe
+ + *   -EPERM - degraded/resynchronizing: io got prohibited on one chunk
+ + */
+ +static int stripe_check_reconstruct(struct stripe *stripe)
+ +{
+ +      int pi, sync;
+ +      struct raid_set *rs = RS(stripe->sc);
+ +
+ +      /* Dead set: reset reconstruction state and allow io everywhere. */
+ +      if (RSDead(rs)) {
+ +              ClearStripeReconstruct(stripe);
+ +              ClearStripeReconstructed(stripe);
+ +              stripe_allow_io(stripe);
+ +              return 0;
+ +      }
+ +
+ +      /* Avoid further reconstruction setting, when already set. */
+ +      if (StripeReconstruct(stripe)) {
+ +              /* REMOVEME: statistics. */
+ +              atomic_inc(rs->stats + S_RECONSTRUCT_SET);
+ +              return -EBUSY;
+ +      }
+ +
+ +      /* Initially allow io on all chunks. */
+ +      stripe_allow_io(stripe);
+ +
+ +      /* Nothing more to do if the stripe is already reconstructed. */
+ +      if (StripeReconstructed(stripe)) {
+ +              atomic_inc(rs->stats + S_RECONSTRUCTED);
+ +              return 0;
+ +      }
+ +
+ +      /*
+ +       * Degraded/reconstruction mode (device failed) ->
+ +       * avoid io on the failed device.
+ +       */
+ +      if (unlikely(RSDegraded(rs))) {
+ +              /* REMOVEME: statistics. */
+ +              atomic_inc(rs->stats + S_DEGRADED);
+ +              /* Allow IO on all devices but the dead one. */
+ +              BUG_ON(rs->set.ei < 0);
+ +              return stripe_chunk_set_io_flags(stripe, rs->set.ei);
+ +      }
+ +
+ +      /*
+ +       * Reconstruction mode (ie. a particular (replaced) device or
+ +       * some (rotating) parity chunk is being resynchronized) ->
+ +       *   o make sure all needed chunks are read in
+ +       *   o cope with 3/4 disk array special case where it
+ +       *     doesn't make a difference to read in parity
+ +       *     to xor data in/out
+ +       */
+ +      pi = dev_for_parity(stripe, &sync);
+ +      if (!RSEnforceParityCreation(rs) && sync)
+ +              return 0;
+ +
+ +      /* REMOVEME: statistics. */
+ +      atomic_inc(rs->stats + S_NOSYNC);
+ +      /* Allow IO on all devs but the one to reconstruct. */
+ +      return stripe_chunk_set_io_flags(stripe, pi);
+ +}
+ +
+ +/*
+ + * Check, if stripe is ready to merge writes.
+ + * I.e. if all chunks present to allow to merge bios.
+ + *
+ + * We prohibit io on:
+ + *
+ + * o chunks on failed devices
+ + * o non-parity chunks without any bios (no reads, no queued
+ + *   and no merged writes)
+ + *
+ + * Chunks which get completely written over are only counted
+ + * (if RSCheckOverwrite() is set); io stays allowed on them.
+ + *
+ + * If nosync is set, chunks handed back by stripe_chunk_check()
+ + * are skipped without touching their io flag.
+ + *
+ + * Returns 0 if the writes can be merged, -EPERM if there
+ + * are not enough chunks to merge yet.
+ + */
+ +static int stripe_merge_possible(struct stripe *stripe, int nosync)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      unsigned chunks_overwrite = 0, chunks_prohibited = 0,
+ +               chunks_uptodate = 0, p = rs->set.raid_devs;
+ +
+ +      /* Walk all chunks. */
+ +      while (p--) {
+ +              struct stripe_chunk *chunk;
+ +
+ +              /* Prohibit io on broken devices. */
+ +              if (DevFailed(rs->dev + p)) {
+ +                      chunk = CHUNK(stripe, p);
+ +                      goto prohibit_io;
+ +              }
+ +
+ +              /*
+ +               * We can't optimize any further if no chunk, i.e. if it
+ +               * is locked, in error or uptodate (the latter got
+ +               * counted in chunks_uptodate by the check).
+ +               */
+ +              chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
+ +              if (!chunk || nosync)
+ +                      continue;
+ +
+ +              /*
+ +               * We have a chunk, which is not uptodate.
+ +               *
+ +               * If this is not parity and we don't have
+ +               * reads queued, we can optimize further.
+ +               */
+ +              if (p != stripe->idx.parity &&
+ +                  bio_list_empty(BL_CHUNK(chunk, READ)) &&
+ +                  bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
+ +                      if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
+ +                              goto prohibit_io;
+ +                      else if (RSCheckOverwrite(rs) &&
+ +                               stripe_check_chunk_overwrite(stripe, p))
+ +                              /* Completely overwritten chunk. */
+ +                              chunks_overwrite++;
+ +              }
+ +
+ +              /* Allow io for chunks with bios and overwritten ones. */
+ +              SetChunkIo(chunk);
+ +              continue;
+ +
+ +prohibit_io:
+ +              /* No io for broken devices or for chunks w/o bios. */
+ +              ClearChunkIo(chunk);
+ +              chunks_prohibited++;
+ +              /* REMOVEME: statistics. */
+ +              atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
+ +      }
+ +
+ +      /* All data chunks will get written over. */
+ +      if (chunks_overwrite == rs->set.data_devs)
+ +              atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
+ +      else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
+ +              /*
+ +               * We don't have enough chunks to merge: some chunk is
+ +               * neither uptodate nor prohibited from io.
+ +               */
+ +              atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
+ +              return -EPERM;
+ +      }
+ +
+ +      /*
+ +       * If we have all chunks up to date or overwrite them, we
+ +       * just zero the parity chunk and let stripe_rw() recreate it.
+ +       */
+ +      if (chunks_uptodate == rs->set.raid_devs ||
+ +          chunks_overwrite == rs->set.data_devs) {
+ +              stripe_zero_chunk(stripe, stripe->idx.parity);
+ +              BUG_ON(StripeReconstruct(stripe));
+ +              SetStripeReconstruct(stripe);   /* Enforce xor in caller. */
+ +      } else {
+ +              /*
+ +               * With less chunks, we xor parity out.
+ +               *
+ +               * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
+ +               *       so that only chunks with queued or merged writes
+ +               *       are being xored.
+ +               */
+ +              parity_xor(stripe);
+ +      }
+ +
+ +      /*
+ +       * We do have enough chunks to merge.
+ +       * All chunks are uptodate or get written over.
+ +       */
+ +      atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Avoid reading chunks in case we're fully operational.
+ + *
+ + * Io gets prohibited on any chunk handed back by stripe_chunk_check()
+ + * which has no bios referencing it, except for the parity chunk.
+ + */
+ +static void stripe_avoid_reads(struct stripe *stripe)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      unsigned unused = 0;    /* Uptodate count is of no interest here. */
+ +      unsigned idx;
+ +
+ +      /* Walk all chunks. */
+ +      for (idx = rs->set.raid_devs; idx--; ) {
+ +              struct stripe_chunk *chunk =
+ +                      stripe_chunk_check(stripe, idx, &unused);
+ +
+ +              if (!chunk)
+ +                      continue;
+ +
+ +              /* Neither bios pending nor parity -> no need to read. */
+ +              if (!chunk_ref(chunk) && idx != stripe->idx.parity) {
+ +                      ClearChunkIo(chunk);
+ +                      /* REMOVEME: statistics. */
+ +                      atomic_inc(rs->stats + S_PROHIBITCHUNKIO);
+ +              } else
+ +                      SetChunkIo(chunk);
+ +      }
+ +}
+ +
+ +/*
+ + * Read/write a stripe.
+ + *
+ + * All stripe read/write activity goes through this function
+ + * unless recovery, which has to call stripe_chunk_rw() directly.
+ + *
+ + * Make sure we don't try already merged stripes in order
+ + * to avoid data corruption.
+ + *
+ + * Check the state of the RAID set and if degraded (or
+ + * resynchronizing for reads), read in all other chunks but
+ + * the one on the dead/resynchronizing device in order to be
+ + * able to reconstruct the missing one in _do_endios().
+ + *
+ + * Can be called on active stripes in order
+ + * to dispatch new io on inactive chunks.
+ + *
+ + * States to cover:
+ + *   o stripe to read and/or write
+ + *   o stripe with error to reconstruct
+ + *
+ + * Returns 0 if the stripe has to wait for a flagged reconstruction
+ + * to finish or if no io got submitted (in the latter case the stripe
+ + * is pushed to the endio list), else the stripe_chunks_rw() result.
+ + */
+ +static int stripe_rw(struct stripe *stripe)
+ +{
+ +      int nosync, r;
+ +      struct raid_set *rs = RS(stripe->sc);
+ +
+ +      /*
+ +       * Check, if a chunk needs to be reconstructed
+ +       * because of a degraded set or a region out of sync.
+ +       *
+ +       * -EPERM skips write queueing/merging and read
+ +       * avoidance and goes straight to io submission.
+ +       */
+ +      nosync = stripe_check_reconstruct(stripe);
+ +      switch (nosync) {
+ +      case -EBUSY:
+ +              return 0; /* Wait for stripe reconstruction to finish. */
+ +      case -EPERM:
+ +              goto io;
+ +      }
+ +
+ +      /*
+ +       * If we don't have merged writes pending, we can schedule
+ +       * queued writes to be merged next without corrupting data.
+ +       */
+ +      if (!StripeMerged(stripe)) {
+ +              r = stripe_queue_writes(stripe);
+ +              if (r)
+ +                      /* Writes got queued -> flag RBW. */
+ +                      SetStripeRBW(stripe);
+ +      }
+ +
+ +      /*
+ +       * Merge all writes hanging off uptodate/overwritten
+ +       * chunks of the stripe.
+ +       */
+ +      if (StripeRBW(stripe)) {
+ +              r = stripe_merge_possible(stripe, nosync);
+ +              if (!r) { /* Merge possible. */
+ +                      struct stripe_chunk *chunk;
+ +
+ +                      /*
+ +                       * I rely on valid parity in order
+ +                       * to xor a fraction of chunks out
+ +                       * of parity and back in.
+ +                       */
+ +                      stripe_merge_writes(stripe);    /* Merge writes in. */
+ +                      parity_xor(stripe);             /* Update parity. */
+ +                      ClearStripeReconstruct(stripe); /* Reset xor enforce. */
+ +                      SetStripeMerged(stripe);        /* Writes merged. */
+ +                      ClearStripeRBW(stripe);         /* Disable RBW. */
+ +
+ +                      /*
+ +                       * REMOVEME: sanity check on parity chunk
+ +                       *           states after writes got merged.
+ +                       */
+ +                      chunk = CHUNK(stripe, stripe->idx.parity);
+ +                      BUG_ON(ChunkLocked(chunk));
+ +                      BUG_ON(!ChunkUptodate(chunk));
+ +                      BUG_ON(!ChunkDirty(chunk));
+ +                      BUG_ON(!ChunkIo(chunk));
+ +              }
+ +      } else if (!nosync && !StripeMerged(stripe))
+ +              /* Read avoidance if not degraded/resynchronizing/merged. */
+ +              stripe_avoid_reads(stripe);
+ +
+ +io:
+ +      /* Now submit any reads/writes for non-uptodate or dirty chunks. */
+ +      r = stripe_chunks_rw(stripe);
+ +      if (!r) {
+ +              /*
+ +               * No io submitted because of chunk io
+ +               * prohibited or locked chunks/failed devices
+ +               * -> push to end io list for processing.
+ +               */
+ +              stripe_endio_push(stripe);
+ +              atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
+ +      }
+ +
+ +      return r;
+ +}
+ +
+ +/*
+ + * Recovery functions
+ + */
+ +/*
+ + * Read a stripe off a raid set for recovery.
+ + *
+ + * @pi: index of the chunk not to read, or negative to read all chunks.
+ + *
+ + * Must not be called with io references pending on the stripe.
+ + * Returns the stripe_chunks_rw() result.
+ + */
+ +static int stripe_recover_read(struct stripe *stripe, int pi)
+ +{
+ +      int r;
+ +
+ +      BUG_ON(stripe_io_ref(stripe));
+ +
+ +      /* Invalidate all chunks so that they get read in. */
+ +      stripe_chunks_invalidate(stripe);
+ +      /* Allow io on all recovery chunks. */
+ +      stripe_allow_io(stripe);
+ +
+ +      /*
+ +       * If we are reconstructing a particular device, we can avoid
+ +       * reading the respective chunk in, because we're going to
+ +       * reconstruct it anyway.
+ +       *
+ +       * We can't do that for resynchronization of rotating parity,
+ +       * because the recovery stripe chunk size is typically larger
+ +       * than the sets chunk size.
+ +       */
+ +      if (pi >= 0) {
+ +              struct stripe_chunk *skip = CHUNK(stripe, pi);
+ +
+ +              ClearChunkIo(skip);
+ +      }
+ +
+ +      r = stripe_chunks_rw(stripe);
+ +      return r;
+ +}
+ +
+ +/*
+ + * Write a stripe to a raid set for recovery.
+ + *
+ + * @pi: index of the chunk to reconstruct, or negative
+ + *      to create the parity chunk instead.
+ + *
+ + * Must not be called with io references pending on the stripe.
+ + * Returns the stripe_chunks_rw() result.
+ + */
+ +static int stripe_recover_write(struct stripe *stripe, int pi)
+ +{
+ +      BUG_ON(stripe_io_ref(stripe));
+ +
+ +      if (pi < 0) {
+ +              /* No particular device to reconstruct -> create parity. */
+ +              parity_xor(stripe);
+ +      } else {
+ +              /* Rebuild the chunk from scratch and mark it for write. */
+ +              stripe_zero_chunk(stripe, pi);
+ +              common_xor(stripe, stripe->io.size, 0, pi);
+ +              chunk_set(CHUNK(stripe, pi), DIRTY);
+ +      }
+ +
+ +      return stripe_chunks_rw(stripe);
+ +}
+ +
+ +/*
+ + * Read/write a recovery stripe.
+ + *
+ + * StripeRBW() set selects the read pass, which flags StripeMerged()
+ + * so that the next call does the write pass. Either pass has to
+ + * submit io. Returns 0 if neither flag was set.
+ + */
+ +static int stripe_recover_rw(struct stripe *stripe)
+ +{
+ +      int submitted = 0, in_sync = 0, pi;
+ +
+ +      /* Read/write flip-flop. */
+ +      if (TestClearStripeRBW(stripe)) {
+ +              /* Read pass: position the stripe and flag the write pass. */
+ +              SetStripeMerged(stripe);
+ +              stripe->key = stripe->recover->pos;
+ +              pi = dev_for_parity(stripe, &in_sync);
+ +              submitted = stripe_recover_read(stripe, pi);
+ +              BUG_ON(!submitted);
+ +      } else if (TestClearStripeMerged(stripe)) {
+ +              /* Write pass. */
+ +              pi = dev_for_parity(stripe, &in_sync);
+ +              submitted = stripe_recover_write(stripe, pi);
+ +              BUG_ON(!submitted);
+ +      }
+ +
+ +      /* A stripe being recovered must never be reported in sync. */
+ +      BUG_ON(in_sync);
+ +      return submitted;
+ +}
+ +
+ +/*
+ + * Recover bandwidth available ?
+ + *
+ + * Returns 1 if recovery io may proceed, 0 if recovery already
+ + * uses more than its share of the work io.
+ + */
+ +static int recover_bandwidth(struct raid_set *rs)
+ +{
+ +      /* On reset or when bios delayed -> allow recovery. */
+ +      if (!recover_io_reset(rs) && !RSBandwidth(rs)) {
+ +              int work = atomic_read(rs->recover.io_count + IO_WORK);
+ +
+ +              if (work) {
+ +                      /* Pay attention to larger recover stripe size. */
+ +                      int recover =
+ +                              atomic_read(rs->recover.io_count + IO_RECOVER) *
+ +                              rs->recover.io_size / rs->set.io_size;
+ +
+ +                      /*
+ +                       * Don't use more than given bandwidth
+ +                       * of the work io for recovery.
+ +                       */
+ +                      if (recover > work / rs->recover.bandwidth_work) {
+ +                              /* REMOVEME: statistics. */
+ +                              atomic_inc(rs->stats + S_NO_BANDWIDTH);
+ +                              return 0;
+ +                      }
+ +              }
+ +      }
+ +
+ +      atomic_inc(rs->stats + S_BANDWIDTH);    /* REMOVEME: statistics. */
+ +      return 1;
+ +}
+ +
+ +/*
+ + * Try to get a region to recover.
+ + *
+ + * Returns:
+ + *   1       - the stripe already has a region in the works
+ + *   0       - got a new region (one global io reference taken)
+ + *   -EPERM  - set is being suspended
+ + *   -ENOENT - the dirty log reports all regions in sync
+ + *   -EAGAIN - no bandwidth or no quiesced region available yet
+ + */
+ +static int stripe_recover_get_region(struct stripe *stripe)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      struct recover *recovery = &rs->recover;
+ +      struct recover_addr *ra = stripe->recover;
+ +      struct dm_rh_client *rh = recovery->rh;
+ +      struct dm_dirty_log *log = recovery->dl;
+ +
+ +      BUG_ON(!log);
+ +      BUG_ON(!rh);
+ +
+ +      /*
+ +       * Report a region we already have first,
+ +       * so that it gets finished during suspension.
+ +       */
+ +      if (ra->reg)
+ +              return 1;
+ +
+ +      if (RSSuspend(rs))
+ +              return -EPERM;
+ +
+ +      /* Nothing left to recover. */
+ +      if (log->type->get_sync_count(log) >= recovery->nr_regions)
+ +              return -ENOENT;
+ +
+ +      /* If we don't have enough bandwidth, we don't proceed recovering. */
+ +      if (!recover_bandwidth(rs))
+ +              return -EAGAIN;
+ +
+ +      /* Start quiescing a region. */
+ +      dm_rh_recovery_prepare(rh);
+ +      ra->reg = dm_rh_recovery_start(rh);
+ +      if (!ra->reg)
+ +              return -EAGAIN;
+ +
+ +      /* Set up the sector window of the region to recover. */
+ +      ra->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(ra->reg));
+ +      ra->end = ra->pos + dm_rh_get_region_size(rh);
+ +
+ +      /*
+ +       * Take one global io reference out for the
+ +       * whole region, which is going to be released
+ +       * when the region is completely done with.
+ +       */
+ +      io_get(rs);
+ +      return 0;
+ +}
+ +
+ +/* Outcome of the recovery of a region as reported to the region hash. */
+ +enum recover_type {
+ +      REC_FAILURE = 0,
+ +      REC_SUCCESS = 1
+ +};
+ +
+ +/*
+ + * Update region hash state.
+ + *
+ + * Ends recovery of the region the stripe is working on with the given
+ + * result, drops the region from the stripe and releases the global
+ + * io reference held for it. Logs an error and does nothing if the
+ + * stripe has no region.
+ + */
+ +static void recover_rh_update(struct stripe *stripe, enum recover_type success)
+ +{
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      struct recover_addr *ra = stripe->recover;
+ +
+ +      if (!ra->reg) {
+ +              DMERR("%s- Called w/o region", __func__);
+ +              return;
+ +      }
+ +
+ +      dm_rh_recovery_end(ra->reg, success);
+ +      if (success != REC_FAILURE)
+ +              rs->recover.nr_regions_recovered++;
+ +
+ +      ra->reg = NULL;
+ +
+ +      /*
+ +       * Completely done with this region ->
+ +       * release the 1st io reference.
+ +       */
+ +      io_put(rs);
+ +}
+ +
+ +/* Set start of recovery state: stamp the start time, reset the end time. */
+ +static void set_start_recovery(struct raid_set *rs)
+ +{
+ +      struct recover *rec = &rs->recover;
+ +
+ +      /* Initialize recovery. */
+ +      rec->end_jiffies = 0;
+ +      rec->start_jiffies = jiffies;
+ +}
+ +
+ +/* Set end of recovery state: clear the recover flag, stamp the end time. */
+ +static void set_end_recovery(struct raid_set *rs)
+ +{
+ +      struct recover *rec = &rs->recover;
+ +
+ +      ClearRSRecover(rs);
+ +      /*
+ +       * Caution: if this is no longer reset, the 'i' stays in the
+ +       * status output, and userspace could rely on it disappearing!
+ +       */
+ +      rs->set.dev_to_init = -1;
+ +
+ +      /* Check for jiffies overrun. */
+ +      rec->end_jiffies = jiffies;
+ +      if (rec->start_jiffies > rec->end_jiffies)
+ +              rec->end_jiffies = ~0;
+ +}
+ +
+ +/*
+ + * Handle recovery on one recovery stripe.
+ + *
+ + * Returns 1 while recovery on this stripe is still going on (io active,
+ + * waiting for bandwidth/a quiesced region, suspending or rescheduled)
+ + * and 0 when it ended, either because there are no more regions to
+ + * recover or because of an io error/degraded set.
+ + */
+ +static int _do_recovery(struct stripe *stripe)
+ +{
+ +      int r;
+ +      struct raid_set *rs = RS(stripe->sc);
+ +      struct recover_addr *addr = stripe->recover;
+ +
+ +      /* If recovery is active -> return. */
+ +      if (stripe_io_ref(stripe))
+ +              return 1;
+ +
+ +      /* IO error is fatal for recovery -> stop it. */
+ +      if (unlikely(StripeError(stripe)))
+ +              goto err;
+ +
+ +      /* Recovery end required. */
+ +      if (unlikely(RSDegraded(rs)))
+ +              goto err;
+ +
+ +      /* Get a region to recover. */
+ +      r = stripe_recover_get_region(stripe);
+ +      switch (r) {
+ +      case 0: /* Got a new region: flag initial read before write. */
+ +              SetStripeRBW(stripe);
+ +              /* Fall through. */
+ +      case 1: /* Have a region in the works. */
+ +              break;
+ +      case -EAGAIN:
+ +              /* No bandwidth/quiesced region yet, try later. */
+ +              if (!io_ref(rs))
+ +                      wake_do_raid_delayed(rs, HZ / 4);
+ +              /* Fall through. */
+ +      case -EPERM:
+ +              /* Suspend. */
+ +              return 1;
+ +      case -ENOENT:   /* No more regions to recover. */
+ +              schedule_work(&rs->io.ws_do_table_event);
+ +              return 0;
+ +      default:
+ +              BUG();
+ +      }
+ +
+ +      /* Read/write a recover stripe. */
+ +      r = stripe_recover_rw(stripe);
+ +      if (r)
+ +              /* IO initiated. */
+ +              return 1;
+ +
+ +      /* Read and write finished-> update recovery position within region. */
+ +      addr->pos += stripe->io.size;
+ +
+ +      /* If we're at end of region, update region hash. */
+ +      if (addr->pos >= addr->end ||
+ +          addr->pos >= rs->set.sectors_per_dev)
+ +              recover_rh_update(stripe, REC_SUCCESS);
+ +      else
+ +              /* Prepare to read next region segment. */
+ +              SetStripeRBW(stripe);
+ +
+ +      /* Schedule myself for another round... */
+ +      wake_do_raid(rs);
+ +      return 1;
+ +
+ +err:
+ +      /* FIXME: rather try recovering other regions on error? */
+ +      rs_check_degrade(stripe);
+ +      recover_rh_update(stripe, REC_FAILURE);
+ +
+ +      /* Check state of partially recovered array. */
+ +      if (RSDegraded(rs) && !RSDead(rs) &&
+ +          rs->set.dev_to_init != -1 &&
+ +          rs->set.ei != rs->set.dev_to_init) {
+ +              /* Broken drive != drive to recover -> FATAL. */
+ +              SetRSDead(rs);
+ +              DMERR("FATAL: failed device != device to initialize -> "
+ +                    "RAID set broken");
+ +      }
+ +
+ +      if (StripeError(stripe) || RSDegraded(rs)) {
+ +              char buf[BDEVNAME_SIZE];
+ +
+ +              DMERR("stopping recovery due to "
+ +                    "ERROR on /dev/%s, stripe at offset %llu",
+ +                    bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
+ +                    (unsigned long long) stripe->key);
+ +      }
+ +
+ +      /*
+ +       * Make sure, that all quiesced regions get released.
+ +       *
+ +       * The regions are ended as failed: the second argument of
+ +       * dm_rh_recovery_end() is a success flag, so a negative errno
+ +       * would report them as successfully recovered.
+ +       *
+ +       * NOTE(review): recover_rh_update() above always leaves
+ +       * addr->reg NULL, so this loop is not entered as it stands;
+ +       * confirm whether further quiesced regions should be fetched
+ +       * and released here.
+ +       */
+ +      while (addr->reg) {
+ +              dm_rh_recovery_end(addr->reg, REC_FAILURE);
+ +              addr->reg = dm_rh_recovery_start(rs->recover.rh);
+ +      }
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Called by main io daemon to recover regions.
+ + *
+ + * Runs _do_recovery() on every recovery stripe and returns the sum of
+ + * the results. Once no stripe reports recovery in progress any more,
+ + * recovery is ended, the recovery stripes are freed and 0 is returned.
+ + * Returns 0 right away if the set is not recovering.
+ + */
+ +static int do_recovery(struct raid_set *rs)
+ +{
+ +      int busy = 0;
+ +      struct stripe *stripe;
+ +
+ +      if (!RSRecover(rs))
+ +              return 0;
+ +
+ +      list_for_each_entry(stripe, &rs->recover.stripes,
+ +                          lists[LIST_RECOVER])
+ +              busy += _do_recovery(stripe);
+ +
+ +      /* Some stripe still recovering -> keep going. */
+ +      if (busy)
+ +              return busy;
+ +
+ +      /* All recovery stripes are done -> wrap up. */
+ +      set_end_recovery(rs);
+ +      stripe_recover_free(rs);
+ +      return 0;
+ +}
+ +
+ +/*
+ + * END recovery functions
+ + */
+ +
+ +/*
+ + * End io process one stripe handed in by endio() callback.
+ + *
+ + * Unlocks the chunks, handles a stripe error, reconstructs a missing
+ + * chunk if flagged, ends io on reads and (unless writes are
+ + * prohibited) on merged writes and fails io on a dead set.
+ + *
+ + * A stripe which is still referenced afterwards is added to
+ + * the tail of flush_list, else it is put back on the LRU list.
+ + */
+ +static void _do_endios(struct raid_set *rs, struct stripe *stripe,
+ +                     struct list_head *flush_list)
+ +{
+ +      /* First unlock all required chunks. */
+ +      stripe_chunks_unlock(stripe);
+ +
+ +      /*
+ +       * If an io error on a stripe occurred, degrade the RAID set
+ +       * and try to endio as many bios as possible. If any bios can't
+ +       * be endio processed, requeue the stripe (stripe_ref() != 0).
+ +       */
+ +      if (TestClearStripeError(stripe)) {
+ +              /*
+ +               * FIXME: if read, rewrite the failed chunk after reconstruction
+ +               *        in order to trigger disk bad sector relocation.
+ +               */
+ +              rs_check_degrade(stripe); /* Resets ChunkError(). */
+ +              ClearStripeReconstruct(stripe);
+ +              ClearStripeReconstructed(stripe);
+ +
+ +              /*
+ +               * FIXME: if write, don't endio writes in flight and don't
+ +               *        allow for new writes until userspace has updated
+ +               *        its metadata.
+ +               */
+ +      }
+ +
+ +      /* Got to reconstruct a missing chunk. */
+ +      if (StripeReconstruct(stripe)) {
+ +              /*
+ +               * (*2*) We use StripeReconstruct() to allow for
+ +               *       all chunks to be xored into the reconstructed
+ +               *       one (see chunk_must_xor()).
+ +               */
+ +              stripe_reconstruct(stripe);
+ +
+ +              /*
+ +               * (*3*) Now we reset StripeReconstruct() and flag
+ +               *       StripeReconstructed() to show to stripe_rw(),
+ +               *       that we have reconstructed a missing chunk.
+ +               */
+ +              ClearStripeReconstruct(stripe);
+ +              SetStripeReconstructed(stripe);
+ +
+ +              /* FIXME: reschedule to be written in case of read. */
+ +              /* if (!RSDead && RSDegraded(rs) !StripeRBW(stripe)) {
+ +                      chunk_set(CHUNK(stripe, stripe->idx.recover), DIRTY);
+ +                      stripe_chunks_rw(stripe);
+ +              } */
+ +
+ +              stripe->idx.recover = -1;
+ +      }
+ +
+ +      /*
+ +       * Now that we eventually got a complete stripe, we
+ +       * can process the rest of the end ios on reads.
+ +       */
+ +      stripe_endio(READ, stripe);
+ +
+ +      /* End io all merged writes if not prohibited. */
+ +      if (!RSProhibitWrites(rs) && StripeMerged(stripe)) {
+ +              ClearStripeMerged(stripe);
+ +              stripe_endio(WRITE_MERGED, stripe);
+ +      }
+ +
+ +      /* If RAID set is dead -> fail any ios to dead drives. */
+ +      if (RSDead(rs)) {
+ +              if (!TestSetRSDeadEndioMessage(rs))
+ +                      DMERR("RAID set dead: failing ios to dead devices");
+ +
+ +              stripe_fail_io(stripe);
+ +      }
+ +
+ +      /*
+ +       * We have stripe references still,
+ +       * because of read before writes or IO errors ->
+ +       * got to put on flush list for processing.
+ +       */
+ +      if (stripe_ref(stripe)) {
+ +              BUG_ON(!list_empty(stripe->lists + LIST_LRU));
+ +              list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
+ +              atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
+ +      } else
+ +              stripe_lru_add(stripe);
+ +}
+ +
+ +/*
+ + * Pop any endio stripes off of the endio list and belabour them.
+ + *
+ + * Stripes which still have io references are skipped. Stripes
+ + * _do_endios() wants requeued are collected on a local list first.
+ + */
+ +static void do_endios(struct raid_set *rs)
+ +{
+ +      /* IO flush list for sorted requeued stripes. */
+ +      struct list_head requeue;
+ +      struct stripe_cache *sc = &rs->sc;
+ +      struct stripe *stripe;
+ +
+ +      INIT_LIST_HEAD(&requeue);
+ +
+ +      for (;;) {
+ +              stripe = stripe_endio_pop(sc);
+ +              if (!stripe)
+ +                      break;
+ +
+ +              /* Avoid endio on stripes with newly io'ed chunks. */
+ +              if (stripe_io_ref(stripe))
+ +                      continue;
+ +
+ +              _do_endios(rs, stripe, &requeue);
+ +      }
+ +
+ +      /*
+ +       * Insert any requeued stripes in the proper
+ +       * order at the beginning of the io (flush) list.
+ +       */
+ +      list_splice(&requeue, sc->lists + LIST_FLUSH);
+ +}
+ +
+ +/*
+ + * Flush any stripes on the io list.
+ + *
+ + * Returns the sum of the stripe_rw() results of all stripes popped.
+ + */
+ +static int do_flush(struct raid_set *rs)
+ +{
+ +      int submitted = 0;
+ +      struct stripe *stripe;
+ +
+ +      for (stripe = stripe_io_pop(&rs->sc); stripe;
+ +           stripe = stripe_io_pop(&rs->sc))
+ +              submitted += stripe_rw(stripe); /* Read/write stripe. */
+ +
+ +      return submitted;
+ +}
+ +
+ +/*
+ + * Stripe cache resizing.
+ + *
+ + * A non-zero stripes_to_set is the requested number of stripes;
+ + * it is reset to 0 once growing/shrinking succeeded.
+ + */
+ +static void do_sc_resize(struct raid_set *rs)
+ +{
+ +      int r;
+ +      unsigned current_nr, target = atomic_read(&rs->sc.stripes_to_set);
+ +
+ +      /* No resize requested. */
+ +      if (!target)
+ +              return;
+ +
+ +      current_nr = atomic_read(&rs->sc.stripes);
+ +      if (target > current_nr)
+ +              r = sc_grow(&rs->sc, target - current_nr, SC_GROW);
+ +      else
+ +              r = sc_shrink(&rs->sc, current_nr - target);
+ +
+ +      /* Flag end of resizing if ok. */
+ +      if (!r)
+ +              atomic_set(&rs->sc.stripes_to_set, 0);
+ +}
+ +
+ +/*
+ + * Process all ios
+ + *
+ + * We do different things with the io depending
+ + * on the state of the region that it is in:
+ + *
+ + * o reads: hang off stripe cache or postpone if full
+ + *
+ + * o writes:
+ + *
+ + *  CLEAN/DIRTY/NOSYNC:       increment pending and hang io off stripe's stripe set.
+ + *                    In case stripe cache is full or busy, postpone the io.
+ + *
+ + *  RECOVERING:               delay the io until recovery of the region completes.
+ + *
+ + * Bios which can't be processed now are merged back
+ + * to the head of the input list before returning.
+ + */
+ +static void do_ios(struct raid_set *rs, struct bio_list *ios)
+ +{
+ +      unsigned queued = 0, delayed = 0;
+ +      struct dm_rh_client *rh = rs->recover.rh;
+ +      struct bio_list postponed;
+ +      struct bio *bio;
+ +
+ +      bio_list_init(&postponed);
+ +
+ +      /*
+ +       * Classify each io:
+ +       *    o delay writes to recovering regions (let reads go through)
+ +       *    o queue io to all other regions
+ +       */
+ +      while ((bio = bio_list_pop(ios))) {
+ +              sector_t sector;
+ +
+ +              /*
+ +               * In case we get a barrier bio, push it back onto
+ +               * the input queue unless all work queues are empty
+ +               * and the stripe cache is inactive.
+ +               */
+ +              if (bio->bi_rw & REQ_FLUSH) {
+ +                      /* REMOVEME: statistics. */
+ +                      atomic_inc(rs->stats + S_BARRIER);
+ +                      if (delayed ||
+ +                          !list_empty(rs->sc.lists + LIST_FLUSH) ||
+ +                          !bio_list_empty(&postponed) ||
+ +                          sc_active(&rs->sc)) {
+ +                              bio_list_push(ios, bio);
+ +                              break;
+ +                      }
+ +              }
+ +
+ +              /* If writes prohibited because of failures -> postpone. */
+ +              if (RSProhibitWrites(rs) && bio_data_dir(bio) == WRITE) {
+ +                      bio_list_add(&postponed, bio);
+ +                      continue;
+ +              }
+ +
+ +              /* Check for recovering regions. */
+ +              sector = _sector(rs, bio);
+ +              if (unlikely(region_state(rs, sector, DM_RH_RECOVERING))) {
+ +                      delayed++;
+ +                      /* Wait writing to recovering regions. */
+ +                      dm_rh_delay_by_region(rh, bio,
+ +                                            dm_rh_sector_to_region(rh,
+ +                                                                   sector));
+ +                      /* REMOVEME: statistics.*/
+ +                      atomic_inc(rs->stats + S_DELAYED_BIOS);
+ +                      atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
+ +
+ +                      /* Force bandwidth tests in recovery. */
+ +                      SetRSBandwidth(rs);
+ +                      continue;
+ +              }
+ +
+ +              /*
+ +               * Process ios to non-recovering regions by queueing
+ +               * them to stripes (does dm_rh_inc()) for writes).
+ +               */
+ +              queued += stripe_queue_bio(rs, bio, &postponed);
+ +      }
+ +
+ +      /*
+ +       * Writes got queued -> flush dirty log.
+ +       *
+ +       * FIXME: better error handling.
+ +       */
+ +      if (queued && dm_rh_flush(rh))
+ +              DMERR_LIMIT("dirty log flush");
+ +
+ +      /* Merge any rejected bios back to the head of the input list. */
+ +      bio_list_merge_head(ios, &postponed);
+ +}
+ +
+ +/*
+ + * Schedule the table event work once when the stripe cache turns busy;
+ + * drop the busy flag again as soon as it is not busy any more.
+ + */
+ +static void do_busy_event(struct raid_set *rs)
+ +{
+ +      if (!sc_busy(rs)) {
+ +              ClearRSScBusy(rs);
+ +              return;
+ +      }
+ +
+ +      /* Only the transition to busy throws an event. */
+ +      if (TestSetRSScBusy(rs))
+ +              return;
+ +
+ +      schedule_work(&rs->io.ws_do_table_event);
+ +}
+ +
+ +/* Work function of io.ws_do_table_event: throw a dm table event. */
+ +static void do_table_event(struct work_struct *ws)
+ +{
+ +      struct raid_set *rs;
+ +
+ +      rs = container_of(ws, struct raid_set, io.ws_do_table_event);
+ +      dm_table_event(rs->ti->table);
+ +}
+ +
+ +
+ +/*-----------------------------------------------------------------
+ + * RAID daemon
+ + *---------------------------------------------------------------*/
+ +/*
+ + * RAID daemon worker (work function of io.dws_do_raid):
+ + *
+ + * o belabour all end ios
+ + * o update the region hash states
+ + * o optionally resize the stripe cache
+ + * o optionally do recovery
+ + * o grab the input queue
+ + * o work on all requeued or new ios and perform stripe cache flushes
+ + * o check, if the stripe cache gets too busy and throw an event if so
+ + */
+ +static void do_raid(struct work_struct *ws)
+ +{
+ +      struct raid_set *rs = container_of(ws, struct raid_set,
+ +                                         io.dws_do_raid.work);
+ +      struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
+ +
+ +      /*
+ +       * We always need to end io, so that ios can get errored in
+ +       * case the set failed and the region counters get decremented
+ +       * before we update region hash states and go any further.
+ +       */
+ +      do_endios(rs);
+ +      dm_rh_update_states(rs->recover.rh, 1);
+ +
+ +      /*
+ +       * Now that we've end io'd, which may have put stripes on the LRU list
+ +       * to allow for shrinking, we resize the stripe cache if requested.
+ +       */
+ +      do_sc_resize(rs);
+ +
+ +      /* Try to recover regions; the result is not acted upon here. */
+ +      (void) do_recovery(rs);
+ +
+ +      /* Quickly grab all new ios queued and add them to the work list. */
+ +      mutex_lock(&rs->io.in_lock);
+ +      bio_list_merge(ios, ios_in);
+ +      bio_list_init(ios_in);
+ +      mutex_unlock(&rs->io.in_lock);
+ +
+ +      if (!bio_list_empty(ios))
+ +              do_ios(rs, ios); /* Got ios to work into the cache. */
+ +
+ +      /* Flush any stripes on io list; the result is not acted upon here. */
+ +      (void) do_flush(rs);
+ +
+ +      do_busy_event(rs);      /* Check if we got too busy. */
+ +}
+ +
+ +/*
+ + * Region hash callback (invoked via dm_rh_update_states()):
+ + * takes over the bios which had been delayed on recovered
+ + * regions and puts them at the head of the work list.
+ + */
+ +static void dispatch_delayed_bios(void *context, struct bio_list *bl)
+ +{
+ +      struct bio *delayed;
+ +      struct raid_set *rs = context;
+ +      struct bio_list *work = &rs->io.work;
+ +
+ +      /* REMOVEME: statistics; one decrement per bio handed back. */
+ +      bio_list_for_each(delayed, bl) {
+ +              atomic_dec(rs->stats + S_DELAYED_BIOS);
+ +      }
+ +
+ +      /* Move the region hash private list over and leave it empty. */
+ +      bio_list_merge_head(work, bl);
+ +      bio_list_init(bl);
+ +
+ +      ClearRSBandwidth(rs);
+ +}
+ +
+ +/*************************************************************
+ + * Constructor helpers
+ + *************************************************************/
+ +/*
+ + * Calculate MB/sec.
+ + *
+ + * Multiplies rs->xor.speed by the number of data devices, @io_size and HZ,
+ + * divides by XOR_SPEED_TICKS and scales the result through a >> 10,
+ + * to_bytes() and another >> 10.
+ + *
+ + * NOTE(review): the whole product is formed before the first shift, so it
+ + * may wrap if the operands are only 32 bit wide - confirm operand types.
+ + */
+ +static unsigned mbpers(struct raid_set *rs, unsigned io_size)
+ +{
+ +      return to_bytes((rs->xor.speed * rs->set.data_devs *
+ +                       io_size * HZ / XOR_SPEED_TICKS) >> 10) >> 10;
+ +}
+ +
+ +/*
+ + * Discover fastest xor algorithm and # of chunks combination.
+ + */
+ +/*
+ + * Calculate speed of particular algorithm and # of chunks.
+ + *
+ + * Marks all chunks of @stripe uptodate, then counts how many
+ + * common_xor() calls on the stripe complete within one jiffy, repeated
+ + * for XOR_SPEED_TICKS jiffies.
+ + *
+ + * Returns the highest per-jiffy count seen. The algorithm and chunk
+ + * count under test are not parameters here; presumably common_xor()
+ + * picks them up from the RAID set (the caller sets rs->xor.f and
+ + * rs->xor.chunks beforehand).
+ + */
+ +static unsigned xor_speed(struct stripe *stripe)
+ +{
+ +      int ticks = XOR_SPEED_TICKS;
+ +      unsigned p = RS(stripe->sc)->set.raid_devs, r = 0;
+ +      unsigned long j;
+ +
+ +      /* Set uptodate so that common_xor()->xor() will belabour chunks. */
+ +      while (p--)
+ +              SetChunkUptodate(CHUNK(stripe, p));
+ +
+ +      /* Wait for next tick, so that the first measured jiffy is a full one. */
+ +      for (j = jiffies; j == jiffies; );
+ +
+ +      /* Do xors for a few ticks. */
+ +      while (ticks--) {
+ +              unsigned xors = 0;
+ +
+ +              /* Busy loop until jiffies advances; count the xors done. */
+ +              for (j = jiffies; j == jiffies; ) {
+ +                      mb();
+ +                      common_xor(stripe, stripe->io.size, 0, 0);
+ +                      mb();
+ +                      xors++;
+ +                      mb();
+ +              }
+ +
+ +              /* Keep the best tick. */
+ +              if (xors > r)
+ +                      r = xors;
+ +      }
+ +
+ +      return r;
+ +}
+ +
+ +/* Define for xor multi recovery stripe optimization runs. */
+ +#define DMRAID45_XOR_TEST
+ +
+ +/*
+ + * Optimize xor algorithm for this RAID set.
+ + *
+ + * Walks xor_funcs[] from its last to its first entry and, for each
+ + * function, measures xor_speed() for every chunk count from the
+ + * function's maximum (capped at the number of raid devices) down to 2.
+ + * With DMRAID45_XOR_TEST defined, this is repeated for every stripe on
+ + * the recovery stripe list and summary figures get logged; without it
+ + * only the first recovery stripe is used.
+ + *
+ + * The fastest function / chunk count combination is stored in
+ + * rs->xor.f and rs->xor.chunks; the respective speed is returned.
+ + *
+ + * The recovery stripe list must not be empty (BUG_ON).
+ + *
+ + * NOTE(review): if no measurement ever exceeds 0, f_max stays NULL
+ + * and rs->xor.f ends up NULL - confirm this cannot happen in practice.
+ + */
+ +static unsigned xor_optimize(struct raid_set *rs)
+ +{
+ +      unsigned chunks_max = 2, speed_max = 0;
+ +      struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
+ +      struct stripe *stripe;
+ +      unsigned io_size = 0, speed_hm = 0, speed_min = ~0, speed_xor_blocks = 0;
+ +
+ +      BUG_ON(list_empty(&rs->recover.stripes));
+ +#ifndef DMRAID45_XOR_TEST
+ +      stripe = list_first_entry(&rs->recover.stripes, struct stripe,
+ +                                lists[LIST_RECOVER]);
+ +#endif
+ +
+ +      /* Try all xor functions. */
+ +      while (f-- > xor_funcs) {
+ +              unsigned speed;
+ +
+ +#ifdef DMRAID45_XOR_TEST
+ +              list_for_each_entry(stripe, &rs->recover.stripes,
+ +                                  lists[LIST_RECOVER]) {
+ +                      io_size = stripe->io.size;
+ +#endif
+ +
+ +                      /* Set actual xor function for common_xor(). */
+ +                      rs->xor.f = f;
+ +                      rs->xor.chunks = (f->f == xor_blocks_wrapper ?
+ +                                        (MAX_XOR_BLOCKS + 1) :
+ +                                        XOR_CHUNKS_MAX);
+ +                      if (rs->xor.chunks > rs->set.raid_devs)
+ +                              rs->xor.chunks = rs->set.raid_devs;
+ +
+ +                      /* Measure each chunk count down to 2. */
+ +                      for ( ; rs->xor.chunks > 1; rs->xor.chunks--) {
+ +                              speed = xor_speed(stripe);
+ +
+ +#ifdef DMRAID45_XOR_TEST
+ +                              /* Track best per function class and overall minimum. */
+ +                              if (f->f == xor_blocks_wrapper) {
+ +                                      if (speed > speed_xor_blocks)
+ +                                              speed_xor_blocks = speed;
+ +                              } else if (speed > speed_hm)
+ +                                      speed_hm = speed;
+ +
+ +                              if (speed < speed_min)
+ +                                      speed_min = speed;
+ +#endif
+ +
+ +                              if (speed > speed_max) {
+ +                                      speed_max = speed;
+ +                                      chunks_max = rs->xor.chunks;
+ +                                      f_max = f;
+ +                              }
+ +                      }
+ +#ifdef DMRAID45_XOR_TEST
+ +              }
+ +#endif
+ +      }
+ +
+ +      /* Memorize optimal parameters. */
+ +      rs->xor.f = f_max;
+ +      rs->xor.chunks = chunks_max;
+ +#ifdef DMRAID45_XOR_TEST
+ +      DMINFO("%s stripes=%u/size=%u min=%u xor_blocks=%u hm=%u max=%u",
+ +             speed_max == speed_hm ? "HM" : "NB",
+ +             rs->recover.recovery_stripes, io_size, speed_min,
+ +             speed_xor_blocks, speed_hm, speed_max);
+ +#endif
+ +      return speed_max;
+ +}
+ +
+ +/*
+ + * Allocate a RAID context (a RAID set)
+ + */
+ +/*
+ + * Structure for variable RAID parameters.
+ + *
+ + * Filled in by get_raid_variable_parms(): a member without suffix holds
+ + * the value to use (preset to its default, overwritten by a table line
+ + * value other than -1); its *_parm sibling receives the value as given
+ + * on the table line and is only written for parameters present there.
+ + */
+ +struct variable_parms {
+ +      int bandwidth;                  /* Recovery bandwidth percentage. */
+ +      int bandwidth_parm;
+ +      int chunk_size;                 /* Chunk size. */
+ +      int chunk_size_parm;
+ +      int io_size;                    /* Io size; follows a given chunk size. */
+ +      int io_size_parm;
+ +      int stripes;                    /* # of stripes for the stripe cache. */
+ +      int stripes_parm;
+ +      int recover_io_size;            /* Recovery io size. */
+ +      int recover_io_size_parm;
+ +      int raid_parms;                 /* # of variable parameters given. */
+ +      int recovery;                   /* 1 unless "nosync" got requested. */
+ +      int recovery_stripes;           /* # of recovery stripes. */
+ +      int recovery_stripes_parm;
+ +};
+ +
+ +/*
+ + * Allocate and initialize a RAID set context.
+ + *
+ + * @raid_type:       RAID type selected on the table line
+ + * @p:               variable RAID parameters
+ + * @raid_devs:       number of component devices
+ + * @sectors_per_dev: sectors used on each component device
+ + * @ti:              target being constructed (ti->error is set on failure)
+ + * @dl_parms:        number of dirty log parameters
+ + * @argv:            table line; argv[0] is the dirty log type and the
+ + *                   dirty log parameters start at argv[2]
+ + *
+ + * Creates the dirty log, the region hash and the stripe cache and sets
+ + * up the members derived from the arguments. Component devices are not
+ + * acquired here.
+ + *
+ + * Returns the RAID set or an ERR_PTR(); on failure everything created
+ + * in here has been released again.
+ + */
+ +static struct raid_set *
+ +context_alloc(struct raid_type *raid_type, struct variable_parms *p,
+ +            unsigned raid_devs, sector_t sectors_per_dev,
+ +            struct dm_target *ti, unsigned dl_parms, char **argv)
+ +{
+ +      int r;
+ +      size_t len;
+ +      sector_t region_size, ti_len;
+ +      struct raid_set *rs = NULL;
+ +      struct dm_dirty_log *dl;
+ +      struct recover *rec;
+ +
+ +      /*
+ +       * Create the dirty log
+ +       *
+ +       * We need to change length for the dirty log constructor,
+ +       * because we want an amount of regions for all stripes derived
+ +       * from the single device size, so that we can keep region
+ +       * size = 2^^n independent of the number of devices
+ +       */
+ +      ti_len = ti->len;
+ +      ti->len = sectors_per_dev;
+ +      dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
+ +      ti->len = ti_len;
+ +      if (!dl)
+ +              goto bad_dirty_log;
+ +
+ +      /* Chunk size *must not* be larger than region size. */
+ +      region_size = dl->type->get_region_size(dl);
+ +      if (p->chunk_size > region_size)
+ +              goto bad_chunk_size;
+ +
+ +      /* Recover io size *must not* be larger than region size either. */
+ +      if (p->recover_io_size > region_size)
+ +              goto bad_recover_io_size;
+ +
+ +      /*
+ +       * Size and allocate the RAID set structure; one rs->dev[]
+ +       * and one rs->data[] element per device trail the structure.
+ +       */
+ +      len = sizeof(*rs->data) + sizeof(*rs->dev);
+ +      if (dm_array_too_big(sizeof(*rs), len, raid_devs))
+ +              goto bad_array;
+ +
+ +      len = sizeof(*rs) + raid_devs * len;
+ +      rs = kzalloc(len, GFP_KERNEL);
+ +      if (!rs)
+ +              goto bad_alloc;
+ +
+ +      rec = &rs->recover;
+ +      atomic_set(&rs->io.in_process, 0);
+ +      atomic_set(&rs->io.in_process_max, 0);
+ +      rec->io_size = p->recover_io_size;
+ +
+ +      /* Pointer to data array (located right behind the device array). */
+ +      rs->data = (unsigned long **)
+ +                 ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
+ +      rec->dl = dl;
+ +      rs->set.raid_devs = raid_devs;
+ +      rs->set.data_devs = raid_devs - raid_type->parity_devs;
+ +      rs->set.raid_type = raid_type;
+ +
+ +      /* Keep the parameters as given on the table line. */
+ +      rs->set.raid_parms = p->raid_parms;
+ +      rs->set.chunk_size_parm = p->chunk_size_parm;
+ +      rs->set.io_size_parm = p->io_size_parm;
+ +      rs->sc.stripes_parm = p->stripes_parm;
+ +      rec->io_size_parm = p->recover_io_size_parm;
+ +      rec->bandwidth_parm = p->bandwidth_parm;
+ +      rec->recovery = p->recovery;
+ +      rec->recovery_stripes = p->recovery_stripes;
+ +
+ +      /*
+ +       * Set chunk and io size and respective shifts
+ +       * (used to avoid divisions)
+ +       */
+ +      rs->set.chunk_size = p->chunk_size;
+ +      rs->set.chunk_shift = ffs(p->chunk_size) - 1;
+ +
+ +      rs->set.io_size = p->io_size;
+ +      rs->set.io_mask = p->io_size - 1;
+ +      /* Mask to adjust address key in case io_size != chunk_size. */
+ +      rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;
+ +
+ +      rs->set.sectors_per_dev = sectors_per_dev;
+ +
+ +      rs->set.ei = -1;        /* Indicate no failed device. */
+ +      atomic_set(&rs->set.failed_devs, 0);
+ +
+ +      rs->ti = ti;
+ +
+ +      atomic_set(rec->io_count + IO_WORK, 0);
+ +      atomic_set(rec->io_count + IO_RECOVER, 0);
+ +
+ +      /* Initialize io lock and queues. */
+ +      mutex_init(&rs->io.in_lock);
+ +      mutex_init(&rs->io.xor_lock);
+ +      bio_list_init(&rs->io.in);
+ +      bio_list_init(&rs->io.work);
+ +
+ +      init_waitqueue_head(&rs->io.suspendq);  /* Suspend waiters (dm-io). */
+ +
+ +      rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
+ +      rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
+ +                      wake_dummy, wake_do_raid, 0, p->recovery_stripes,
+ +                      dl, region_size, rec->nr_regions);
+ +      if (IS_ERR(rec->rh))
+ +              goto bad_rh;
+ +
+ +      /* Initialize stripe cache. */
+ +      r = sc_init(rs, p->stripes);
+ +      if (r)
+ +              goto bad_sc;
+ +
+ +      /* REMOVEME: statistics. */
+ +      stats_reset(rs);
+ +      ClearRSDevelStats(rs);  /* Disable development status. */
+ +      return rs;
+ +
+ +      /* Error paths; each one returns, none falls through to the next. */
+ +bad_dirty_log:
+ +      TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));
+ +
+ +bad_chunk_size:
+ +      dm_dirty_log_destroy(dl);
+ +      TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));
+ +
+ +bad_recover_io_size:
+ +      dm_dirty_log_destroy(dl);
+ +      TI_ERR_RET("Recover stripe io size larger than region size",
+ +                      ERR_PTR(-EINVAL));
+ +
+ +bad_array:
+ +      dm_dirty_log_destroy(dl);
+ +      TI_ERR_RET("Array too big", ERR_PTR(-EINVAL));
+ +
+ +bad_alloc:
+ +      dm_dirty_log_destroy(dl);
+ +      TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
+ +
+ +bad_rh:
+ +      dm_dirty_log_destroy(dl);
+ +      ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
+ +      goto free_rs;
+ +
+ +bad_sc:
+ +      dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
+ +      sc_exit(&rs->sc);
+ +      ti->error = DM_MSG_PREFIX "Error creating stripe cache";
+ +free_rs:
+ +      kfree(rs);
+ +      return ERR_PTR(-ENOMEM);
+ +}
+ +
+ +/*
+ + * Free a RAID context (a RAID set).
+ + *
+ + * Puts the first @p component devices (highest index first), then
+ + * tears down stripe cache and region hash before freeing the set.
+ + */
+ +static void context_free(struct raid_set *rs, unsigned p)
+ +{
+ +      for ( ; p; p--)
+ +              dm_put_device(rs->ti, rs->dev[p - 1].dev);
+ +
+ +      sc_exit(&rs->sc);
+ +
+ +      /* Destroys dirty log too. */
+ +      dm_region_hash_destroy(rs->recover.rh);
+ +
+ +      kfree(rs);
+ +}
+ +
+ +/*
+ + * Create the single threaded daemon work queue of a RAID set and
+ + * initialize its delayed io work plus the table event work.
+ + *
+ + * Returns 0 on success or -ENOMEM if the work queue can't be created.
+ + */
+ +static int rs_workqueue_init(struct raid_set *rs)
+ +{
+ +      struct dm_target *ti = rs->ti;
+ +      struct workqueue_struct *wq = create_singlethread_workqueue(DAEMON);
+ +
+ +      rs->io.wq = wq;
+ +      if (wq == NULL)
+ +              TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
+ +
+ +      INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
+ +      INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Look up @name in raid_types[], searching from the last entry
+ + * backwards; returns the matching entry or NULL if there is none.
+ + */
+ +static struct raid_type *get_raid_type(char *name)
+ +{
+ +      struct raid_type *type;
+ +
+ +      for (type = ARRAY_END(raid_types); type-- > raid_types; )
+ +              if (strcmp(type->name, name) == 0)
+ +                      return type;
+ +
+ +      return NULL;
+ +}
+ +
+ +/*
+ + * Store the quotient of @a / @b in @n and return true
+ + * if @a is a multiple of @b (ie. divisible without rest).
+ + *
+ + * FIXME: factor out to dm core.
+ + */
+ +static int multiple(sector_t a, sector_t b, sector_t *n)
+ +{
+ +      sector_t quotient = a;
+ +
+ +      sector_div(quotient, b);
+ +      *n = quotient;
+ +
+ +      return quotient * b == a;
+ +}
+ +
+ +/*
+ + * Report the component devices and the properties
+ + * of a RAID set to the kernel log.
+ + */
+ +static void rs_log(struct raid_set *rs, unsigned io_size)
+ +{
+ +      char name[BDEVNAME_SIZE];
+ +      unsigned idx = 0;
+ +
+ +      /* One line per component device, flagging the parity one. */
+ +      while (idx < rs->set.raid_devs) {
+ +              const char *tag = (idx == rs->set.pi) ? " (parity)" : "";
+ +
+ +              DMINFO("/dev/%s is raid disk %u%s",
+ +                     bdevname(rs->dev[idx].dev->bdev, name), idx, tag);
+ +              idx++;
+ +      }
+ +
+ +      /* Sizes, stripe count, xor algorithm and throughput, set type. */
+ +      DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
+ +             "algorithm \"%s\", %u chunks with %uMB/s\n"
+ +             "%s set with net %u/%u devices",
+ +             rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
+ +             atomic_read(&rs->sc.stripes),
+ +             rs->xor.f->name, rs->xor.chunks, mbpers(rs, io_size),
+ +             rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
+ +}
+ +
+ +/*
+ + * Get all devices and offsets.
+ + *
+ + * @argv holds one <dev_path> <offset> pair per component device.
+ + * For each pair the offset gets validated against the per device size
+ + * and stored, then the device is acquired via dm_get_device().
+ + *
+ + * @p is used as the loop index: on return it holds the number of devices
+ + * which have been acquired, ie. the number the caller has to put on error.
+ + *
+ + * Returns 0 on success, the dm_get_device() error, -ENXIO for a duplicate
+ + * device or the TI_ERR() result for an invalid offset.
+ + */
+ +static int dev_parms(struct raid_set *rs, char **argv, int *p)
+ +{
+ +      struct dm_target *ti = rs->ti;
+ +
+ +DMINFO("rs->set.sectors_per_dev=%llu", (unsigned long long) rs->set.sectors_per_dev);
+ +      for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
+ +              int r;
+ +              unsigned long long tmp;
+ +              struct raid_dev *dev = rs->dev + *p;
+ +
+ +              /* Get offset and device. */
+ +              if (sscanf(argv[1], "%llu", &tmp) != 1 ||
+ +                  tmp > rs->set.sectors_per_dev)
+ +                      TI_ERR("Invalid RAID device offset parameter");
+ +
+ +              dev->start = tmp;
+ +              r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+ +                                &dev->dev);
+ +              if (r)
+ +                      TI_ERR_RET("RAID device lookup failure", r);
+ +
+ +              /*
+ +               * A result other than -ENODEV below the current index is
+ +               * treated as the device having been listed before
+ +               * (presumably the index of the match - see raid_dev_lookup()).
+ +               */
+ +              r = raid_dev_lookup(rs, dev);
+ +              if (r != -ENODEV && r < *p) {
+ +                      (*p)++; /* Ensure dm_put_device() on actual device. */
+ +                      TI_ERR_RET("Duplicate RAID device", -ENXIO);
+ +              }
+ +      }
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Set recovery bandwidth (a percentage) together with
+ + * the work value derived from it (100 / bandwidth).
+ + */
+ +static void
+ +recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
+ +{
+ +      struct recover *rec = &rs->recover;
+ +
+ +      rec->bandwidth = bandwidth;
+ +      rec->bandwidth_work = 100 / bandwidth;
+ +}
+ +
+ +/*
+ + * Handle variable number of RAID parameters.
+ + *
+ + * @argv[0] holds the number of variable parameters (0-7) followed by
+ + * that many values in fixed order: chunk size, # of stripes, io size,
+ + * recovery io size, recovery bandwidth, "sync"/"nosync" and # of
+ + * recovery stripes. A numeric value of -1 requests the default.
+ + *
+ + * On success @vp holds the values to use (defaults where nothing or -1
+ + * was given) and, in the *_parm members, the values as given; *_parm
+ + * members of parameters not present on the table line are set to -1
+ + * rather than being left uninitialized.
+ + *
+ + * Returns 0 on success or the TI_ERR() result with ti->error set.
+ + */
+ +static int get_raid_variable_parms(struct dm_target *ti, char **argv,
+ +                                 struct variable_parms *vp)
+ +{
+ +      int p, value;
+ +      struct {
+ +              int action; /* -1: skip, 0: no power2 check, 1: power2 check */
+ +              char *errmsg;
+ +              int min, max;
+ +              int *var, *var2, *var3;
+ +      } argctr[] = {
+ +              { 1,
+ +                "Invalid chunk size; must be -1 or 2^^n and <= 16384",
+ +                IO_SIZE_MIN, CHUNK_SIZE_MAX,
+ +                &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
+ +              { 0,
+ +                "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
+ +                STRIPES_MIN, STRIPES_MAX,
+ +                &vp->stripes_parm, &vp->stripes, NULL },
+ +              { 1,
+ +                "Invalid io size; must -1 or >= 8, 2^^n and less equal "
+ +                "min(BIO_MAX_SECTORS/2, chunk size)",
+ +                IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
+ +                &vp->io_size_parm, &vp->io_size, NULL },
+ +              { 1,
+ +                "Invalid recovery io size; must be -1 or "
+ +                "2^^n and less equal BIO_MAX_SECTORS/2",
+ +                RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
+ +                &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
+ +              { 0,
+ +                "Invalid recovery bandwidth percentage; "
+ +                "must be -1 or > 0 and <= 100",
+ +                BANDWIDTH_MIN, BANDWIDTH_MAX,
+ +                &vp->bandwidth_parm, &vp->bandwidth, NULL },
+ +              /* Handle sync argument separately in loop. */
+ +              { -1,
+ +                "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
+ +              { 0,
+ +                "Invalid number of recovery stripes; "
+ +                "must be -1, > 0 and <= 64",
+ +                RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
+ +                &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
+ +      }, *varp;
+ +
+ +      /* Fetch # of variable raid parameters. */
+ +      if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
+ +          !range_ok(vp->raid_parms, 0, 7))
+ +              TI_ERR("Bad variable raid parameters number");
+ +
+ +      /* Preset variable RAID parameters. */
+ +      vp->chunk_size = CHUNK_SIZE_DEFAULT;
+ +      vp->io_size = IO_SIZE_DEFAULT;
+ +      vp->stripes = STRIPES_DEFAULT;
+ +      vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
+ +      vp->bandwidth = BANDWIDTH_DEFAULT;
+ +      vp->recovery = 1;
+ +      vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
+ +
+ +      /*
+ +       * Nothing given yet: record "default requested" for all of them,
+ +       * so that parameters missing on the table line don't leave
+ +       * uninitialized values behind for the caller to copy.
+ +       */
+ +      vp->chunk_size_parm = -1;
+ +      vp->stripes_parm = -1;
+ +      vp->io_size_parm = -1;
+ +      vp->recover_io_size_parm = -1;
+ +      vp->bandwidth_parm = -1;
+ +      vp->recovery_stripes_parm = -1;
+ +
+ +      /* Walk the array of argument constraints for all given ones. */
+ +      for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
+ +              BUG_ON(varp >= ARRAY_END(argctr));
+ +
+ +              /* Special case for "[no]sync" string argument. */
+ +              if (varp->action < 0) {
+ +                      if (!strcmp(*argv, "sync"))
+ +                              ;
+ +                      else if (!strcmp(*argv, "nosync"))
+ +                              vp->recovery = 0;
+ +                      else
+ +                              TI_ERR(varp->errmsg);
+ +
+ +                      argv++;
+ +                      continue;
+ +              }
+ +
+ +              /*
+ +               * Special case for io_size depending
+ +               * on previously set chunk size.
+ +               */
+ +              if (p == 2)
+ +                      varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
+ +
+ +              /* -1 is always accepted; anything else has to pass the checks. */
+ +              if (sscanf(*(argv++), "%d", &value) != 1 ||
+ +                  (value != -1 &&
+ +                   ((varp->action && !is_power_of_2(value)) ||
+ +                    !range_ok(value, varp->min, varp->max))))
+ +                      TI_ERR(varp->errmsg);
+ +
+ +              /* Keep the value as given; only explicit ones replace defaults. */
+ +              *varp->var = value;
+ +              if (value != -1) {
+ +                      if (varp->var2)
+ +                              *varp->var2 = value;
+ +                      if (varp->var3)
+ +                              *varp->var3 = value;
+ +              }
+ +      }
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Parse optional locking parameters.
+ + *
+ + * If @argv[0] is (an abbreviation of) "locking", @argv[1] selects the
+ + * locking type: "none" is accepted, "cluster" is not yet implemented
+ + * and anything else is unknown.
+ + *
+ + * On success *@locking_type is set and *@locking_parms tells the caller
+ + * how many arguments got consumed: 2 for a locking clause, 0 if there
+ + * is none (locking type "none" being the default).
+ + *
+ + * Returns 0 on success or -EINVAL for an unsupported locking type.
+ + */
+ +static int get_raid_locking_parms(struct dm_target *ti, char **argv,
+ +                                int *locking_parms,
+ +                                struct dm_raid45_locking_type **locking_type)
+ +{
+ +      if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
+ +              char *lckstr = argv[1];
+ +              size_t lcksz = strlen(lckstr);
+ +
+ +              if (!strnicmp(lckstr, "none", lcksz)) {
+ +                      *locking_type = &locking_none;
+ +                      *locking_parms = 2;
+ +                      /*
+ +                       * Done; must not fall through to the default
+ +                       * below, which would reset the count to 0.
+ +                       */
+ +                      return 0;
+ +              } else if (!strnicmp(lckstr, "cluster", lcksz)) {
+ +                      DMERR("locking type \"%s\" not yet implemented",
+ +                            lckstr);
+ +                      return -EINVAL;
+ +              } else {
+ +                      DMERR("unknown locking type \"%s\"", lckstr);
+ +                      return -EINVAL;
+ +              }
+ +      }
+ +
+ +      /* No locking clause given: default to no locking. */
+ +      *locking_parms = 0;
+ +      *locking_type = &locking_none;
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Set read ahead of the RAID set and of its component devices.
+ + *
+ + * @sectors get rounded up to pages; nothing is changed if that yields 0.
+ + * Each component device gets that many pages, the set itself
+ + * @stripes times that for each data device.
+ + */
+ +static void rs_set_read_ahead(struct raid_set *rs,
+ +                            unsigned sectors, unsigned stripes)
+ +{
+ +      unsigned pages = dm_div_up(sectors, SECTORS_PER_PAGE);
+ +      struct mapped_device *md = dm_table_get_md(rs->ti->table);
+ +      struct backing_dev_info *set_bdi =
+ +              &dm_disk(md)->queue->backing_dev_info;
+ +      unsigned dev;
+ +
+ +      if (!pages)
+ +              return;
+ +
+ +      /* The RAID set itself. */
+ +      set_bdi->ra_pages = stripes * pages * rs->set.data_devs;
+ +
+ +      /* All component devices, last one first. */
+ +      for (dev = rs->set.raid_devs; dev--; ) {
+ +              struct request_queue *queue =
+ +                      bdev_get_queue(rs->dev[dev].dev->bdev);
+ +
+ +              queue->backing_dev_info.ra_pages = pages;
+ +      }
+ +}
+ +
+ +/*
+ + * Hook rs_congested() with the RAID set as its data into the
+ + * backing device info of the mapped device's request queue.
+ + */
+ +static void rs_set_congested_fn(struct raid_set *rs)
+ +{
+ +      struct backing_dev_info *bdi;
+ +
+ +      bdi = &dm_disk(dm_table_get_md(rs->ti->table))->queue->backing_dev_info;
+ +
+ +      bdi->congested_fn = rs_congested;
+ +      bdi->congested_data = rs;
+ +}
+ +
+ +/*
+ + * Construct a RAID4/5 mapping:
+ + *
+ + * log_type #log_params <log_params> \
+ + * raid_type [#parity_dev] #raid_variable_params <raid_params> \
+ + * [locking "none"/"cluster"]
+ + * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
+ + *
+ + * log_type = "core"/"disk",
+ + * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
+ + * log_params = [dirty_log_path] region_size [[no]sync]
+ + *
+ + * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
+ + *
+ + * #parity_dev = N if raid_type = "raid4"
+ + * o N = -1: pick default = last device
+ + * o N >= 0 and < #raid_devs: parity device index
+ + *
+ + * #raid_variable_params = 0-7; raid_params (-1 = default):
+ + *   [chunk_size [#stripes [io_size [recover_io_size \
+ + *    [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
+ + *   o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
+ + *     and <= CHUNK_SIZE_MAX)
+ + *   o #stripes is number of stripes allocated to stripe cache
+ + *     (must be > 1 and < STRIPES_MAX)
+ + *   o io_size (io unit size per device in sectors; must be 2^^n and > 8)
+ + *   o recover_io_size (io unit size per device for recovery in sectors;
+ + *     must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
+ + *   o %recovery_bandwidth is the maximum amount spent for recovery during
+ + *     application io (1-100%)
+ + *   o recovery switch = [sync|nosync]
+ + *   o #recovery_stripes is the number of recovery stripes used for
+ + *     parallel recovery of the RAID set
+ + * If raid_variable_params = 0, defaults will be used.
+ + * Any raid_variable_param can be set to -1 to apply a default
+ + *
+ + * #raid_devs = N (N >= 3)
+ + *
+ + * #dev_to_initialize = N
+ + * -1: initialize parity on all devices
+ + * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
+ + * of a failed device's content after replacement
+ + *
+ + * <dev_path> = device_path (eg, /dev/sdd1)
+ + * <offset>   = begin at offset on <dev_path>
+ + *
+ + */
+ +#define       MIN_PARMS       13
+ +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+ +{
+ +      int dev_to_init, dl_parms, i, locking_parms,
+ +          parity_parm, pi = -1, r, raid_devs;
+ +      sector_t tmp, sectors_per_dev;
+ +      struct dm_raid45_locking_type *locking;
+ +      struct raid_set *rs;
+ +      struct raid_type *raid_type;
+ +      struct variable_parms parms;
+ +
+ +      /* Ensure minimum number of parameters. */
+ +      if (argc < MIN_PARMS)
+ +              TI_ERR("Not enough parameters");
+ +
+ +      /* Fetch # of dirty log parameters. */
+ +      if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
+ +          !range_ok(dl_parms, 1, 4711)) /* ;-) */
+ +              TI_ERR("Bad dirty log parameters number");
+ +
+ +      /* Check raid_type. */
+ +      raid_type = get_raid_type(argv[dl_parms + 2]);
+ +      if (!raid_type)
+ +              TI_ERR("Bad raid type");
+ +
+ +      /* In case of RAID4, parity drive is selectable. */
+ +      parity_parm = !!(raid_type->level == raid4);
+ +
+ +      /* Handle variable number of RAID parameters. */
+ +      r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
+ +                                  &parms);
+ +      if (r)
+ +              return r;
+ +
+ +      /* Handle any locking parameters. */
+ +      r = get_raid_locking_parms(ti,
+ +                                 argv + dl_parms + parity_parm +
+ +                                 parms.raid_parms + 4,
+ +                                 &locking_parms, &locking);
+ +      if (r)
+ +              return r;
+ +
+ +      /* # of raid devices. */
+ +      i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
+ +      if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
+ +          raid_devs < raid_type->minimal_devs)
+ +              TI_ERR("Invalid number of raid devices");
+ +
+ +      /* In case of RAID4, check parity drive index is in limits. */
+ +      if (raid_type->level == raid4) {
+ +              /* Fetch index of parity device. */
+ +              if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
+ +                  (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
+ +                      TI_ERR("Invalid RAID4 parity device index");
+ +      }
+ +
+ +      /*
+ +       * Index of device to initialize starts at 0
+ +       *
+ +       * o -1 -> don't initialize a selected device;
+ +       *         initialize parity conforming to algorithm
+ +       * o 0..raid_devs-1 -> initialize respective device
+ +       *   (used for reconstruction of a replaced device)
+ +       */
+ +      if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
+ +                 locking_parms + 5], "%d", &dev_to_init) != 1 ||
+ +          !range_ok(dev_to_init, -1, raid_devs - 1))
+ +              TI_ERR("Invalid number for raid device to initialize");
+ +
+ +      /* Check # of raid device arguments. */
+ +      if (argc - dl_parms - parity_parm - parms.raid_parms - 6 !=
+ +          2 * raid_devs)
+ +              TI_ERR("Wrong number of raid device/offset arguments");
+ +
+ +      /*
+ +       * Check that the table length is devisable
+ +       * w/o rest by (raid_devs - parity_devs)
+ +       */
+ +      if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
+ +                    &sectors_per_dev))
+ +              TI_ERR("Target length not divisible by number of data devices");
+ +
+ +      /*
+ +       * Check that the device size is
+ +       * devisable w/o rest by chunk size
+ +       */
+ +      if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
+ +              TI_ERR("Device length not divisible by chunk_size");
+ +
+ +      /****************************************************************
+ +       * Now that we checked the constructor arguments ->
+ +       * let's allocate the RAID set
+ +       ****************************************************************/
+ +      rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
+ +                         ti, dl_parms, argv);
+ +      if (IS_ERR(rs))
+ +              return PTR_ERR(rs);
+ +
+ +
+ +      rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
+ +      rs->set.pi = rs->set.pi_parm = pi;
+ +
+ +      /* Set RAID4 parity drive index. */
+ +      if (raid_type->level == raid4)
+ +              rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
+ +
+ +      recover_set_bandwidth(rs, parms.bandwidth);
+ +
+ +      /* Use locking type to lock stripe access. */
+ +      rs->locking = locking;
+ +
+ +      /* Get the device/offset tupels. */
+ +      argv += dl_parms + 6 + parity_parm + parms.raid_parms;
+ +      r = dev_parms(rs, argv, &i);
+ +      if (r)
+ +              goto err;
+ +
+ +      /* Set backing device information (eg. read ahead). */
+ +      rs_set_read_ahead(rs, 2 * rs->set.chunk_size /* sectors per device */,
+ +                            2 /* # of stripes */);
+ +      rs_set_congested_fn(rs); /* Set congested function. */
+ +      SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
+ +      rs->xor.speed = xor_optimize(rs); /* Select best xor algorithm. */
+ +
+ +      /* Set for recovery of any nosync regions. */
+ +      if (parms.recovery)
+ +              SetRSRecover(rs);
+ +      else {
+ +              /*
+ +               * Need to free recovery stripe(s) here in case
+ +               * of nosync, because xor_optimize uses one.
+ +               */
+ +              set_start_recovery(rs);
+ +              set_end_recovery(rs);
+ +              stripe_recover_free(rs);
+ +      }
+ +
+ +      /*
+ +       * Enable parity chunk creation enforcement for
+ +       * small numbers of array members where it doesn't
+ +       * gain us performance to xor parity out and back in as
+ +       * with larger array member numbers.
+ +       */
+ +      if (rs->set.raid_devs <= rs->set.raid_type->minimal_devs + 1)
+ +              SetRSEnforceParityCreation(rs);
+ +
+ +      /*
+ +       * Make sure that dm core only hands maximum io size
+ +       * length down and pays attention to io boundaries.
+ +       */
+ +      ti->split_io = rs->set.io_size;
+ +      ti->private = rs;
+ +
+ +      /* Initialize work queue to handle this RAID set's io. */
+ +      r = rs_workqueue_init(rs);
+ +      if (r)
+ +              goto err;
+ +
+ +      rs_log(rs, rs->recover.io_size); /* Log information about RAID set. */
+ +      return 0;
+ +
+ +err:
+ +      context_free(rs, i);
+ +      return r;
+ +}
+ +
+ +/*
+ + * Destruct a raid mapping: destroy the set's io workqueue, then
+ + * free the RAID set context, passing the full raid_devs count.
+ + */
+ +static void raid_dtr(struct dm_target *ti)
+ +{
+ +      struct raid_set *raid_set = ti->private;
+ +
+ +      destroy_workqueue(raid_set->io.wq);
+ +      context_free(raid_set, raid_set->set.raid_devs);
+ +}
+ +
+ +/*
+ + * Raid mapping function.
+ + *
+ + * Read-ahead bios are rejected with -EIO in order to not waste stripe
+ + * cache capacity. Any other bio is remapped relative to the target
+ + * start, appended to the RAID set's input list and the worker is woken
+ + * up to process it; DM_MAPIO_SUBMITTED is returned in that case.
+ + */
+ +static int raid_map(struct dm_target *ti, struct bio *bio,
+ +                  union map_info *map_context)
+ +{
+ +      struct raid_set *rs = ti->private;
+ +      int stat;
+ +
+ +      /* I don't want to waste stripe cache capacity. */
+ +      if (bio_rw(bio) == READA)
+ +              return -EIO;
+ +
+ +      /*
+ +       * Sample the io direction now: once the bio is on the input
+ +       * list and the worker got woken, the bio may be processed and
+ +       * completed at any time, so it must not be touched afterwards.
+ +       */
+ +      stat = bio_data_dir(bio) == READ ? S_BIOS_READ : S_BIOS_WRITE;
+ +
+ +      /*
+ +       * Get io reference to be waiting for to drop
+ +       * to zero on device suspension/destruction.
+ +       */
+ +      io_get(rs);
+ +      bio->bi_sector -= ti->begin;    /* Remap sector. */
+ +
+ +      /* Queue io to RAID set. */
+ +      mutex_lock(&rs->io.in_lock);
+ +      bio_list_add(&rs->io.in, bio);
+ +      mutex_unlock(&rs->io.in_lock);
+ +
+ +      /* Wake daemon to process input list. */
+ +      wake_do_raid(rs);
+ +
+ +      /* REMOVEME: statistics. */
+ +      atomic_inc(rs->stats + stat);
+ +      return DM_MAPIO_SUBMITTED;      /* Handle later. */
+ +}
+ +
+ +/*
+ + * Device suspend.
+ + *
+ + * Sets the suspend flag, stops recovery via dm_rh_stop_recovery() if
+ + * the recover flag is set, cancels the delayed do_raid work, flushes
+ + * the set's workqueue and waits for the ios in flight before calling
+ + * the dirty log's presuspend method (if the log type has one).
+ + */
+ +static void raid_presuspend(struct dm_target *ti)
+ +{
+ +      struct raid_set *rs = ti->private;
+ +      struct dm_dirty_log *dl = rs->recover.dl;
+ +
+ +      SetRSSuspend(rs);
+ +
+ +      if (RSRecover(rs))
+ +              dm_rh_stop_recovery(rs->recover.rh);
+ +
+ +      cancel_delayed_work(&rs->io.dws_do_raid);
+ +      flush_workqueue(rs->io.wq);
+ +      wait_ios(rs);   /* Wait for completion of all ios being processed. */
+ +
+ +      if (dl->type->presuspend && dl->type->presuspend(dl))
+ +              /* FIXME: need better error handling; a failure is only logged. */
+ +              DMWARN("log presuspend failed");
+ +}
+ +
+ +/* Device post-suspend: hand the event on to the dirty log. */
+ +static void raid_postsuspend(struct dm_target *ti)
+ +{
+ +      struct raid_set *rs = ti->private;
+ +      struct dm_dirty_log *log = rs->recover.dl;
+ +
+ +      /* Nothing to do for a log type without a postsuspend method. */
+ +      if (!log->type->postsuspend)
+ +              return;
+ +
+ +      /* FIXME: need better error handling. */
+ +      if (log->type->postsuspend(log))
+ +              DMWARN("log postsuspend failed");
+ +}
+ +
+ +/*
+ + * Device resume: resume the dirty log, work out how many regions
+ + * still need recovery, restart recovery and clear the suspend flag.
+ + */
+ +static void raid_resume(struct dm_target *ti)
+ +{
+ +      struct raid_set *rs = ti->private;
+ +      struct recover *recover = &rs->recover;
+ +      struct dm_dirty_log *log = recover->dl;
+ +
+ +      DMINFO("%s...", __func__);
+ +
+ +      /* Resume dirty log, if its type provides a resume method. */
+ +      if (log->type->resume) {
+ +              /* FIXME: need better error handling. */
+ +              if (log->type->resume(log))
+ +                      DMWARN("log resume failed");
+ +      }
+ +
+ +      /* All regions minus those the log reports as being in sync. */
+ +      recover->nr_regions_to_recover =
+ +              recover->nr_regions - log->type->get_sync_count(log);
+ +
+ +      /* Restart any unfinished recovery. */
+ +      if (RSRecover(rs)) {
+ +              set_start_recovery(rs);
+ +              dm_rh_start_recovery(recover->rh);
+ +      }
+ +
+ +      ClearRSSuspend(rs);
+ +}
+ +
+ +/*
+ + * Return stripe cache size, converted with to_sector().
+ + *
+ + * The value is the current number of stripes times a per-stripe
+ + * figure built from the sizes of struct stripe, struct stripe_chunk
+ + * and struct page_list plus the io size in bytes; as long as no
+ + * recovery end_jiffies got recorded, the io memory of the recovery
+ + * stripes is added as well.
+ + *
+ + * NOTE(review): as parenthesized, only to_bytes(io_size) is multiplied
+ + * by raid_devs and the recovery term sits inside the per-stripe
+ + * factor - confirm that this grouping is intended.
+ + */
+ +static unsigned sc_size(struct raid_set *rs)
+ +{
+ +      return to_sector(atomic_read(&rs->sc.stripes) *
+ +                       (sizeof(struct stripe) +
+ +                        (sizeof(struct stripe_chunk) +
+ +                         (sizeof(struct page_list) +
+ +                          to_bytes(rs->set.io_size) *
+ +                          rs->set.raid_devs)) +
+ +                        (rs->recover.end_jiffies ?
+ +                         0 : rs->recover.recovery_stripes *
+ +                         to_bytes(rs->set.raid_devs * rs->recover.io_size))));
+ +}
+ +
+ +/*
+ + * REMOVEME: status output for development.
+ + *
+ + * Emits version, xor function/chunks, bandwidth, io and stripe
+ + * counters, the statistics map, stripe cache parameters and recovery
+ + * progress with DMEMIT. The local output offset sz is seeded from
+ + * *size and written back to *size on return.
+ + */
+ +static void raid_devel_stats(struct dm_target *ti, char *result,
+ +                           unsigned *size, unsigned maxlen)
+ +{
+ +      unsigned sz = *size;
+ +      unsigned long j;
+ +      char buf[BDEVNAME_SIZE], *p;
+ +      struct stats_map *sm;
+ +      struct raid_set *rs = ti->private;
+ +      struct recover *rec = &rs->recover;
+ +      struct timespec ts;
+ +
+ +      DMEMIT("%s %s=%u bw=%u\n",
+ +             version, rs->xor.f->name, rs->xor.chunks, rs->recover.bandwidth);
+ +      DMEMIT("act_ios=%d ", io_ref(rs));
+ +      DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
+ +      DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
+ +      DMEMIT("act_stripes_max=%d\n",
+ +             atomic_read(&rs->sc.active_stripes_max));
+ +
+ +      for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
+ +              DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
+ +
+ +      DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
+ +      DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
+ +             atomic_read(&rs->sc.stripes), rs->set.io_size,
+ +             rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
+ +             sc_size(rs));
+ +
+ +      /* Recovery time so far, or total time once recovery has ended. */
+ +      j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
+ +          rec->start_jiffies;
+ +      jiffies_to_timespec(j, &ts);
+ +
+ +      /*
+ +       * Zero-pad the nanoseconds to 9 digits so that cutting the
+ +       * string two digits after the dot yields hundredths of a
+ +       * second (unpadded, 5 ms would be printed as "0.50").
+ +       */
+ +      snprintf(buf, sizeof(buf), "%ld.%09ld", ts.tv_sec, ts.tv_nsec);
+ +      p = strchr(buf, '.');
+ +      p[3] = 0;
+ +
+ +      DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
+ +             (unsigned long long) rec->nr_regions_recovered,
+ +             (unsigned long long) rec->nr_regions_to_recover,
+ +             (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
+ +
+ +      *size = sz;
+ +}
+ +
+ +/*
+ + * Status interface.
+ + *
+ + * STATUSTYPE_INFO emits (after the optional development statistics)
+ + * the number of devices, each device, one state character per device
+ + * ('A', or 'D' if the device failed; followed by 'p' at the parity
+ + * index and 'i' at the device to initialize), the dirty log's sync
+ + * count versus the number of regions and the dirty log's own status.
+ + *
+ + * STATUSTYPE_TABLE emits the dirty log's table status followed by
+ + * RAID type, RAID parameters, number of devices, device to
+ + * initialize and the device/offset pairs.
+ + *
+ + * Always returns 0.
+ + */
+ +static int raid_status(struct dm_target *ti, status_type_t type,
+ +                     char *result, unsigned maxlen)
+ +{
+ +      unsigned i, sz = 0;
+ +      char name[BDEVNAME_SIZE];
+ +      struct raid_set *rs = ti->private;
+ +      struct dm_dirty_log *log = rs->recover.dl;
+ +      /* Parameters in output order; -2 marks the sync/nosync slot. */
+ +      int raid_parms[] = {
+ +              rs->set.chunk_size_parm,
+ +              rs->sc.stripes_parm,
+ +              rs->set.io_size_parm,
+ +              rs->recover.io_size_parm,
+ +              rs->recover.bandwidth_parm,
+ +              -2,
+ +              rs->recover.recovery_stripes,
+ +      };
+ +
+ +      if (type == STATUSTYPE_INFO) {
+ +              /* REMOVEME: statistics. */
+ +              if (RSDevelStats(rs))
+ +                      raid_devel_stats(ti, result, &sz, maxlen);
+ +
+ +              DMEMIT("%u ", rs->set.raid_devs);
+ +
+ +              for (i = 0; i < rs->set.raid_devs; i++)
+ +                      DMEMIT("%s ",
+ +                             format_dev_t(name, rs->dev[i].dev->bdev->bd_dev));
+ +
+ +              DMEMIT("2 ");
+ +              for (i = 0; i < rs->set.raid_devs; i++) {
+ +                      /* Alive or dead, plus parity/init markers. */
+ +                      DMEMIT("%c", DevFailed(rs->dev + i) ? 'D' : 'A');
+ +
+ +                      if (i == rs->set.pi)
+ +                              DMEMIT("p");
+ +
+ +                      if (i == rs->set.dev_to_init)
+ +                              DMEMIT("i");
+ +              }
+ +
+ +              DMEMIT(" %llu/%llu ",
+ +                    (unsigned long long) log->type->get_sync_count(log),
+ +                    (unsigned long long) rs->recover.nr_regions);
+ +
+ +              sz += log->type->status(log, type, result+sz, maxlen-sz);
+ +      } else if (type == STATUSTYPE_TABLE) {
+ +              /* The dirty log's table status comes first. */
+ +              sz = log->type->status(log, type, result, maxlen);
+ +              DMEMIT("%s %u ", rs->set.raid_type->name, rs->set.raid_parms);
+ +
+ +              for (i = 0; i < rs->set.raid_parms; i++) {
+ +                      if (raid_parms[i] <= -2)
+ +                              DMEMIT("%s ", rs->recover.recovery ?
+ +                                            "sync" : "nosync");
+ +                      else
+ +                              DMEMIT("%d ", raid_parms[i]);
+ +              }
+ +
+ +              DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
+ +
+ +              for (i = 0; i < rs->set.raid_devs; i++)
+ +                      DMEMIT("%s %llu ",
+ +                             format_dev_t(name, rs->dev[i].dev->bdev->bd_dev),
+ +                             (unsigned long long) rs->dev[i].start);
+ +      }
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Message interface
+ + */
+ +/*
+ + * Turn a delta into an absolute value.
+ + *
+ + * action selects the operation and may be abbreviated; it is compared
+ + * over max(strlen(action), 2) characters: "set" returns r as is,
+ + * "grow" returns r + act, "shrink" returns act - r and anything
+ + * else returns -EINVAL.
+ + */
+ +static int _absolute(char *action, int act, int r)
+ +{
+ +      size_t n = strlen(action);
+ +
+ +      /* Compare at least 2 characters. */
+ +      if (n < 2)
+ +              n = 2;
+ +
+ +      if (!strncmp("set", action, n))
+ +              return r;
+ +
+ +      if (!strncmp("grow", action, n))
+ +              return r + act;
+ +
+ +      if (!strncmp("shrink", action, n))
+ +              return act - r;
+ +
+ +      return -EINVAL;
+ +}
+ +
+ +/*
+ + * Change recovery io bandwidth.
+ + *
+ + * argv[0] is the action handed to _absolute(), argv[1] the value.
+ + * Both the given value and the resulting absolute bandwidth have to
+ + * pass range_ok() against BANDWIDTH_MIN/BANDWIDTH_MAX.
+ + * Returns 0 on success and -EINVAL otherwise; flag is not used.
+ + */
+ +static int bandwidth_change(struct raid_set *rs, int argc, char **argv,
+ +                          enum raid_set_flags flag)
+ +{
+ +      int old_bw = rs->recover.bandwidth, value;
+ +
+ +      if (argc != 2)
+ +              return -EINVAL;
+ +
+ +      /* The value has to be a number passing the range check. */
+ +      if (sscanf(argv[1], "%d", &value) != 1)
+ +              return -EINVAL;
+ +
+ +      if (!range_ok(value, BANDWIDTH_MIN, BANDWIDTH_MAX))
+ +              return -EINVAL;
+ +
+ +      /* Make delta bandwidth absolute and check the range again. */
+ +      value = _absolute(argv[0], old_bw, value);
+ +      if (!range_ok(value, BANDWIDTH_MIN, BANDWIDTH_MAX))
+ +              return -EINVAL;
+ +
+ +      recover_set_bandwidth(rs, value);
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Set/reset development feature flags.
+ + *
+ + * argv[0] is compared over max(strlen(argv[0]), 2) characters:
+ + * "on"    sets the flag, -EPERM if it was set already;
+ + * "off"   clears the flag, -EPERM if it was not set;
+ + * "reset" resets the statistics for RS_DEVEL_STATS (-EPERM unless
+ + *         that flag is set) and sets the flag for any other flag.
+ + * Wrong argument count or an unknown word returns -EINVAL.
+ + */
+ +static int devel_flags(struct raid_set *rs, int argc, char **argv,
+ +                     enum raid_set_flags flag)
+ +{
+ +      size_t n;
+ +
+ +      if (argc != 1)
+ +              return -EINVAL;
+ +
+ +      n = strlen(argv[0]);
+ +      if (n < 2)
+ +              n = 2;
+ +
+ +      if (!strncmp(argv[0], "on", n))
+ +              return test_and_set_bit(flag, &rs->io.flags) ? -EPERM : 0;
+ +
+ +      if (!strncmp(argv[0], "off", n))
+ +              return test_and_clear_bit(flag, &rs->io.flags) ? 0 : -EPERM;
+ +
+ +      if (strncmp(argv[0], "reset", n))
+ +              return -EINVAL;
+ +
+ +      /* "reset" on any flag but the statistics one just sets it. */
+ +      if (flag != RS_DEVEL_STATS) {
+ +              set_bit(flag, &rs->io.flags);
+ +              return 0;
+ +      }
+ +
+ +      /* Statistics can only be reset while they are enabled. */
+ +      if (!test_bit(flag, &rs->io.flags))
+ +              return -EPERM;
+ +
+ +      stats_reset(rs);
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Resize the stripe cache.
+ + *
+ + * argv[0] is the action handed to _absolute(), argv[1] a positive
+ + * number of stripes. The new size is only recorded in stripes_to_set;
+ + * the resizing itself is left to the worker, which gets woken up.
+ + * Returns -EPERM while a previous resize is still pending, -EINVAL on
+ + * bad arguments or an unchanged size and 0 otherwise. flag is not used.
+ + */
+ +static int sc_resize(struct raid_set *rs, int argc, char **argv,
+ +                   enum raid_set_flags flag)
+ +{
+ +      int have, want;
+ +
+ +      if (argc != 2)
+ +              return -EINVAL;
+ +
+ +      /* Deny permission in case the daemon is still resizing!. */
+ +      if (atomic_read(&rs->sc.stripes_to_set))
+ +              return -EPERM;
+ +
+ +      if (sscanf(argv[1], "%d", &want) != 1 || want <= 0)
+ +              return -EINVAL;
+ +
+ +      /* Make delta stripes absolute. */
+ +      have = atomic_read(&rs->sc.stripes);
+ +      want = _absolute(argv[0], have, want);
+ +
+ +      /* Check range and that the # of stripes changes. */
+ +      if (!range_ok(want, STRIPES_MIN, STRIPES_MAX) ||
+ +          want == atomic_read(&rs->sc.stripes))
+ +              return -EINVAL;
+ +
+ +      /* We leave the resizing to the worker. */
+ +      atomic_set(&rs->sc.stripes_to_set, want);
+ +      wake_do_raid(rs);
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Change xor algorithm and number of chunks.
+ + *
+ + * argv[0] is the name of an algorithm listed in xor_funcs, argv[1]
+ + * the number of chunks, which has to pass range_ok() against
+ + * 2/XOR_CHUNKS_MAX and must not exceed the number of RAID devices.
+ + * Returns 0 on success and -EINVAL on any bad argument.
+ + * flag is not used.
+ + */
+ +static int xor_set(struct raid_set *rs, int argc, char **argv,
+ +                 enum raid_set_flags flag)
+ +{
+ +      int chunks;
+ +      unsigned io_size = 0;
+ +      struct stripe *stripe;
+ +      struct xor_func *f = ARRAY_END(xor_funcs);
+ +
+ +      if (argc != 2)
+ +              return -EINVAL;
+ +
+ +      if (sscanf(argv[1], "%d", &chunks) != 1 ||
+ +          !range_ok(chunks, 2, XOR_CHUNKS_MAX) ||
+ +          chunks > rs->set.raid_devs)
+ +              return -EINVAL;
+ +
+ +      while (f-- > xor_funcs) {
+ +              if (strcmp(argv[0], f->name))
+ +                      continue;
+ +
+ +              DMINFO("xor: %s", f->name);
+ +              if (f->f == xor_blocks_wrapper &&
+ +                  chunks > MAX_XOR_BLOCKS + 1) {
+ +                      DMERR("chunks > MAX_XOR_BLOCKS + 1");
+ +                      return -EINVAL;
+ +              }
+ +
+ +              /*
+ +               * Allocate the stripe for the speed measurement only
+ +               * after all checks passed, so that no error path
+ +               * can leak it.
+ +               */
+ +              stripe = stripe_alloc(&rs->sc, rs->sc.mem_cache_client,
+ +                                    SC_GROW);
+ +
+ +              mutex_lock(&rs->io.xor_lock);
+ +              rs->xor.f = f;
+ +              rs->xor.chunks = chunks;
+ +              rs->xor.speed = 0;
+ +              mutex_unlock(&rs->io.xor_lock);
+ +
+ +              /* Without a stripe the speed stays 0. */
+ +              if (stripe) {
+ +                      rs->xor.speed = xor_speed(stripe);
+ +                      io_size = stripe->io.size;
+ +                      stripe_free(stripe, rs->sc.mem_cache_client);
+ +              }
+ +
+ +              rs_log(rs, io_size);
+ +              return 0;
+ +      }
+ +
+ +      /* Unknown algorithm name. */
+ +      return -EINVAL;
+ +}
+ +
+ +/*
+ + * Allow writes after they got prohibited because of a device failure.
+ + *
+ + * This needs to be called after userspace updated metadata state
+ + * based on an event being thrown during device failure processing.
+ + *
+ + * Returns -EPERM if the prohibit-writes flag was not set; otherwise
+ + * the flag gets cleared, the worker is woken and 0 is returned.
+ + * argc, argv and flag are not used.
+ + */
+ +static int allow_writes(struct raid_set *rs, int argc, char **argv,
+ +                      enum raid_set_flags flag)
+ +{
+ +      if (!TestClearRSProhibitWrites(rs))
+ +              return -EPERM;
+ +
+ +      DMINFO("%s waking", __func__);
+ +      wake_do_raid(rs);
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Parse the RAID message.
+ + *
+ + * argv[0] selects the handler, the remaining words are passed on:
+ + *
+ + * 'allow_writes'
+ + * 'bandwidth {set,grow,shrink} #'
+ + * 'overwrite {on,off,reset}'
+ + * 'statistics {on,off,reset}'
+ + * 'stripe_cache {set,grow,shrink} #'
+ + * 'xor algorithm #chunks'                    # e.g. 'xor xor_8 5'
+ + *
+ + * The message name may be abbreviated: it is compared against the
+ + * names above over max(strlen(argv[0]), 3) characters.
+ + *
+ + * Returns the handler's result or -EINVAL if there is no message
+ + * or no name matches.
+ + */
+ +static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
+ +{
+ +      size_t n;
+ +      struct raid_set *rs = ti->private;
+ +      struct {
+ +              const char *name;
+ +              int (*f) (struct raid_set *rs, int argc, char **argv,
+ +                        enum raid_set_flags flag);
+ +              enum raid_set_flags flag;
+ +      } msg_descr[] = {
+ +              { "allow_writes", allow_writes, 0 },
+ +              { "bandwidth", bandwidth_change, 0 },
+ +              { "overwrite", devel_flags, RS_CHECK_OVERWRITE },
+ +              { "statistics", devel_flags, RS_DEVEL_STATS },
+ +              { "stripe_cache", sc_resize, 0 },
+ +              { "xor", xor_set, 0 },
+ +      }, *m;
+ +
+ +      if (!argc)
+ +              return -EINVAL;
+ +
+ +      /* Compare at least 3 characters of the message name. */
+ +      n = strlen(argv[0]);
+ +      if (n < 3)
+ +              n = 3;
+ +
+ +      /* Walk the table from its end and dispatch on the first match. */
+ +      for (m = ARRAY_END(msg_descr); m-- > msg_descr; ) {
+ +              if (!strncmp(argv[0], m->name, n))
+ +                      return m->f(rs, argc - 1, argv + 1, m->flag);
+ +      }
+ +
+ +      return -EINVAL;
+ +}
+ +/*
+ + * END message interface
+ + */
+ +
+ +/*
+ + * Provide io hints.
+ + *
+ + * The chunk size is handled in sectors by this target, whereas the
+ + * block layer takes minimum/optimal io sizes in bytes, hence the
+ + * conversion with to_bytes(): minimum io is one chunk, optimal io is
+ + * one chunk on each data device.
+ + */
+ +static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
+ +{
+ +      struct raid_set *rs = ti->private;
+ +      unsigned chunk_bytes = to_bytes(rs->set.chunk_size);
+ +
+ +      blk_limits_io_min(limits, chunk_bytes);
+ +      blk_limits_io_opt(limits, chunk_bytes * rs->set.data_devs);
+ +}
+ +
+ +static struct target_type raid_target = {       /* Registered by dm_raid_init(). */
+ +      .name = "raid45",
+ +      .version = {1, 0, 0},
+ +      .module = THIS_MODULE,
+ +      .ctr = raid_ctr,
+ +      .dtr = raid_dtr,                /* Destruct a raid mapping. */
+ +      .map = raid_map,                /* Queues bios to the RAID set. */
+ +      .presuspend = raid_presuspend,
+ +      .postsuspend = raid_postsuspend,
+ +      .resume = raid_resume,
+ +      .status = raid_status,
+ +      .message = raid_message,        /* Message interface. */
+ +      .io_hints = raid_io_hints,
+ +};
+ +
+ +/*
+ + * Log the outcome of target (un)registration: an error including
+ + * the result code if r is non-zero, else good_msg and the version.
+ + */
+ +static void init_exit(const char *bad_msg, const char *good_msg, int r)
+ +{
+ +      if (!r) {
+ +              DMINFO("%s %s", good_msg, version);
+ +              return;
+ +      }
+ +
+ +      DMERR("Failed to %sregister target [%d]", bad_msg, r);
+ +}
+ +
+ +/* Module init: register the target and log the outcome. */
+ +static int __init dm_raid_init(void)
+ +{
+ +      int r;
+ +
+ +      r = dm_register_target(&raid_target);
+ +      init_exit("", "initialized", r);
+ +
+ +      return r;
+ +}
+ +
+ +static void __exit dm_raid_exit(void)
+ +{
+ +      dm_unregister_target(&raid_target);
+ +      init_exit("un", "exit", 0);     /* r == 0: only logs "exit" and the version. */
+ +}
+ +
+ +/* Module hooks: init/exit entry points, then module metadata and aliases. */
+ +module_init(dm_raid_init);
+ +module_exit(dm_raid_exit);
+ +
+ +MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
+ +MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
+ +MODULE_LICENSE("GPL");
+ +MODULE_ALIAS("dm-raid4");
+ +MODULE_ALIAS("dm-raid5");
diff --cc drivers/md/dm-table.c
Simple merge
diff --cc drivers/misc/Kconfig

index 4e349cd,4e007c6..7b82347
--- 1/drivers/misc/Kconfig
--- 2/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@@ -437,7 -424,7 +424,7 @@@ config TI_DAC751
   
   config VMWARE_BALLOON
         tristate "VMware Balloon Driver"
--      depends on X86
++      depends on X86 && !XEN
         help
           This is VMware physical memory management driver which acts
           like a "balloon" that can be inflated to reclaim physical pages
diff --cc drivers/net/Kconfig

index 19f04a3,6c884ef..44c7ed8
--- 1/drivers/net/Kconfig
--- 2/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@@ -2961,9 -2954,9 +2954,9 @@@ config TILE_NE
           To compile this driver as a module, choose M here: the module
           will be called tile_net.
   
--config XEN_NETDEV_FRONTEND
++config PARAVIRT_XEN_NETDEV_FRONTEND
         tristate "Xen network device frontend driver"
--      depends on XEN
++      depends on PARAVIRT_XEN
         select XEN_XENBUS_FRONTEND
         default y
         help
@@@ -2972,15 -2965,15 +2965,15 @@@
           domain 0).
   
           The corresponding Linux backend driver is enabled by the
--        CONFIG_XEN_NETDEV_BACKEND option.
++        PARAVIRT_XEN_NETDEV_BACKEND option.
   
           If you are compiling a kernel for use as Xen guest, you
           should say Y here. To compile this driver as a module, chose
           M here: the module will be called xen-netfront.
   
--config XEN_NETDEV_BACKEND
++config PARAVIRT_XEN_NETDEV_BACKEND
         tristate "Xen backend network device"
--      depends on XEN_BACKEND
++      depends on PARAVIRT_XEN_BACKEND
         help
           This driver allows the kernel to act as a Xen network driver
           domain which exports paravirtual network devices to other
@@@ -2988,7 -2981,7 +2981,7 @@@
           system that implements a compatible front end.
   
           The corresponding Linux frontend driver is enabled by the
--        CONFIG_XEN_NETDEV_FRONTEND configuration option.
++        PARAVIRT_XEN_NETDEV_FRONTEND configuration option.
   
           The backend driver presents a standard network device
           endpoint for each paravirtual network device to the driver
@@@ -3443,7 -3436,7 +3436,7 @@@ config VIRTIO_NE
   
   config VMXNET3
         tristate "VMware VMXNET3 ethernet driver"
--      depends on PCI && INET
++      depends on PCI && INET && !XEN
         help
           This driver supports VMware's vmxnet3 virtual ethernet NIC.
           To compile this driver as a module, choose M here: the
diff --cc drivers/net/Makefile

index 776a478,e5a7375..9df17d7
--- 1/drivers/net/Makefile
--- 2/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@@ -172,8 -171,8 +171,8 @@@ obj-$(CONFIG_PPTP) += pppox.o pptp.
   obj-$(CONFIG_SLIP) += slip.o
   obj-$(CONFIG_SLHC) += slhc.o
   
--obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
--obj-$(CONFIG_XEN_NETDEV_BACKEND) += xen-netback/
++obj-$(CONFIG_PARAVIRT_XEN_NETDEV_FRONTEND) += xen-netfront.o
++obj-$(CONFIG_PARAVIRT_XEN_NETDEV_BACKEND) += xen-netback/
   
   obj-$(CONFIG_DUMMY) += dummy.o
   obj-$(CONFIG_IFB) += ifb.o
diff --cc drivers/net/cxgb3/cxgb3_main.c

index 9081ce0,9108931..ece2f2d
--- 1/drivers/net/cxgb3/cxgb3_main.c
--- 2/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@@ -1891,6 -1907,29 +1907,33 @@@ static int set_pauseparam(struct net_de
         return 0;
   }
   
+ static u32 get_rx_csum(struct net_device *dev)
+ {
+       struct port_info *p = netdev_priv(dev);
+ 
+       return p->rx_offload & T3_RX_CSUM;
+ }
+ 
+ static int set_rx_csum(struct net_device *dev, u32 data)
+ {
+       struct port_info *p = netdev_priv(dev);
+ 
+       if (data) {
+               p->rx_offload |= T3_RX_CSUM;
+       } else {
+               int i;
+ 
++#ifndef CONFIG_XEN
+               p->rx_offload &= ~(T3_RX_CSUM | T3_LRO);
++#else
++              p->rx_offload &= ~(T3_RX_CSUM);
++#endif
+               for (i = p->first_qset; i < p->first_qset + p->nqsets; i++)
+                       set_qset_lro(dev, i, 0);
+       }
+       return 0;
+ }
+ 
   static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e)
   {
         struct port_info *pi = netdev_priv(dev);
@@@ -3257,6 -3304,7 +3308,11 @@@ static int __devinit init_one(struct pc
                 adapter->port[i] = netdev;
                 pi = netdev_priv(netdev);
                 pi->adapter = adapter;
++#ifndef CONFIG_XEN
+               pi->rx_offload = T3_RX_CSUM | T3_LRO;
++#else
++              pi->rx_offload = T3_RX_CSUM;
++#endif
                 pi->port_id = i;
                 netif_carrier_off(netdev);
                 netdev->irq = pdev->irq;
diff --cc drivers/net/cxgb3/sge.c

index 3f562ba,bfa2d56..17f6181
--- 1/drivers/net/cxgb3/sge.c
--- 2/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@@ -59,11 -58,11 +58,24 @@@
    * It must be a divisor of PAGE_SIZE.  If set to 0 FL0 will use sk_buffs
    * directly.
    */
++#ifndef CONFIG_XEN
   #define FL0_PG_CHUNK_SIZE  2048
++#else
++/* Use skbuffs for XEN kernels. LRO is already disabled */
++#define FL0_PG_CHUNK_SIZE  0
++#endif
++
   #define FL0_PG_ORDER 0
   #define FL0_PG_ALLOC_SIZE (PAGE_SIZE << FL0_PG_ORDER)
++
++#ifndef CONFIG_XEN
   #define FL1_PG_CHUNK_SIZE (PAGE_SIZE > 8192 ? 16384 : 8192)
   #define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1)
++#else
++#define FL1_PG_CHUNK_SIZE 0
++#define FL1_PG_ORDER 0
++#endif
++
   #define FL1_PG_ALLOC_SIZE (PAGE_SIZE << FL1_PG_ORDER)
   
   #define SGE_RX_DROP_THRES 16
@@@ -1268,7 -1267,7 +1280,27 @@@ netdev_tx_t t3_eth_xmit(struct sk_buff 
   
         gen = q->gen;
         q->unacked += ndesc;
++#ifdef CONFIG_XEN
++      /*
++       * Some Guest OS clients get terrible performance when they have bad
++       * message size / socket send buffer space parameters.  For instance,
++       * if an application selects an 8KB message size and an 8KB send
++       * socket buffer size.  This forces the application into a single
++       * packet stop-and-go mode where it's only willing to have a single
++       * message outstanding.  The next message is only sent when the
++       * previous message is noted as having been sent.  Until we issue a
++       * kfree_skb() against the TX skb, the skb is charged against the
++       * application's send buffer space.  We only free up TX skbs when we
++       * get a TX credit return from the hardware / firmware which is fairly
++       * lazy about this.  So we request a TX WR Completion Notification on
++       * every TX descriptor in order to accelerate TX credit returns.  See
++       * also the change in handle_rsp_cntrl_info() to free up TX skb's when
++       * we receive the TX WR Completion Notifications ...
++       */
++      compl = F_WR_COMPL;
++#else
         compl = (q->unacked & 8) << (S_WR_COMPL - 3);
++#endif
         q->unacked &= 7;
         pidx = q->pidx;
         q->pidx += ndesc;
@@@ -2177,8 -2176,8 +2209,35 @@@ static inline void handle_rsp_cntrl_inf
   #endif
   
         credits = G_RSPD_TXQ0_CR(flags);
--      if (credits)
++      if (credits) {
                 qs->txq[TXQ_ETH].processed += credits;
++#ifdef CONFIG_XEN
++              /*
++               * In the normal Linux driver t3_eth_xmit() routine, we call
++               * skb_orphan() on unshared TX skb.  This results in a call to
++               * the destructor for the skb which frees up the send buffer
++               * space it was holding down.  This, in turn, allows the
++               * application to make forward progress generating more data
++               * which is important at 10Gb/s.  For Virtual Machine Guest
++               * Operating Systems this doesn't work since the send buffer
++               * space is being held down in the Virtual Machine.  Thus we
++               * need to get the TX skb's freed up as soon as possible in
++               * order to prevent applications from stalling.
++               *
++               * This code is largely copied from the corresponding code in
++               * sge_timer_tx() and should probably be kept in sync with any
++               * changes there.
++               */
++              if (__netif_tx_trylock(qs->tx_q)) {
++                      struct port_info *pi = netdev_priv(qs->netdev);
++                      struct adapter *adap = pi->adapter;
++
++                      reclaim_completed_tx(adap, &qs->txq[TXQ_ETH],
++                              TX_RECLAIM_CHUNK);
++                      __netif_tx_unlock(qs->tx_q);
++              }
++#endif
++      }
   
         credits = G_RSPD_TXQ2_CR(flags);
         if (credits)
diff --cc drivers/net/cxgb3/version.h

index 8bda06e,8bda06e..3e174a5
--- 1/drivers/net/cxgb3/version.h
--- 2/drivers/net/cxgb3/version.h
+++ b/drivers/net/cxgb3/version.h
@@@ -35,7 -35,7 +35,11 @@@
   #define DRV_DESC "Chelsio T3 Network Driver"
   #define DRV_NAME "cxgb3"
   /* Driver version */
++#ifndef CONFIG_XEN
   #define DRV_VERSION "1.1.4-ko"
++#else
++#define DRV_VERSION "1.1.4-xen-ko"
++#endif
   
   /* Firmware version */
   #define FW_VERSION_MAJOR 7
diff --cc drivers/net/ehea/ehea_main.c
Simple merge
diff --cc drivers/net/tulip/tulip_core.c
Simple merge
diff --cc drivers/net/wireless/b43/main.c
Simple merge
diff --cc drivers/net/xen-netback/Makefile

index e346e81,e346e81..e3072eb
--- 1/drivers/net/xen-netback/Makefile
--- 2/drivers/net/xen-netback/Makefile
+++ b/drivers/net/xen-netback/Makefile
@@@ -1,3 -1,3 +1,3 @@@
--obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
++obj-$(CONFIG_PARAVIRT_XEN_NETDEV_BACKEND) := xen-netback.o
   
   xen-netback-y := netback.o xenbus.o interface.o
diff --cc drivers/net/xen-netback/xenbus.c

index 1ce729d,22b8c35..604fd9f
--- 1/drivers/net/xen-netback/xenbus.c
--- 2/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@@ -476,7 -476,7 +476,6 @@@ static const struct xenbus_device_id ne
   
   static struct xenbus_driver netback = {
         .name = "vif",
--      .owner = THIS_MODULE,
         .ids = netback_ids,
         .probe = netback_probe,
         .remove = netback_remove,
diff --cc drivers/net/xen-netfront.c

index d29365a,5c8d9c3..36c03f3
--- 1/drivers/net/xen-netfront.c
--- 2/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@@ -1876,7 -1878,7 +1878,6 @@@ static int __devexit xennet_remove(stru
   
   static struct xenbus_driver netfront_driver = {
         .name = "vif",
--      .owner = THIS_MODULE,
         .ids = netfront_ids,
         .probe = netfront_probe,
         .remove = __devexit_p(xennet_remove),
diff --cc drivers/oprofile/buffer_sync.c

index a3984f4,a3984f4..6e703b6
--- 1/drivers/oprofile/buffer_sync.c
--- 2/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@@ -8,6 -8,6 +8,10 @@@
    * @author Barry Kasindorf
    * @author Robert Richter <robert.richter@amd.com>
    *
++ * Modified by Aravind Menon for Xen
++ * These modifications are:
++ * Copyright (C) 2005 Hewlett-Packard Co.
++ *
    * This is the core of the buffer management. Each
    * CPU buffer is processed and entered into the
    * global event buffer. Such processing is necessary
@@@ -43,6 -43,6 +47,10 @@@ static cpumask_var_t marked_cpus
   static DEFINE_SPINLOCK(task_mortuary);
   static void process_task_mortuary(void);
   
++#ifdef CONFIG_XEN
++static int cpu_current_domain[NR_CPUS];
++#endif
++
   /* Take ownership of the task struct and place it on the
    * list for processing. Only after two full buffer syncs
    * does the task eventually get freed, because by then
@@@ -61,7 -61,7 +69,6 @@@ task_free_notify(struct notifier_block 
         return NOTIFY_OK;
   }
   
--
   /* The task is on its way out. A sync of the buffer means we can catch
    * any remaining samples for this task.
    */
@@@ -144,6 -144,6 +151,13 @@@ static struct notifier_block module_loa
   int sync_start(void)
   {
         int err;
++#ifdef CONFIG_XEN
++      int i;
++
++      for (i = 0; i < NR_CPUS; i++) {
++              cpu_current_domain[i] = COORDINATOR_DOMAIN;
++      }
++#endif
   
         if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL))
                 return -ENOMEM;
@@@ -286,14 -286,14 +300,32 @@@ static void add_cpu_switch(int i
         last_cookie = INVALID_COOKIE;
   }
   
--static void add_kernel_ctx_switch(unsigned int in_kernel)
++static void add_cpu_mode_switch(unsigned int cpu_mode)
   {
         add_event_entry(ESCAPE_CODE);
--      if (in_kernel)
++      switch (cpu_mode) {
++      case CPU_MODE_USER:
++              add_event_entry(USER_ENTER_SWITCH_CODE);
++              break;
++      case CPU_MODE_KERNEL:
                 add_event_entry(KERNEL_ENTER_SWITCH_CODE);
--      else
--              add_event_entry(KERNEL_EXIT_SWITCH_CODE);
++              break;
++      case CPU_MODE_XEN:
++              add_event_entry(XEN_ENTER_SWITCH_CODE);
++              break;
++      default:
++              break;
++      }
++}
++
++#ifdef CONFIG_XEN
++static void add_domain_switch(unsigned long domain_id)
++{
++      add_event_entry(ESCAPE_CODE);
++      add_event_entry(DOMAIN_SWITCH_CODE);
++      add_event_entry(domain_id);
   }
++#endif
   
   static void
   add_user_ctx_switch(struct task_struct const *task, unsigned long cookie)
@@@ -373,12 -373,12 +405,12 @@@ static inline void add_sample_entry(uns
    * for later lookup from userspace. Return 0 on failure.
    */
   static int
--add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel)
++add_sample(struct mm_struct *mm, struct op_sample *s, int cpu_mode)
   {
         unsigned long cookie;
         off_t offset;
   
--      if (in_kernel) {
++      if (cpu_mode >= CPU_MODE_KERNEL) {
                 add_sample_entry(s->eip, s->event);
                 return 1;
         }
@@@ -503,7 -503,7 +535,7 @@@ void sync_buffer(int cpu
         unsigned long val;
         struct task_struct *new;
         unsigned long cookie = 0;
--      int in_kernel = 1;
++      int cpu_mode = CPU_MODE_KERNEL;
         sync_buffer_state state = sb_buffer_start;
         unsigned int i;
         unsigned long available;
@@@ -515,6 -515,6 +547,13 @@@
   
         add_cpu_switch(cpu);
   
++#ifdef CONFIG_XEN
++      /* We need to assign the first samples in this CPU buffer to the
++         same domain that we were processing at the last sync_buffer */
++      if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN)
++              add_domain_switch(cpu_current_domain[cpu]);
++#endif
++
         op_cpu_buffer_reset(cpu);
         available = op_cpu_buffer_entries(cpu);
   
@@@ -531,10 -531,10 +570,10 @@@
                         }
                         if (flags & KERNEL_CTX_SWITCH) {
                                 /* kernel/userspace switch */
--                              in_kernel = flags & IS_KERNEL;
++                              cpu_mode = flags & CPU_MODE_MASK;
                                 if (state == sb_buffer_start)
                                         state = sb_sample_start;
--                              add_kernel_ctx_switch(flags & IS_KERNEL);
++                              add_cpu_mode_switch(cpu_mode);
                         }
                         if (flags & USER_CTX_SWITCH
                             && op_cpu_buffer_get_data(&entry, &val)) {
@@@ -547,16 -547,16 +586,30 @@@
                                         cookie = get_exec_dcookie(mm);
                                 add_user_ctx_switch(new, cookie);
                         }
++#ifdef CONFIG_XEN
++                      if ((flags & DOMAIN_SWITCH)
++                          && op_cpu_buffer_get_data(&entry, &val)) {
++                              cpu_current_domain[cpu] = val;
++                              add_domain_switch(val);
++                      }
++#endif
                         if (op_cpu_buffer_get_size(&entry))
                                 add_data(&entry, mm);
                         continue;
                 }
   
++#ifdef CONFIG_XEN
++              if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN) {
++                      add_sample_entry(sample->eip, sample->event);
++                      continue;
++              }
++#endif
++
                 if (state < sb_bt_start)
                         /* ignore sample */
                         continue;
   
--              if (add_sample(mm, sample, in_kernel))
++              if (add_sample(mm, sample, cpu_mode))
                         continue;
   
                 /* ignore backtraces if failed to add a sample */
@@@ -567,6 -567,6 +620,12 @@@
         }
         release_mm(mm);
   
++#ifdef CONFIG_XEN
++      /* We reset domain to COORDINATOR at each CPU switch */
++      if (cpu_current_domain[cpu] != COORDINATOR_DOMAIN)
++              add_domain_switch(COORDINATOR_DOMAIN);
++#endif
++
         mark_done(cpu);
   
         mutex_unlock(&buffer_mutex);
diff --cc drivers/oprofile/cpu_buffer.c

index b8ef8dd,b8ef8dd..b5e539e
--- 1/drivers/oprofile/cpu_buffer.c
--- 2/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@@ -8,6 -8,6 +8,10 @@@
    * @author Barry Kasindorf <barry.kasindorf@amd.com>
    * @author Robert Richter <robert.richter@amd.com>
    *
++ * Modified by Aravind Menon for Xen
++ * These modifications are:
++ * Copyright (C) 2005 Hewlett-Packard Co.
++ *
    * Each CPU has a local buffer that stores PC value/event
    * pairs. We also log context switches when we notice them.
    * Eventually each CPU's buffer is processed into the global
@@@ -38,6 -38,6 +42,12 @@@ static void wq_sync_buffer(struct work_
   #define DEFAULT_TIMER_EXPIRE (HZ / 10)
   static int work_enabled;
   
++#ifndef CONFIG_XEN
++#define current_domain COORDINATOR_DOMAIN
++#else
++static int32_t current_domain = COORDINATOR_DOMAIN;
++#endif
++
   unsigned long oprofile_get_cpu_buffer_size(void)
   {
         return oprofile_cpu_buffer_size;
@@@ -75,7 -75,7 +85,7 @@@ int alloc_cpu_buffers(void
                 struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);
   
                 b->last_task = NULL;
--              b->last_is_kernel = -1;
++              b->last_cpu_mode = -1;
                 b->tracing = 0;
                 b->buffer_size = buffer_size;
                 b->sample_received = 0;
@@@ -180,7 -180,7 +190,7 @@@ unsigned long op_cpu_buffer_entries(in
   
   static int
   op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
--          int is_kernel, struct task_struct *task)
++          int cpu_mode, struct task_struct *task)
   {
         struct op_entry entry;
         struct op_sample *sample;
@@@ -193,16 -193,16 +203,15 @@@
                 flags |= TRACE_BEGIN;
   
         /* notice a switch from user->kernel or vice versa */
--      is_kernel = !!is_kernel;
--      if (cpu_buf->last_is_kernel != is_kernel) {
--              cpu_buf->last_is_kernel = is_kernel;
--              flags |= KERNEL_CTX_SWITCH;
--              if (is_kernel)
--                      flags |= IS_KERNEL;
++      if (cpu_buf->last_cpu_mode != cpu_mode) {
++              cpu_buf->last_cpu_mode = cpu_mode;
++              flags |= KERNEL_CTX_SWITCH | cpu_mode;
         }
   
         /* notice a task switch */
--      if (cpu_buf->last_task != task) {
++      /* if not processing other domain samples */
++      if (cpu_buf->last_task != task &&
++          current_domain == COORDINATOR_DOMAIN) {
                 cpu_buf->last_task = task;
                 flags |= USER_CTX_SWITCH;
         }
@@@ -251,14 -251,14 +260,14 @@@ op_add_sample(struct oprofile_cpu_buffe
   /*
    * This must be safe from any context.
    *
-- * is_kernel is needed because on some architectures you cannot
++ * cpu_mode is needed because on some architectures you cannot
    * tell if you are in kernel or user space simply by looking at
-- * pc. We tag this in the buffer by generating kernel enter/exit
-- * events whenever is_kernel changes
++ * pc. We tag this in the buffer by generating kernel/user (and
++ * xen) enter events whenever cpu_mode changes
    */
   static int
   log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
--         unsigned long backtrace, int is_kernel, unsigned long event,
++         unsigned long backtrace, int cpu_mode, unsigned long event,
            struct task_struct *task)
   {
         struct task_struct *tsk = task ? task : current;
@@@ -269,7 -269,7 +278,7 @@@
                 return 0;
         }
   
--      if (op_add_code(cpu_buf, backtrace, is_kernel, tsk))
++      if (op_add_code(cpu_buf, backtrace, cpu_mode, tsk))
                 goto fail;
   
         if (op_add_sample(cpu_buf, pc, event))
@@@ -416,6 -416,6 +425,20 @@@ void oprofile_add_pc(unsigned long pc, 
         log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
   }
   
++#ifdef CONFIG_XEN
++/*
++ * This is basically log_sample(b, ESCAPE_CODE, 1, cpu_mode, CPU_TRACE_BEGIN),
++ * as was previously accessible through oprofile_add_pc().
++ */
++void oprofile_add_mode(int cpu_mode)
++{
++      struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
++
++      if (op_add_code(cpu_buf, 1, cpu_mode, current))
++              cpu_buf->sample_lost_overflow++;
++}
++#endif
++
   void oprofile_add_trace(unsigned long pc)
   {
         struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
@@@ -440,6 -440,6 +463,28 @@@ fail
         return;
   }
   
++#ifdef CONFIG_XEN
++int oprofile_add_domain_switch(int32_t domain_id)
++{
++      struct op_entry entry;
++      struct op_sample *sample;
++
++      sample = op_cpu_buffer_write_reserve(&entry, 1);
++      if (!sample)
++              return 0;
++
++      sample->eip = ESCAPE_CODE;
++      sample->event = DOMAIN_SWITCH;
++
++      op_cpu_buffer_add_data(&entry, domain_id);
++      op_cpu_buffer_write_commit(&entry);
++
++      current_domain = domain_id;
++
++      return 1;
++}
++#endif
++
   /*
    * This serves to avoid cpu buffer overflow, and makes sure
    * the task mortuary progresses
diff --cc drivers/oprofile/cpu_buffer.h

index e1d097e,e1d097e..07c8976
--- 1/drivers/oprofile/cpu_buffer.h
--- 2/drivers/oprofile/cpu_buffer.h
+++ b/drivers/oprofile/cpu_buffer.h
@@@ -41,7 -41,7 +41,7 @@@ struct op_entry
   struct oprofile_cpu_buffer {
         unsigned long buffer_size;
         struct task_struct *last_task;
--      int last_is_kernel;
++      int last_cpu_mode;
         int tracing;
         unsigned long sample_received;
         unsigned long sample_lost_overflow;
@@@ -63,7 -63,7 +63,7 @@@ static inline void op_cpu_buffer_reset(
   {
         struct oprofile_cpu_buffer *cpu_buf = &per_cpu(op_cpu_buffer, cpu);
   
--      cpu_buf->last_is_kernel = -1;
++      cpu_buf->last_cpu_mode = -1;
         cpu_buf->last_task = NULL;
   }
   
@@@ -113,9 -113,9 +113,13 @@@ int op_cpu_buffer_get_data(struct op_en
   }
   
   /* extra data flags */
--#define KERNEL_CTX_SWITCH     (1UL << 0)
--#define IS_KERNEL             (1UL << 1)
++#define CPU_MODE_USER         0
++#define CPU_MODE_KERNEL               1
++#define CPU_MODE_XEN          2
++#define CPU_MODE_MASK         3
   #define TRACE_BEGIN           (1UL << 2)
   #define USER_CTX_SWITCH               (1UL << 3)
++#define KERNEL_CTX_SWITCH     (1UL << 4)
++#define DOMAIN_SWITCH         (1UL << 5)
   
   #endif /* OPROFILE_CPU_BUFFER_H */
diff --cc drivers/oprofile/event_buffer.h

index a8d5bb3,4e70749..ba3d361
--- 1/drivers/oprofile/event_buffer.h
--- 2/drivers/oprofile/event_buffer.h
+++ b/drivers/oprofile/event_buffer.h
@@@ -30,6 -30,6 +30,9 @@@ void wake_up_buffer_waiter(void)
   #define INVALID_COOKIE ~0UL
   #define NO_COOKIE 0UL
   
++/* Constant used to refer to coordinator domain (Xen) */
++#define COORDINATOR_DOMAIN -1
++
   extern const struct file_operations event_buffer_fops;
   
   /* mutex between sync_cpu_buffers() and the
diff --cc drivers/oprofile/oprof.c

index dccd863,f9bda64..096061c
--- 1/drivers/oprofile/oprof.c
--- 2/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@@ -5,6 -5,6 +5,10 @@@
    * @remark Read the file COPYING
    *
    * @author John Levon <levon@movementarian.org>
++ *
++ * Modified by Aravind Menon for Xen
++ * These modifications are:
++ * Copyright (C) 2005 Hewlett-Packard Co.
    */
   
   #include <linux/kernel.h>
@@@ -35,6 -35,6 +39,34 @@@ static DEFINE_MUTEX(start_mutex)
    */
   static int timer = 0;
   
++#ifdef CONFIG_XEN
++int oprofile_set_active(int active_domains[], unsigned int adomains)
++{
++      int err;
++
++      if (!oprofile_ops.set_active)
++              return -EINVAL;
++
++      mutex_lock(&start_mutex);
++      err = oprofile_ops.set_active(active_domains, adomains);
++      mutex_unlock(&start_mutex);
++      return err;
++}
++
++int oprofile_set_passive(int passive_domains[], unsigned int pdomains)
++{
++      int err;
++
++      if (!oprofile_ops.set_passive)
++              return -EINVAL;
++
++      mutex_lock(&start_mutex);
++      err = oprofile_ops.set_passive(passive_domains, pdomains);
++      mutex_unlock(&start_mutex);
++      return err;
++}
++#endif
++
   int oprofile_setup(void)
   {
         int err;
diff --cc drivers/oprofile/oprof.h

index 177b73d,177b73d..c30af61
--- 1/drivers/oprofile/oprof.h
--- 2/drivers/oprofile/oprof.h
+++ b/drivers/oprofile/oprof.h
@@@ -40,4 -40,4 +40,7 @@@ void oprofile_timer_exit(void)
   int oprofile_set_ulong(unsigned long *addr, unsigned long val);
   int oprofile_set_timeout(unsigned long time);
   
++int oprofile_set_active(int active_domains[], unsigned int adomains);
++int oprofile_set_passive(int passive_domains[], unsigned int pdomains);
++
   #endif /* OPROF_H */
diff --cc drivers/oprofile/oprofile_files.c

index 89f6345,89f6345..140dc4c
--- 1/drivers/oprofile/oprofile_files.c
--- 2/drivers/oprofile/oprofile_files.c
+++ b/drivers/oprofile/oprofile_files.c
@@@ -5,11 -5,11 +5,17 @@@
    * @remark Read the file COPYING
    *
    * @author John Levon <levon@movementarian.org>
++ *
++ * Modified by Aravind Menon for Xen
++ * These modifications are:
++ * Copyright (C) 2005 Hewlett-Packard Co.
    */
   
   #include <linux/fs.h>
   #include <linux/oprofile.h>
   #include <linux/jiffies.h>
++#include <asm/uaccess.h>
++#include <linux/ctype.h>
   
   #include "event_buffer.h"
   #include "oprofile_stats.h"
@@@ -174,6 -174,6 +180,200 @@@ static const struct file_operations dum
         .llseek         = noop_llseek,
   };
   
++#ifdef CONFIG_XEN
++#include <linux/slab.h>
++
++#define TMPBUFSIZE 512
++
++static unsigned int adomains = 0;
++static int active_domains[MAX_OPROF_DOMAINS + 1];
++static DEFINE_MUTEX(adom_mutex);
++
++static ssize_t adomain_write(struct file * file, char const __user * buf,
++                           size_t count, loff_t * offset)
++{
++      char *tmpbuf;
++      char *startp, *endp;
++      int i;
++      unsigned long val;
++      ssize_t retval = count;
++
++      if (*offset)
++              return -EINVAL;
++      if (count > TMPBUFSIZE - 1)
++              return -EINVAL;
++
++      if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
++              return -ENOMEM;
++
++      if (copy_from_user(tmpbuf, buf, count)) {
++              kfree(tmpbuf);
++              return -EFAULT;
++      }
++      tmpbuf[count] = 0;
++
++      mutex_lock(&adom_mutex);
++
++      startp = tmpbuf;
++      /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
++      for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
++              val = simple_strtoul(startp, &endp, 0);
++              if (endp == startp)
++                      break;
++              while (ispunct(*endp) || isspace(*endp))
++                      endp++;
++              active_domains[i] = val;
++              if (active_domains[i] != val)
++                      /* Overflow, force error below */
++                      i = MAX_OPROF_DOMAINS + 1;
++              startp = endp;
++      }
++      /* Force error on trailing junk */
++      adomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
++
++      kfree(tmpbuf);
++
++      if (adomains > MAX_OPROF_DOMAINS
++          || oprofile_set_active(active_domains, adomains)) {
++              adomains = 0;
++              retval = -EINVAL;
++      }
++
++      mutex_unlock(&adom_mutex);
++      return retval;
++}
++
++static ssize_t adomain_read(struct file * file, char __user * buf,
++                          size_t count, loff_t * offset)
++{
++      char * tmpbuf;
++      size_t len;
++      int i;
++      ssize_t retval;
++
++      if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
++              return -ENOMEM;
++
++      mutex_lock(&adom_mutex);
++
++      len = 0;
++      for (i = 0; i < adomains; i++)
++              len += snprintf(tmpbuf + len,
++                              len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
++                              "%u ", active_domains[i]);
++      WARN_ON(len > TMPBUFSIZE);
++      if (len != 0 && len <= TMPBUFSIZE)
++              tmpbuf[len-1] = '\n';
++
++      mutex_unlock(&adom_mutex);
++
++      retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
++
++      kfree(tmpbuf);
++      return retval;
++}
++
++
++static const struct file_operations active_domain_ops = {
++      .read           = adomain_read,
++      .write          = adomain_write,
++};
++
++static unsigned int pdomains = 0;
++static int passive_domains[MAX_OPROF_DOMAINS + 1]; /* +1: pdomain_write() stores one extra entry to detect too many domains */
++static DEFINE_MUTEX(pdom_mutex);
++
++static ssize_t pdomain_write(struct file * file, char const __user * buf,
++                           size_t count, loff_t * offset)
++{
++      char *tmpbuf;
++      char *startp, *endp;
++      int i;
++      unsigned long val;
++      ssize_t retval = count;
++
++      if (*offset)
++              return -EINVAL;
++      if (count > TMPBUFSIZE - 1)
++              return -EINVAL;
++
++      if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
++              return -ENOMEM;
++
++      if (copy_from_user(tmpbuf, buf, count)) {
++              kfree(tmpbuf);
++              return -EFAULT;
++      }
++      tmpbuf[count] = 0;
++
++      mutex_lock(&pdom_mutex);
++
++      startp = tmpbuf;
++      /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
++      for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
++              val = simple_strtoul(startp, &endp, 0);
++              if (endp == startp)
++                      break;
++              while (ispunct(*endp) || isspace(*endp))
++                      endp++;
++              passive_domains[i] = val;
++              if (passive_domains[i] != val)
++                      /* Overflow, force error below */
++                      i = MAX_OPROF_DOMAINS + 1;
++              startp = endp;
++      }
++      /* Force error on trailing junk */
++      pdomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
++
++      kfree(tmpbuf);
++
++      if (pdomains > MAX_OPROF_DOMAINS
++          || oprofile_set_passive(passive_domains, pdomains)) {
++              pdomains = 0;
++              retval = -EINVAL;
++      }
++
++      mutex_unlock(&pdom_mutex);
++      return retval;
++}
++
++static ssize_t pdomain_read(struct file * file, char __user * buf,
++                          size_t count, loff_t * offset)
++{
++      char * tmpbuf;
++      size_t len;
++      int i;
++      ssize_t retval;
++
++      if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
++              return -ENOMEM;
++
++      mutex_lock(&pdom_mutex);
++
++      len = 0;
++      for (i = 0; i < pdomains; i++)
++              len += snprintf(tmpbuf + len,
++                              len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
++                              "%u ", passive_domains[i]);
++      WARN_ON(len > TMPBUFSIZE);
++      if (len != 0 && len <= TMPBUFSIZE)
++              tmpbuf[len-1] = '\n';
++
++      mutex_unlock(&pdom_mutex);
++
++      retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
++
++      kfree(tmpbuf);
++      return retval;
++}
++
++static const struct file_operations passive_domain_ops = {
++      .read           = pdomain_read,
++      .write          = pdomain_write,
++};
++
++#endif /* CONFIG_XEN */
++
   void oprofile_create_files(struct super_block *sb, struct dentry *root)
   {
         /* reinitialize default values */
@@@ -184,6 -184,6 +384,10 @@@
   
         oprofilefs_create_file(sb, root, "enable", &enable_fops);
         oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666);
++#ifdef CONFIG_XEN
++      oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops);
++      oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops);
++#endif
         oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops);
         oprofilefs_create_ulong(sb, root, "buffer_size", &oprofile_buffer_size);
         oprofilefs_create_ulong(sb, root, "buffer_watershed", &oprofile_buffer_watershed);
diff --cc drivers/pci/Kconfig

index 0fa466a,0fa466a..b11433e
--- 1/drivers/pci/Kconfig
--- 2/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@@ -31,6 -31,6 +31,27 @@@ config PCI_DEBU
   
           When in doubt, say N.
   
++config PCI_GUESTDEV
++      bool "PCI Device Reservation for Passthrough"
++      depends on PCI && ACPI && XEN
++      default y
++      help
++        Say Y here if you want to reserve PCI device for passthrough.
++
++config PCI_IOMULTI
++      tristate "PCI Device IO Multiplex for Passthrough"
++      depends on PCI && ACPI && XEN
++      default y
++      help
++        Say Y here if you need io multiplexing.
++
++config PCI_RESERVE
++      bool "PCI IO/MEMORY space reserve"
++      depends on PCI && XEN_PRIVILEGED_GUEST
++      default y
++      help
++        Say Y here if you need PCI IO/MEMORY space reserve
++
   config PCI_STUB
         tristate "PCI Stub driver"
         depends on PCI
@@@ -40,9 -40,9 +61,9 @@@
   
           When in doubt, say N.
   
--config XEN_PCIDEV_FRONTEND
++config PARAVIRT_XEN_PCIDEV_FRONTEND
           tristate "Xen PCI Frontend"
--        depends on PCI && X86 && XEN
++        depends on PCI && X86 && PARAVIRT_XEN
           select HOTPLUG
           select PCI_XEN
         select XEN_XENBUS_FRONTEND
@@@ -51,9 -51,9 +72,18 @@@
             The PCI device frontend driver allows the kernel to import arbitrary
             PCI devices from a PCI backend to support PCI driver domains.
   
++config XEN_PCIDEV_FRONTEND
++      def_bool y
++      prompt "Xen PCI Frontend" if X86_64 && !XEN_UNPRIVILEGED_GUEST
++      depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
++      select HOTPLUG
++      help
++        The PCI device frontend driver allows the kernel to import arbitrary
++        PCI devices from a PCI backend to support PCI driver domains.
++
   config XEN_PCIDEV_FE_DEBUG
           bool "Xen PCI Frontend debugging"
--        depends on XEN_PCIDEV_FRONTEND && PCI_DEBUG
++        depends on XEN_PCIDEV_FRONTEND || (PARAVIRT_XEN_PCIDEV_FRONTEND && PCI_DEBUG)
         help
           Say Y here if you want the Xen PCI frontend to produce a bunch of debug
           messages to the system log.  Select this if you are having a
@@@ -65,7 -65,7 +95,7 @@@
   config HT_IRQ
         bool "Interrupts on hypertransport devices"
         default y
--      depends on PCI && X86_LOCAL_APIC && X86_IO_APIC
++      depends on PCI && X86_LOCAL_APIC && X86_IO_APIC && !XEN
         help
            This allows native hypertransport devices to use interrupts.
   
@@@ -83,7 -83,7 +113,7 @@@ config PCI_IO
   
   config PCI_IOAPIC
         bool
--      depends on PCI
++      depends on PCI && !XEN
         depends on ACPI
         depends on HOTPLUG
         default y
diff --cc drivers/pci/Makefile

index c85f744,c85f744..c077c81
--- 1/drivers/pci/Makefile
--- 2/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@@ -7,6 -7,6 +7,11 @@@ obj-y           += access.o bus.o probe.o remove
                         irq.o vpd.o
   obj-$(CONFIG_PROC_FS) += proc.o
   obj-$(CONFIG_SYSFS) += slot.o
++obj-$(CONFIG_PCI_GUESTDEV) += guestdev.o
++obj-$(CONFIG_PCI_IOMULTI) += pci-iomul.o
++iomul-$(CONFIG_PCI_IOMULTI) := iomulti.o
++obj-y += $(iomul-y) $(iomul-m)
++obj-$(CONFIG_PCI_RESERVE) += reserve.o
   
   obj-$(CONFIG_PCI_QUIRKS) += quirks.o
   
@@@ -68,6 -68,6 +73,6 @@@ obj-$(CONFIG_PCI_SYSCALL) += syscall.
   
   obj-$(CONFIG_PCI_STUB) += pci-stub.o
   
--obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
++obj-$(CONFIG_PARAVIRT_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
   
   ccflags-$(CONFIG_PCI_DEBUG) := -DDEBUG
diff --cc drivers/pci/guestdev.c

index 0000000,0000000..f15644b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/pci/guestdev.c
@@@ -1,0 -1,0 +1,881 @@@
++/*
++ * Copyright (c) 2008, 2009 NEC Corporation.
++ * Copyright (c) 2009 Isaku Yamahata
++ *                    VA Linux Systems Japan K.K.
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms and conditions of the GNU General Public License,
++ * version 2, as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope it will be useful, but WITHOUT
++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
++ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
++ * more details.
++ *
++ * You should have received a copy of the GNU General Public License along with
++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++ * Place - Suite 330, Boston, MA 02111-1307 USA.
++ */
++
++#include <linux/kernel.h>
++#include <linux/list.h>
++#include <linux/mm.h>
++#include <linux/pci.h>
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/acpi.h>
++#include <asm/setup.h>
++
++#define HID_LEN 8
++#define UID_LEN 8
++#define DEV_LEN 2
++#define FUNC_LEN 1
++#define DEV_NUM_MAX 31
++#define FUNC_NUM_MAX 7
++#define INVALID_SEG (-1)
++#define INVALID_BBN (-1)
++#define GUESTDEV_STR_MAX 128
++
++#define GUESTDEV_FLAG_TYPE_MASK 0x3
++#define GUESTDEV_FLAG_DEVICEPATH 0x1
++#define GUESTDEV_FLAG_SBDF 0x2
++
++#define GUESTDEV_OPT_IOMUL    0x1
++
++struct guestdev {
++      int flags;
++      int options;
++      struct list_head root_list;
++      union {
++              struct devicepath {
++                      char hid[HID_LEN + 1];
++                      char uid[UID_LEN + 1];
++                      int seg;
++                      int bbn;
++                      struct devicepath_node *child;
++              } devicepath;
++              struct sbdf {
++                      int seg;
++                      int bus;
++                      int dev;
++                      int func;
++              } sbdf;
++      } u;
++};
++
++struct devicepath_node {
++      int dev;
++      int func;
++      struct devicepath_node *child;
++};
++
++struct pcidev_sbdf {
++      int seg;
++      int bus;
++      struct pcidev_sbdf_node *child;
++};
++
++struct pcidev_sbdf_node {
++      int dev;
++      int func;
++      struct pcidev_sbdf_node *child;
++};
++
++static char __initdata guestdev_param[COMMAND_LINE_SIZE];
++static LIST_HEAD(guestdev_list);
++
++/* Split the leading "<hid>[:<uid>]" of str into hid and uid; returns TRUE on success, FALSE on format error */
++static int __init pci_get_hid_uid(char *str, char *hid, char *uid)
++{
++      char *sp, *ep;
++      int len;
++
++      sp = str;
++      ep = strchr(sp, ':');
++      if (!ep) {
++              ep = strchr(sp, '-');
++              if (!ep)
++                      goto format_err_end;
++      }
++      /* hid length: 1..HID_LEN characters (assumes hid holds HID_LEN + 1 bytes -- confirm at callers) */
++      len = ep - sp;
++      if (len <= 0 || HID_LEN < len)
++              goto format_err_end;
++
++      strlcpy(hid, sp, len + 1); /* size argument counts the NUL, so len + 1 keeps all len characters */
++
++      if (*ep == '-') { /* no uid */
++              uid[0] = '\0';
++              return TRUE;
++      }
++
++      sp = ep + 1;
++      ep = strchr(sp, '-');
++      if (!ep)
++              ep = strchr(sp, '\0');
++
++      /* uid length: 1..UID_LEN characters (assumes uid holds UID_LEN + 1 bytes -- confirm at callers) */
++      len = ep - sp;
++      if (len <= 0 || UID_LEN < len)
++              goto format_err_end;
++
++      strlcpy(uid, sp, len + 1); /* as for hid: size includes the terminating NUL */
++      return TRUE;
++
++format_err_end:
++      return FALSE;
++}
++
++/* Get device and function */
++static int __init pci_get_dev_func(char *str, int *dev, int *func)
++{
++      if (sscanf(str, "%02x.%01x", dev, func) != 2)
++              goto format_err_end;
++
++      if (*dev < 0 || DEV_NUM_MAX < *dev)
++              goto format_err_end;
++
++      if (*func < 0 || FUNC_NUM_MAX < *func)
++              goto format_err_end;
++
++      return TRUE;
++
++format_err_end:
++      return FALSE;
++}
++
++/* Validate use of '(', '|' and ')' in a guestdev parameter; TRUE if acceptable, FALSE (with an error log) otherwise */
++static int __init pci_check_extended_guestdev_format(char *str)
++{
++      int flg;
++      char *p;
++
++      /* Nothing to validate unless an extended-format character is present */
++      if (strpbrk(str, "(|)") == NULL)
++              return TRUE;
++
++      flg = 0;
++      p = str;
++      while (*p) {
++              switch (*p) {
++              case '(':
++                      /* Reject a '(' inside an already open group */
++                      if (flg != 0)
++                              goto format_err_end;
++                      flg = 1;
++                      /* Reject '(' at the head of the string or when the
++                         previous character is not '-'. */
++                      if (p == str || *(p - 1) != '-')
++                              goto format_err_end;
++                      break;
++              case ')':
++                      /* Reject ')' without a preceding open '(' */
++                      if (flg != 1)
++                              goto format_err_end;
++                      flg = 0;
++                      /* Reject ')' that is not the last character of the string */
++                      if (*(p + 1) != '\0')
++                              goto format_err_end;
++                      break;
++              case '|':
++                      /* Reject '|' that appears outside of '(' and ')' */
++                      if (flg != 1)
++                              goto format_err_end;
++                      break;
++              default:
++                      break;
++              }
++              p++;
++      }
++      /* Reject a '(' that was never closed by ')' */
++      if (flg != 0)
++              goto format_err_end;
++      return TRUE;
++
++format_err_end:
++      pr_err("PCI: The format of the guestdev parameter is illegal. [%s]\n",
++             str);
++      return FALSE;
++}
++
++/* Make guestdev strings */
++static void pci_make_guestdev_str(struct guestdev *gdev,
++                                      char *gdev_str, int buf_size)
++{
++      struct devicepath_node *node;
++      int count;
++
++      switch (gdev->flags & GUESTDEV_FLAG_TYPE_MASK) {
++      case GUESTDEV_FLAG_DEVICEPATH:
++              memset(gdev_str, 0, buf_size);
++
++              if (strlen(gdev->u.devicepath.uid))
++                      count = snprintf(gdev_str, buf_size, "%s:%s",
++                                              gdev->u.devicepath.hid,
++                                              gdev->u.devicepath.uid);
++              else
++                      count = snprintf(gdev_str, buf_size, "%s",
++                                               gdev->u.devicepath.hid);
++              if (count < 0)
++                      return;
++
++              node = gdev->u.devicepath.child;
++              while (node) {
++                      gdev_str += count;
++                      buf_size -= count;
++                      if (buf_size <= 0)
++                              return;
++                      count = snprintf(gdev_str, buf_size, "-%02x.%01x",
++                              node->dev, node->func);
++                      if (count < 0)
++                              return;
++                      node = node->child;
++              }
++              break;
++      case GUESTDEV_FLAG_SBDF:
++              snprintf(gdev_str, buf_size, "%04x:%02x:%02x.%01x",
++                                      gdev->u.sbdf.seg, gdev->u.sbdf.bus,
++                                      gdev->u.sbdf.dev, gdev->u.sbdf.func);
++              break;
++      default:
++              BUG();
++      }
++}
++
++/*
++ * Release @gdev: free its device-path nodes (if it is a device-path entry),
++ * unlink it from whatever list root_list is on, and free the entry itself.
++ * A NULL @gdev is accepted and ignored.
++ */
++static void __init pci_free_guestdev(struct guestdev *gdev)
++{
++      struct devicepath_node *cur, *following;
++
++      if (gdev == NULL)
++              return;
++
++      if (gdev->flags & GUESTDEV_FLAG_DEVICEPATH) {
++              for (cur = gdev->u.devicepath.child; cur; cur = following) {
++                      /* Remember the successor before cur goes away */
++                      following = cur->child;
++                      kfree(cur);
++              }
++      }
++
++      list_del(&gdev->root_list);
++      kfree(gdev);
++}
++
++/*
++ * Deep-copy a device-path guestdev: the header fields and every node of the
++ * path are duplicated.  The copy is not placed on any list.
++ * Returns the new entry, or NULL (after logging) if an allocation fails.
++ */
++struct guestdev __init *pci_copy_guestdev(struct guestdev *gdev_src)
++{
++      struct guestdev *copy;
++      struct devicepath_node *fresh, *from;
++      struct devicepath_node **link;  /* slot the next node is stored in */
++
++      BUG_ON(!(gdev_src->flags & GUESTDEV_FLAG_DEVICEPATH));
++
++      copy = kzalloc(sizeof(*copy), GFP_KERNEL);
++      if (copy == NULL)
++              goto allocate_err_end;
++
++      INIT_LIST_HEAD(&copy->root_list);
++      copy->flags = gdev_src->flags;
++      copy->options = gdev_src->options;
++      strcpy(copy->u.devicepath.hid, gdev_src->u.devicepath.hid);
++      strcpy(copy->u.devicepath.uid, gdev_src->u.devicepath.uid);
++      copy->u.devicepath.seg = gdev_src->u.devicepath.seg;
++      copy->u.devicepath.bbn = gdev_src->u.devicepath.bbn;
++
++      link = &copy->u.devicepath.child;
++      for (from = gdev_src->u.devicepath.child; from; from = from->child) {
++              fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
++              if (fresh == NULL)
++                      goto allocate_err_end;
++              fresh->dev = from->dev;
++              fresh->func = from->func;
++              /* Append at the tail so the copy keeps the source order */
++              *link = fresh;
++              link = &fresh->child;
++      }
++
++      return copy;
++
++allocate_err_end:
++      /* Also releases any nodes already linked into the partial copy */
++      if (copy)
++              pci_free_guestdev(copy);
++      pr_err("PCI: failed to allocate memory\n");
++      return NULL;
++}
++
++/*
++ * Make guestdev from path strings.
++ *
++ * @path_str has the form <hid>[:<uid>]-<dev>.<func>[-<dev>.<func>...]; the
++ * parser also recognises '(' ... ')' with '|' between alternatives, and
++ * queues one guestdev on guestdev_list per completed alternative.
++ * @options is stored in every entry that is created.
++ *
++ * gdev_org is the entry built from hid/uid plus the nodes parsed so far;
++ * gdev is the entry currently being extended (gdev_org itself or a copy).
++ *
++ * Returns 0 on success, -EINVAL if the path is malformed, -ENOMEM if an
++ * allocation fails.  Entries not queued by then are freed at "end".
++ */
++static int __init pci_make_devicepath_guestdev(char *path_str, int options)
++{
++      char hid[HID_LEN + 1], uid[UID_LEN + 1];
++      char *sp, *ep;
++      struct guestdev *gdev, *gdev_org;
++      struct devicepath_node *node, *node_tmp;
++      int dev, func, ret_val;
++
++      ret_val = 0;
++      gdev = gdev_org = NULL;
++      sp = path_str;
++      /* Look for end of hid:uid'-' */
++      ep = strchr(sp, '-');
++      /* Only hid, uid. (No dev, func) */
++      if (!ep)
++              goto format_err_end;
++
++      memset(hid, 0 ,sizeof(hid));
++      memset(uid, 0, sizeof(uid));
++      if (!pci_get_hid_uid(sp, hid, uid))
++              goto format_err_end;
++
++      gdev_org = kzalloc(sizeof(*gdev_org), GFP_KERNEL);
++      if (!gdev_org)
++              goto allocate_err_end;
++      INIT_LIST_HEAD(&gdev_org->root_list);
++      gdev_org->flags = GUESTDEV_FLAG_DEVICEPATH;
++      gdev_org->options = options;
++      strcpy(gdev_org->u.devicepath.hid, hid);
++      strcpy(gdev_org->u.devicepath.uid, uid);
++      gdev_org->u.devicepath.seg = INVALID_SEG; /* not resolved here */
++      gdev_org->u.devicepath.bbn = INVALID_BBN; /* not resolved here */
++
++      gdev = gdev_org;
++
++      sp = ep + 1; /* first character after the '-' that ends hid[:uid] */
++      ep = sp;
++      do {
++              if (*sp == '(') {
++                      sp++;
++                      if (strchr(sp, '|')) { /* a '|' follows: work on a copy */
++                              gdev = pci_copy_guestdev(gdev_org);
++                              if (!gdev) {
++                                      ret_val = -ENOMEM;
++                                      goto end;
++                              }
++                      }
++                      continue; /* jumps to the while() test; ep is unchanged */
++              }
++              if (gdev && pci_get_dev_func(sp, &dev, &func)) {
++                      node = kzalloc(sizeof(*node), GFP_KERNEL);
++                      if (!node)
++                              goto allocate_err_end;
++                      node->dev = dev;
++                      node->func = func;
++                      /* add node to end of guestdev */
++                      if (gdev->u.devicepath.child) {
++                              node_tmp = gdev->u.devicepath.child;
++                              while (node_tmp->child) {
++                                      node_tmp = node_tmp->child;
++                              }
++                              node_tmp->child = node;
++                      } else
++                              gdev->u.devicepath.child = node;
++              } else if (gdev) {
++                      pr_err("PCI: Can't obtain dev# and #func# from %s.\n",
++                             sp);
++                      ret_val = -EINVAL;
++                      if (gdev == gdev_org) /* no copy involved: give up */
++                              goto end;
++                      pci_free_guestdev(gdev); /* drop only this copy */
++                      gdev = NULL; /* parsing goes on; -EINVAL stays in ret_val */
++              }
++
++              ep = strpbrk(sp, "-|)");
++              if (!ep)
++                      ep = strchr(sp, '\0');
++              /* Is *ep '|' OR ')' OR '\0' ? */
++              if (*ep != '-') {
++                      if (gdev) /* current entry is complete: queue it */
++                              list_add_tail(&gdev->root_list, &guestdev_list);
++                      if (*ep == '|') {
++                              /* Between '|' and '|' ? */
++                              if (strchr(ep + 1, '|')) {
++                                      gdev = pci_copy_guestdev(gdev_org);
++                                      if (!gdev) {
++                                              ret_val = -ENOMEM;
++                                              goto end;
++                                      }
++                              } else { /* no further '|': use gdev_org itself */
++                                      gdev = gdev_org;
++                                      gdev_org = NULL;
++                              }
++                      } else { /* ')' or end of string: nothing left to extend */
++                              gdev_org = NULL;
++                              gdev = NULL;
++                      }
++              }
++              if (*ep == ')')
++                      ep++;
++              sp = ep + 1;
++      } while (*ep != '\0');
++
++      goto end;
++
++format_err_end:
++      pr_err("PCI: The format of the guestdev parameter is illegal. [%s]\n",
++             path_str);
++      ret_val = -EINVAL;
++      goto end;
++
++allocate_err_end:
++      pr_err("PCI: failed to allocate memory\n");
++      ret_val = -ENOMEM;
++      goto end;
++
++end: /* free whatever is still referenced here, i.e. was not queued */
++      if (gdev_org && (gdev_org != gdev))
++              pci_free_guestdev(gdev_org);
++      if (gdev)
++              pci_free_guestdev(gdev);
++      return ret_val;
++}
++
++/*
++ * Build a guestdev from a "[<segment>:]<bus>:<dev>.<func>" string (all
++ * fields hexadecimal) and append it to guestdev_list.  A missing segment
++ * defaults to 0.
++ * Returns 0 on success, -EINVAL if @str has neither form, -ENOMEM if the
++ * entry cannot be allocated.
++ */
++static int __init pci_make_sbdf_guestdev(char* str, int options)
++{
++      struct guestdev *entry;
++      int seg, bus, dev, func;
++
++      if (sscanf(str, "%x:%x:%x.%x", &seg, &bus, &dev, &func) != 4) {
++              /* Retry without a segment field */
++              if (sscanf(str, "%x:%x.%x", &bus, &dev, &func) != 3)
++                      return -EINVAL;
++              seg = 0;
++      }
++
++      entry = kmalloc(sizeof(*entry), GFP_KERNEL);
++      if (entry == NULL) {
++              pr_err("PCI: failed to allocate memory\n");
++              return -ENOMEM;
++      }
++
++      INIT_LIST_HEAD(&entry->root_list);
++      entry->flags = GUESTDEV_FLAG_SBDF;
++      entry->options = options;
++      entry->u.sbdf.seg = seg;
++      entry->u.sbdf.bus = bus;
++      entry->u.sbdf.dev = dev;
++      entry->u.sbdf.func = func;
++      list_add_tail(&entry->root_list, &guestdev_list);
++      return 0;
++}
++
++/*
++ * Parse the option list that trails a guestdev entry.
++ *
++ * @str points at the leading '+' of a "+opt[+opt...]" list.  Each
++ * '+'-separated token is compared against the known option names and the
++ * matching GUESTDEV_OPT_* bits are OR-ed into the result; unknown tokens
++ * are ignored.  The input string is not modified.
++ */
++static int __init pci_parse_options(const char *str)
++{
++      int options = 0;
++      const char *ep;
++      size_t len;
++
++      while (str) {
++              str++;          /* step over the '+' that introduces the token */
++              ep = strchr(str, '+');
++              /*
++               * Bound the token by its length instead of writing a
++               * terminator: the string is const, and every token (not
++               * only the last one) has to be examined.
++               */
++              len = ep ? (size_t)(ep - str) : strlen(str);
++
++              if (len == strlen("iomul") && !strncmp(str, "iomul", len))
++                      options |= GUESTDEV_OPT_IOMUL;
++
++              str = ep;       /* NULL once the last token has been handled */
++      }
++      return options;
++}
++
++/*
++ * Parse the saved guestdev= string.  Entries are separated by ','; each one
++ * may carry "+option" suffixes.  An entry is first tried as an sbdf and, if
++ * that is rejected as malformed, as a device path.  guestdev_param is
++ * modified in place (separators are overwritten with NUL).  Parsing stops at
++ * the first error other than -EINVAL.  Every queued entry is then logged.
++ * Always returns 0.
++ */
++static int __init pci_parse_guestdev(void)
++{
++      char *sp, *ep, *op;
++      int options;
++      struct guestdev *gdev;
++      char path_str[GUESTDEV_STR_MAX];
++      int ret_val = 0;
++
++      if (guestdev_param[0] == '\0')
++              return 0;
++
++      for (sp = guestdev_param; sp; sp = ep ? ep + 1 : NULL) {
++              /* Chop the current entry off at the next ',' */
++              ep = strchr(sp, ',');
++              if (ep)
++                      *ep = '\0';
++
++              options = 0;
++              op = strchr(sp, '+');
++              if (op && (!ep || op < ep)) {
++                      options = pci_parse_options(op);
++                      *op = '\0';     /* Chop */
++              }
++
++              ret_val = pci_make_sbdf_guestdev(sp, options);
++              if (ret_val == -EINVAL) {
++                      /* Not an sbdf; see whether it is a device path */
++                      if (!pci_check_extended_guestdev_format(sp))
++                              continue;
++                      ret_val = pci_make_devicepath_guestdev(sp, options);
++                      if (ret_val && ret_val != -EINVAL)
++                              break;
++              } else if (ret_val) {
++                      break;
++              }
++      }
++
++      list_for_each_entry(gdev, &guestdev_list, root_list) {
++              pci_make_guestdev_str(gdev, path_str, GUESTDEV_STR_MAX);
++              printk(KERN_DEBUG
++                      "PCI: %s has been reserved for guest domain.\n",
++                      path_str);
++      }
++      return 0;
++}
++
++arch_initcall(pci_parse_guestdev);
++
++/*
++ * "guestdev=" handler: stash the raw value in guestdev_param.
++ * Returns 1 when the value was stored, 0 when it is COMMAND_LINE_SIZE
++ * characters or longer and was therefore not stored.
++ */
++static int __init pci_guestdev_setup(char *str)
++{
++      if (strlen(str) < COMMAND_LINE_SIZE) {
++              strlcpy(guestdev_param, str, sizeof(guestdev_param));
++              return 1;
++      }
++      return 0;
++}
++
++__setup("guestdev=", pci_guestdev_setup);
++
++/*
++ * Free sbdf and nodes.
++ *
++ * Releases every node chained off @sbdf and clears sbdf->child, so calling
++ * this again on the same sbdf is harmless.  @sbdf itself is not freed.
++ */
++static void pci_free_sbdf(struct pcidev_sbdf *sbdf)
++{
++      struct pcidev_sbdf_node *node, *next;
++
++      node = sbdf->child;
++      /*
++       * Detach the chain before freeing it: pci_get_sbdf_from_pcidev()
++       * frees on its error path and __pci_is_guestdev() then frees the
++       * same sbdf again on exit, which must not touch released nodes.
++       */
++      sbdf->child = NULL;
++      while (node) {
++              next = node->child;
++              kfree(node);
++              node = next;
++      }
++      /* Skip kfree(sbdf) */
++}
++
++/* Does PCI device belong to sub tree specified by guestdev with device path? */
++typedef int (*pci_node_match_t)(const struct devicepath_node *gdev_node,
++                              const struct pcidev_sbdf_node *sbdf_node,
++                              int options);
++
++/*
++ * Default node comparison: the two nodes match when both the device and the
++ * function number are equal.  The options argument is ignored.
++ */
++static int pci_node_match(const struct devicepath_node *gdev_node,
++                        const struct pcidev_sbdf_node *sbdf_node,
++                        int options_unused)
++{
++      if (gdev_node->dev != sbdf_node->dev)
++              return 0;
++      return gdev_node->func == sbdf_node->func;
++}
++
++/*
++ * Check whether the device described by @sbdf lies at or below the device
++ * path held in @gdev.  The segment and bus of @gdev are looked up through
++ * acpi_pci_get_root_seg_bbn() and cached in @gdev if still unset.  Each node
++ * of the guestdev path is then compared, in order, with the corresponding
++ * node of @sbdf using @match.
++ * Returns TRUE on a match, FALSE otherwise (including NULL arguments).
++ */
++static int pci_is_in_devicepath_sub_tree(struct guestdev *gdev,
++                                       struct pcidev_sbdf *sbdf,
++                                       pci_node_match_t match)
++{
++      int seg, bbn;
++      struct devicepath_node *want;
++      struct pcidev_sbdf_node *have;
++
++      if (gdev == NULL || sbdf == NULL)
++              return FALSE;
++
++      BUG_ON(!(gdev->flags & GUESTDEV_FLAG_DEVICEPATH));
++
++      /* Resolve seg and bbn on first use */
++      if (gdev->u.devicepath.seg == INVALID_SEG ||
++          gdev->u.devicepath.bbn == INVALID_BBN) {
++              if (!acpi_pci_get_root_seg_bbn(gdev->u.devicepath.hid,
++                                             gdev->u.devicepath.uid,
++                                             &seg, &bbn))
++                      return FALSE;
++              gdev->u.devicepath.seg = seg;
++              gdev->u.devicepath.bbn = bbn;
++      }
++
++      if (gdev->u.devicepath.seg != sbdf->seg ||
++          gdev->u.devicepath.bbn != sbdf->bus)
++              return FALSE;
++
++      /* Walk both chains in step; the guestdev path must be a prefix */
++      for (want = gdev->u.devicepath.child, have = sbdf->child;
++           want;
++           want = want->child, have = have->child) {
++              if (have == NULL)
++                      return FALSE;
++              if (!match(want, have, gdev->options))
++                      return FALSE;
++      }
++      return TRUE;
++}
++
++/*
++ * Fill @sbdf with the path of @dev: one node per device on the way from
++ * @dev up through its parent bridges, stored so that the topmost device
++ * comes first, plus the segment and bus number parsed from the name of
++ * that topmost device.
++ * Returns TRUE on success.  On failure the nodes built so far are freed
++ * and FALSE is returned.
++ */
++static int pci_get_sbdf_from_pcidev(
++      struct pci_dev *dev, struct pcidev_sbdf *sbdf)
++{
++      struct pcidev_sbdf_node *entry;
++      struct pci_dev *bridge;
++
++      if (dev == NULL)
++              return FALSE;
++
++      do {
++              entry = kzalloc(sizeof(*entry), GFP_KERNEL);
++              if (entry == NULL) {
++                      pr_err("PCI: failed to allocate memory\n");
++                      goto err_end;
++              }
++              entry->dev = PCI_SLOT(dev->devfn);
++              entry->func = PCI_FUNC(dev->devfn);
++
++              /* Push at the head: ancestors end up before descendants */
++              entry->child = sbdf->child;
++              sbdf->child = entry;
++
++              if (dev->bus == NULL)
++                      goto err_end;
++              bridge = dev->bus->self;
++              if (bridge != NULL)
++                      dev = bridge;
++      } while (bridge != NULL);
++
++      if (sscanf(dev_name(&dev->dev), "%04x:%02x", &sbdf->seg, &sbdf->bus) != 2)
++              goto err_end;
++      return TRUE;
++
++err_end:
++      pci_free_sbdf(sbdf);
++      return FALSE;
++}
++
++/* Does PCI device belong to sub tree specified by guestdev with sbdf? */
++typedef int (*pci_sbdf_match_t)(const struct guestdev *gdev,
++                              const  struct pci_dev *dev);
++
++/*
++ * Default sbdf comparison: @dev matches when the segment and bus parsed
++ * from its device name and its slot and function numbers all equal the
++ * values stored in @gdev.  Returns FALSE if the name cannot be parsed.
++ */
++static int pci_sbdf_match(const struct guestdev *gdev,
++                        const struct pci_dev *dev)
++{
++      int seg, bus;
++
++      if (sscanf(dev_name(&dev->dev), "%04x:%02x", &seg, &bus) != 2)
++              return FALSE;
++
++      if (gdev->u.sbdf.seg != seg || gdev->u.sbdf.bus != bus)
++              return 0;
++      if (gdev->u.sbdf.dev != PCI_SLOT(dev->devfn))
++              return 0;
++      return gdev->u.sbdf.func == PCI_FUNC(dev->devfn);
++}
++
++/*
++ * Check @dev and then each of its parent bridges in turn against the sbdf
++ * entry @gdev using @match.  Returns TRUE as soon as one of them matches,
++ * FALSE once the top of the hierarchy is reached without a match.
++ */
++static int pci_is_in_sbdf_sub_tree(struct guestdev *gdev, struct pci_dev *dev,
++                                 pci_sbdf_match_t match)
++{
++      BUG_ON(!(gdev->flags & GUESTDEV_FLAG_SBDF));
++
++      while (!match(gdev, dev)) {
++              /* No parent bridge left to climb to */
++              if (!dev->bus || !dev->bus->self)
++                      return FALSE;
++              dev = dev->bus->self;
++      }
++      return TRUE;
++}
++
++/*
++ * Does PCI device belong to sub tree specified by guestdev parameter?
++ *
++ * Every entry on guestdev_list is tested against @dev: device-path entries
++ * through @node_match, sbdf entries through @sbdf_match.  The path of @dev
++ * is only built when the first device-path entry is encountered and is
++ * released before returning.
++ * Returns TRUE on the first match, FALSE otherwise.
++ */
++static int __pci_is_guestdev(struct pci_dev *dev, pci_node_match_t node_match,
++                           pci_sbdf_match_t sbdf_match)
++{
++      struct pcidev_sbdf path_buf, *path = NULL;
++      struct guestdev *entry;
++      int matched = FALSE;
++
++      if (dev == NULL)
++              return FALSE;
++
++      list_for_each_entry(entry, &guestdev_list, root_list) {
++              switch (entry->flags & GUESTDEV_FLAG_TYPE_MASK) {
++              case GUESTDEV_FLAG_SBDF:
++                      if (pci_is_in_sbdf_sub_tree(entry, dev, sbdf_match)) {
++                              matched = TRUE;
++                              goto done;
++                      }
++                      break;
++              case GUESTDEV_FLAG_DEVICEPATH:
++                      if (path == NULL) {
++                              path = &path_buf;
++                              memset(path, 0, sizeof(*path));
++                              if (!pci_get_sbdf_from_pcidev(dev, path))
++                                      goto done;
++                      }
++                      if (pci_is_in_devicepath_sub_tree(entry, path,
++                                                        node_match)) {
++                              matched = TRUE;
++                              goto done;
++                      }
++                      break;
++              default:
++                      BUG();
++              }
++      }
++done:
++      if (path != NULL)
++              pci_free_sbdf(path);
++      return matched;
++}
++
++/*
++ * Public query: is @dev covered by a guestdev entry?  Uses the exact
++ * device/function and seg/bus/dev/func comparisons.
++ */
++int pci_is_guestdev(struct pci_dev *dev)
++{
++      int covered = __pci_is_guestdev(dev, pci_node_match, pci_sbdf_match);
++
++      return covered;
++}
++EXPORT_SYMBOL_GPL(pci_is_guestdev);
++
++static int reassign_resources;
++
++/*
++ * "reassign_resources" handler.
++ *
++ * Sets reassign_resources to -1 when the value is "all" and to 1 otherwise
++ * (including when no value is given).  Always returns 1.
++ */
++static int __init pci_set_reassign_resources(char *str)
++{
++      /*
++       * The __setup() key has no trailing '=', so the handler receives
++       * whatever follows the key verbatim: "reassign_resources=all"
++       * arrives as "=all".  Step over the '=' so that "all" is recognised.
++       */
++      if (str && *str == '=')
++              str++;
++
++      if (str && !strcmp(str, "all"))
++              reassign_resources = -1;
++      else
++              reassign_resources = 1;
++
++      return 1;
++}
++__setup("reassign_resources", pci_set_reassign_resources);
++
++/*
++ * Should the resources of @dev be reassigned?
++ * reassign_resources == 0: never; < 0: always; > 0: only when @dev is a
++ * guestdev.
++ */
++int pci_is_guestdev_to_reassign(struct pci_dev *dev)
++{
++      if (reassign_resources == 0)
++              return FALSE;
++      if (reassign_resources < 0)
++              return TRUE;
++      return pci_is_guestdev(dev);
++}
++
++#if defined(CONFIG_PCI_IOMULTI) || defined(CONFIG_PCI_IOMULTI_MODULE)
++/*
++ * Node comparison for IO multiplexing.  Only entries carrying
++ * GUESTDEV_OPT_IOMUL can match.  Nodes that both have a child must agree
++ * on device and function; nodes that are both last in their chain only
++ * need the same device number.  A last node never matches a non-last one.
++ */
++static int pci_iomul_node_match(const struct devicepath_node *gdev_node,
++                              const struct pcidev_sbdf_node *sbdf_node,
++                              int options)
++{
++      if (!(options & GUESTDEV_OPT_IOMUL))
++              return 0;
++      if (gdev_node->dev != sbdf_node->dev)
++              return 0;
++
++      if (gdev_node->child != NULL && sbdf_node->child != NULL)
++              return gdev_node->func == sbdf_node->func;
++
++      return gdev_node->child == NULL && sbdf_node->child == NULL;
++}
++
++/*
++ * sbdf comparison for IO multiplexing: @gdev must carry GUESTDEV_OPT_IOMUL
++ * and agree with @dev on segment, bus and device number; the function
++ * number is not compared.  Returns FALSE if the device name cannot be
++ * parsed.
++ */
++static int pci_iomul_sbdf_match(const struct guestdev *gdev,
++                              const struct pci_dev *dev)
++{
++      int seg, bus;
++
++      if (sscanf(dev_name(&dev->dev), "%04x:%02x", &seg, &bus) != 2)
++              return FALSE;
++
++      if (!(gdev->options & GUESTDEV_OPT_IOMUL))
++              return 0;
++      if (gdev->u.sbdf.seg != seg || gdev->u.sbdf.bus != bus)
++              return 0;
++      return gdev->u.sbdf.dev == PCI_SLOT(dev->devfn);
++}
++
++/*
++ * Is @dev covered by a guestdev entry that carries the iomul option?
++ * Uses the IO-multiplexing variants of the node and sbdf comparisons.
++ */
++int pci_is_iomuldev(struct pci_dev *dev)
++{
++      int covered = __pci_is_guestdev(dev, pci_iomul_node_match,
++                                      pci_iomul_sbdf_match);
++
++      return covered;
++}
++#endif /* CONFIG_PCI_IOMULTI */
++
++/*
++ * Check whether the devicepath exists under the pci root bus.
++ * Starting at @bus, each node of the path is looked up by slot and
++ * function; the search then continues on that device's subordinate bus.
++ * Returns TRUE when every node was found, FALSE otherwise.
++ */
++static int __init pci_check_devicepath_exists(
++              struct guestdev *gdev, struct pci_bus *bus)
++{
++      struct devicepath_node *hop;
++      struct pci_dev *found;
++
++      BUG_ON(!(gdev->flags & GUESTDEV_FLAG_DEVICEPATH));
++
++      for (hop = gdev->u.devicepath.child; hop; hop = hop->child) {
++              if (bus == NULL)
++                      return FALSE;
++              found = pci_get_slot(bus, PCI_DEVFN(hop->dev, hop->func));
++              if (found == NULL)
++                      return FALSE;
++              /* Descend, then drop the reference pci_get_slot() took */
++              bus = found->subordinate;
++              pci_dev_put(found);
++      }
++      return TRUE;
++}
++
++/*
++ * Check whether the guestdev exists in the PCI device tree.
++ * Every entry on guestdev_list is looked up; entries that cannot be found
++ * are reported with pr_info().  For device-path entries the segment and bus
++ * are resolved and cached first if still unset.  Always returns 0.
++ */
++static int __init pci_check_guestdev_exists(void)
++{
++      struct guestdev *gdev;
++      int seg, bbn;
++      int present;
++      struct pci_bus *bus;
++      struct pci_dev *dev;
++      char path_str[GUESTDEV_STR_MAX];
++
++      list_for_each_entry(gdev, &guestdev_list, root_list) {
++              present = FALSE;
++
++              switch (gdev->flags & GUESTDEV_FLAG_TYPE_MASK) {
++              case GUESTDEV_FLAG_DEVICEPATH:
++                      if (gdev->u.devicepath.seg == INVALID_SEG ||
++                          gdev->u.devicepath.bbn == INVALID_BBN) {
++                              /* Unresolvable root: report as missing */
++                              if (!acpi_pci_get_root_seg_bbn(
++                                      gdev->u.devicepath.hid,
++                                      gdev->u.devicepath.uid, &seg, &bbn))
++                                      break;
++                              gdev->u.devicepath.seg = seg;
++                              gdev->u.devicepath.bbn = bbn;
++                      }
++
++                      bus = pci_find_bus(gdev->u.devicepath.seg,
++                                         gdev->u.devicepath.bbn);
++                      if (bus && pci_check_devicepath_exists(gdev, bus))
++                              present = TRUE;
++                      break;
++              case GUESTDEV_FLAG_SBDF:
++                      bus = pci_find_bus(gdev->u.sbdf.seg, gdev->u.sbdf.bus);
++                      if (bus) {
++                              dev = pci_get_slot(bus,
++                                      PCI_DEVFN(gdev->u.sbdf.dev,
++                                                gdev->u.sbdf.func));
++                              if (dev) {
++                                      pci_dev_put(dev);
++                                      present = TRUE;
++                              }
++                      }
++                      break;
++              default:
++                      BUG();
++              }
++
++              if (!present) {
++                      pci_make_guestdev_str(gdev, path_str, GUESTDEV_STR_MAX);
++                      pr_info("PCI: device %s does not exist\n", path_str);
++              }
++      }
++      return 0;
++}
++
++fs_initcall(pci_check_guestdev_exists);
++
diff --cc drivers/pci/iomulti.c

index 0000000,0000000..7e11492

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/pci/iomulti.c
@@@ -1,0 -1,0 +1,897 @@@
++/*
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
++ *
++ * Copyright (c) 2009 Isaku Yamahata
++ *                    VA Linux Systems Japan K.K.
++ */
++
++#include "iomulti.h"
++#include "pci.h"
++#include <linux/module.h>
++#include <linux/sort.h>
++#include <asm/setup.h>
++
++#define PCI_BUS_MAX           255
++#define PCI_DEV_MAX           31
++
++/*
++ * Length of resource @r in bytes; a resource whose start and end are both
++ * zero counts as empty.  (see pci_resource_len)
++ */
++static inline resource_size_t pci_iomul_len(const struct resource* r)
++{
++      if (r->start == 0 && r->end == 0)
++              return 0;
++      return r->end - r->start + 1;
++}
++
++#define ROUND_UP(x, a)                (((x) + (a) - 1) & ~((a) - 1))
++/*
++ * Sum the sizes of all I/O port resources of @pdev and round the total up
++ * to a multiple of 4096.  Resources smaller than 0x400 bytes are totalled
++ * separately and, when ISA or EISA support is configured, that subtotal is
++ * expanded before being added in.
++ * (stolen from pbus_size_io())
++ */
++static unsigned long pdev_size_io(struct pci_dev *pdev)
++{
++      unsigned long small_total = 0;  /* resources below 0x400 bytes */
++      unsigned long large_total = 0;  /* everything else */
++      struct resource *r;
++      unsigned long span;
++      int idx;
++
++      for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) {
++              r = &pdev->resource[idx];
++              if (r->flags & IORESOURCE_IO) {
++                      span = r->end - r->start + 1;
++                      if (span >= 0x400)
++                              large_total += span;
++                      else
++                              /* Might be re-aligned for ISA */
++                              small_total += span;
++              }
++      }
++
++/* To be fixed in 2.5: we should have sort of HAVE_ISA
++   flag in the struct pci_bus. */
++#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
++      small_total = (small_total & 0xff) + ((small_total & ~0xffUL) << 2);
++#endif
++      return ROUND_UP(small_total + large_total, 4096);
++}
++
++/*
++ * Bus number that identifies the switch this device sits on: for a device
++ * with a PCI Express capability it is the primary number of the device's
++ * bus, for any other device the number of the bus itself.
++ * i.e. the primary bus number of PCI-PCI bridge of downstream port
++ *      or root port in switch.
++ *      the secondary bus number of PCI-PCI bridge of upstream port
++ *      in switch.
++ */
++static inline unsigned char pci_dev_switch_busnr(struct pci_dev *pdev)
++{
++      return pci_find_capability(pdev, PCI_CAP_ID_EXP) ?
++              pdev->bus->primary : pdev->bus->number;
++}
++
++static LIST_HEAD(switch_list);
++static DEFINE_MUTEX(switch_list_lock);
++
++/*****************************************************************************/
++/*
++ * Non-zero when @sw has a usable I/O window, i.e. io_base is non-zero and
++ * does not exceed io_limit.
++ */
++int pci_iomul_switch_io_allocated(const struct pci_iomul_switch *sw)
++{
++      return sw->io_base != 0 && sw->io_base <= sw->io_limit;
++}
++EXPORT_SYMBOL_GPL(pci_iomul_switch_io_allocated);
++
++/*
++ * Look up the switch registered for @segment/@bus on switch_list.
++ * Caller must hold switch_list_lock.  Returns NULL if there is none.
++ */
++static struct pci_iomul_switch *pci_iomul_find_switch_locked(int segment,
++                                                           uint8_t bus)
++{
++      struct pci_iomul_switch *cand;
++
++      BUG_ON(!mutex_is_locked(&switch_list_lock));
++      list_for_each_entry(cand, &switch_list, list) {
++              if (cand->segment != segment)
++                      continue;
++              if (cand->bus == bus)
++                      return cand;
++      }
++      return NULL;
++}
++
++/*
++ * Look up the slot for @busnr/@dev among the slots of @sw.
++ * Caller must hold sw->lock.  Returns NULL if there is none.
++ */
++static struct pci_iomul_slot *pci_iomul_find_slot_locked(
++      struct pci_iomul_switch *sw, uint8_t busnr, uint8_t dev)
++{
++      struct pci_iomul_slot *cand;
++
++      BUG_ON(!mutex_is_locked(&sw->lock));
++      list_for_each_entry(cand, &sw->slots, sibling) {
++              if (cand->bus != busnr)
++                      continue;
++              if (cand->dev == dev)
++                      return cand;
++      }
++      return NULL;
++}
++
++/*
++ * Find the switch and slot that @pdev belongs to.
++ * On successful exit *swp and *slot are set, sw->lock is held so the slot
++ * can be used, and the reference count of the switch has been incremented.
++ * If either the switch or the slot is not found, both *swp and *slot are
++ * set to NULL and no lock is left held.
++ */
++void pci_iomul_get_lock_switch(struct pci_dev *pdev,
++                             struct pci_iomul_switch **swp,
++                             struct pci_iomul_slot **slot)
++{
++      struct pci_iomul_switch *sw;
++      struct pci_iomul_slot *found = NULL;
++
++      mutex_lock(&switch_list_lock);
++
++      sw = pci_iomul_find_switch_locked(pci_domain_nr(pdev->bus),
++                                        pci_dev_switch_busnr(pdev));
++      if (sw != NULL) {
++              mutex_lock(&sw->lock);
++              found = pci_iomul_find_slot_locked(sw, pdev->bus->number,
++                                                 PCI_SLOT(pdev->devfn));
++              if (found != NULL) {
++                      pci_iomul_switch_get(sw);
++              } else {
++                      /* No slot: report nothing and release the switch */
++                      mutex_unlock(&sw->lock);
++                      sw = NULL;
++              }
++      }
++
++      *swp = sw;
++      *slot = found;
++      mutex_unlock(&switch_list_lock);
++}
++EXPORT_SYMBOL_GPL(pci_iomul_get_lock_switch);
++
++/*
++ * Allocate and initialise a switch descriptor for @segment/@bus.
++ * Caller must hold switch_list_lock.  The new switch is not added to
++ * switch_list here.
++ * Returns the new switch, or NULL if the allocation fails.
++ */
++static struct pci_iomul_switch *pci_iomul_switch_alloc(int segment,
++                                                     uint8_t bus)
++{
++      struct pci_iomul_switch *sw;
++
++      BUG_ON(!mutex_is_locked(&switch_list_lock));
++
++      sw = kmalloc(sizeof(*sw), GFP_KERNEL);
++      /* Do not initialise through a NULL pointer when out of memory */
++      if (sw == NULL)
++              return NULL;
++
++      mutex_init(&sw->lock);
++      kref_init(&sw->kref);
++      sw->io_region = NULL;
++      sw->count = 0;
++      sw->current_pdev = NULL;
++      sw->segment = segment;
++      sw->bus = bus;
++      sw->io_base = 0;
++      sw->io_limit = 0;
++      sw->func = NULL;
++      INIT_LIST_HEAD(&sw->slots);
++
++      return sw;
++}
++
++/* Put @sw on switch_list.  Caller must hold switch_list_lock. */
++static void pci_iomul_switch_add_locked(struct pci_iomul_switch *sw)
++{
++      BUG_ON(!mutex_is_locked(&switch_list_lock));
++
++      list_add(&sw->list, &switch_list);
++}
++
++#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
++/* Take @sw off switch_list.  Caller must hold switch_list_lock. */
++static void pci_iomul_switch_del_locked(struct pci_iomul_switch *sw)
++{
++      BUG_ON(!mutex_is_locked(&switch_list_lock));
++
++      list_del(&sw->list);
++}
++#endif
++
++/*
++ * Initialise @slot for @pdev after checking that the device qualifies for
++ * I/O port sharing: it must have a PCI Express capability and be an
++ * endpoint or legacy endpoint.
++ * Returns 0 on success, -ENOSYS for a device without the capability or a
++ * root complex integrated endpoint, -EINVAL for any other device type.
++ */
++static int __devinit pci_iomul_slot_init(struct pci_dev *pdev,
++                                       struct pci_iomul_slot *slot)
++{
++      u16 pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
++      u16 flags;
++
++      if (pcie_cap == 0) {
++              /* pci device isn't supported */
++              pr_info("PCI: sharing io port of non PCIe device %s "
++                      "isn't supported. ignoring.\n",
++                      pci_name(pdev));
++              return -ENOSYS;
++      }
++
++      pci_read_config_word(pdev, pcie_cap + PCI_CAP_FLAGS, &flags);
++      switch ((flags & PCI_EXP_FLAGS_TYPE) >> 4) {
++      case PCI_EXP_TYPE_ENDPOINT:
++      case PCI_EXP_TYPE_LEG_END:
++              /* The only device types that are accepted */
++              break;
++      case PCI_EXP_TYPE_RC_END:
++              pr_info("PCI: io port sharing of root complex integrated "
++                      "endpoint %s isn't supported. ignoring.\n",
++                      pci_name(pdev));
++              return -ENOSYS;
++      default:
++              pr_info("PCI: io port sharing of non endpoint %s "
++                      "doesn't make sense. ignoring.\n",
++                      pci_name(pdev));
++              return -EINVAL;
++      }
++
++      kref_init(&slot->kref);
++      slot->switch_busnr = pci_dev_switch_busnr(pdev);
++      slot->segment = pci_domain_nr(pdev->bus);
++      slot->bus = pdev->bus->number;
++      slot->dev = PCI_SLOT(pdev->devfn);
++
++      return 0;
++}
++
++/*
++ * Allocate a zeroed slot and initialise it for @pdev.
++ * Returns NULL if the allocation fails or pci_iomul_slot_init() rejects
++ * the device.
++ */
++static struct pci_iomul_slot *__devinit
++pci_iomul_slot_alloc(struct pci_dev *pdev)
++{
++      struct pci_iomul_slot *slot = kzalloc(sizeof(*slot), GFP_KERNEL);
++
++      if (slot != NULL && pci_iomul_slot_init(pdev, slot) != 0) {
++              kfree(slot);
++              slot = NULL;
++      }
++      return slot;
++}
++
++/* Put @slot on the slot list of @sw.  Caller must hold sw->lock. */
++static void pci_iomul_slot_add_locked(struct pci_iomul_switch *sw,
++                                    struct pci_iomul_slot *slot)
++{
++      BUG_ON(!mutex_is_locked(&sw->lock));
++
++      list_add(&slot->sibling, &sw->slots);
++}
++
++#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
++/* Take @slot off its slot list.  Caller must hold sw->lock. */
++static void pci_iomul_slot_del_locked(struct pci_iomul_switch *sw,
++                                     struct pci_iomul_slot *slot)
++{
++      BUG_ON(!mutex_is_locked(&sw->lock));
++
++      list_del(&slot->sibling);
++}
++#endif
++
++/*****************************************************************************/
++/*
++ * Parse "[<segment>:]<bus>:<dev>" (hexadecimal fields) from @str.
++ * A missing segment defaults to 0.  The outputs are written only when all
++ * values are in range.
++ * Returns 0 on success, -EINVAL on a malformed string or an out-of-range
++ * value.
++ */
++static int pci_get_sbd(const char *str,
++                     int *segment__, uint8_t *bus__, uint8_t *dev__)
++{
++      int seg, b, d;
++
++      if (sscanf(str, "%x:%x:%x", &seg, &b, &d) != 3) {
++              /* Retry without a segment field */
++              if (sscanf(str, "%x:%x", &b, &d) != 2)
++                      return -EINVAL;
++              seg = 0;
++      }
++
++      if (seg < 0 || seg >= INT_MAX ||
++          b < 0 || b > PCI_BUS_MAX ||
++          d < 0 || d > PCI_DEV_MAX)
++              return -EINVAL;
++
++      *segment__ = seg;
++      *bus__ = b;
++      *dev__ = d;
++      return 0;
++}
++
++static char iomul_param[COMMAND_LINE_SIZE];
++#define TOKEN_MAX     10      /* SSSS:BB:DD length is 10 */
++/*
++ * Is @pdev selected for IO multiplexing?
++ *
++ * Returns 1 when iomul_param is "all" or when one of its ','-separated
++ * "[<segment>:]<bus>:<dev>" tokens names the segment, bus and device of
++ * @pdev.  Empty tokens and tokens longer than TOKEN_MAX are skipped.
++ * Otherwise the result of pci_is_iomuldev() (the guestdev=...+iomul
++ * check) is returned.
++ */
++static int pci_is_iomul_dev_param(struct pci_dev *pdev)
++{
++      int len;
++      char *p;
++      char *next_str;
++
++      if (!strcmp(iomul_param, "all"))
++              return 1;
++      for (p = &iomul_param[0]; *p != '\0'; p = next_str + 1) {
++              next_str = strchr(p, ',');
++              if (next_str != NULL)
++                      len = next_str - p;
++              else
++                      len = strlen(p);
++
++              if (len > 0 && len <= TOKEN_MAX) {
++                      char tmp[TOKEN_MAX+1];
++                      int seg;
++                      uint8_t bus;
++                      uint8_t dev;
++
++                      /*
++                       * strlcpy() copies at most size - 1 characters, so
++                       * the size has to be len + 1 to keep the whole
++                       * token; that still fits because len <= TOKEN_MAX.
++                       */
++                      strlcpy(tmp, p, len + 1);
++                      if (pci_get_sbd(tmp, &seg, &bus, &dev) == 0 &&
++                          pci_domain_nr(pdev->bus) == seg &&
++                          pdev->bus->number == bus &&
++                          PCI_SLOT(pdev->devfn) == dev)
++                              return 1;
++              }
++              /* Last token: leave before the loop step uses next_str */
++              if (next_str == NULL)
++                      break;
++      }
++
++      /* check guestdev=<device>+iomul option */
++      return pci_is_iomuldev(pdev);
++}
++
++/*
++ * Handler for the "guestiomuldev=" boot parameter.
++ * Format: [<segment>:]<bus>:<dev>[,[<segment>:]<bus>:<dev>[,...]]
++ *
++ * The string is only saved into iomul_param here.  Returns 1 when the
++ * string was saved and 0 when it is too long to fit.
++ */
++static int __init pci_iomul_param_setup(char *str)
++{
++      if (strlen(str) < COMMAND_LINE_SIZE) {
++              /* parse it after pci bus scanning */
++              strlcpy(iomul_param, str, sizeof(iomul_param));
++              return 1;
++      }
++
++      return 0;
++}
++__setup("guestiomuldev=", pci_iomul_param_setup);
++
++/*****************************************************************************/
++/*
++ * Program the I/O base/limit registers of @bridge from the byte addresses
++ * @io_base and @io_limit.  The low 12 bits of both addresses are dropped.
++ */
++static void __devinit pci_iomul_set_bridge_io_window(struct pci_dev *bridge,
++                                                   uint32_t io_base,
++                                                   uint32_t io_limit)
++{
++      /* address bits 15:12 end up in bits 7:4, bits 31:16 in bits 23:8 */
++      const uint32_t base = (io_base >> 12) << 4;
++      const uint32_t limit = (io_limit >> 12) << 4;
++      const uint16_t base_limit = (base & 0xff) | ((limit & 0xff) << 8);
++      const uint32_t upper16 = ((base & 0xffff00) >> 8) |
++              (((limit & 0xffff00) >> 8) << 16);
++
++      /* Temporarily disable the I/O range before updating PCI_IO_BASE. */
++      pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, 0x0000ffff);
++      /* Update lower 16 bits of I/O base/limit. */
++      pci_write_config_word(bridge, PCI_IO_BASE, base_limit);
++      /* Update upper 16 bits of I/O base/limit. */
++      pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, upper16);
++}
++
++/*
++ * Close the I/O window of @bridge by programming a base that is above
++ * the limit.
++ */
++static void __devinit pci_disable_bridge_io_window(struct pci_dev *bridge)
++{
++      /* set base = 0xffffff limit = 0x0 */
++      pci_iomul_set_bridge_io_window(bridge, 0xffffff, 0);
++}
++
++/*
++ * Record the I/O BARs of function @func of @slot, as found on @pdev.
++ *
++ * A pci_iomul_func is allocated and filled with the location of the
++ * function, its I/O size and a copy of every non-empty I/O BAR resource.
++ * It is stored in slot->func[@func] only when at least one such BAR
++ * exists; otherwise it is freed again.
++ *
++ * Returns 0 on success (with or without I/O BARs), -ENOMEM on allocation
++ * failure.
++ */
++static int __devinit pci_iomul_func_scan(struct pci_dev *pdev,
++                                       struct pci_iomul_slot *slot,
++                                       uint8_t func)
++{
++      struct pci_iomul_func *f;
++      unsigned int bar;
++
++      f = kzalloc(sizeof(*f), GFP_KERNEL);
++      if (!f)
++              return -ENOMEM;
++
++      f->segment = slot->segment;
++      f->bus = slot->bus;
++      f->devfn = PCI_DEVFN(slot->dev, func);
++      f->io_size = pdev_size_io(pdev);
++
++      for (bar = 0; bar < PCI_NUM_BARS; bar++) {
++              if ((pci_resource_flags(pdev, bar) & IORESOURCE_IO) &&
++                  pci_resource_len(pdev, bar) != 0) {
++                      f->io_bar |= 1 << bar;
++                      f->resource[bar] = pdev->resource[bar];
++              }
++      }
++
++      if (!f->io_bar) {
++              /* no I/O BAR at all: nothing to multiplex */
++              kfree(f);
++              return 0;
++      }
++
++      slot->func[func] = f;
++      return 0;
++}
++
++/*
++ * This is the tricky part.
++ * The PCI resource assignment routines are fooled by setting flags to 0:
++ * they decide whether a resource should be allocated by checking flags,
++ * and 0 means the resource isn't used.
++ * See pbus_size_io() and pdev_sort_resources().
++ *
++ * After resources have been allocated, flags (IORESOURCE_IO) is exported
++ * to other parts of the system, including user processes.
++ * So flags has to be set back to IORESOURCE_IO, but at the same time
++ * those resources must be kept from being reassigned on PCI hot plug.
++ * To achieve that, r->parent is set to a dummy resource
++ * (see pci_iomul_reenable_resource()).
++ */
++static void __devinit pci_iomul_disable_resource(struct resource *r)
++{
++      /* don't allocate this resource */
++      r->flags = 0;
++}
++
++/*
++ * Parent @r under @dummy_parent.
++ *
++ * @dummy_parent is set up as a copy of @r's range and flags, and @r is
++ * then requested from it.  A failure of that request is treated as a
++ * fatal bug.
++ */
++static void __devinit pci_iomul_reenable_resource(
++      struct resource *dummy_parent, struct resource *r)
++{
++      int err;
++
++      dummy_parent->name = "PCI IOMUL dummy resource";
++      dummy_parent->flags = r->flags;
++      dummy_parent->start = r->start;
++      dummy_parent->end = r->end;
++
++      err = request_resource(dummy_parent, r);
++      BUG_ON(err);
++}
++
++/*
++ * Rebase and/or disable the I/O BARs of @pdev that are recorded in
++ * @func->io_bar.
++ *
++ * @reassign: when non-zero, every recorded I/O BAR is moved to start at 0
++ *            (size preserved), written back with pci_update_resource()
++ *            and copied into @func->resource[].  Afterwards the I/O
++ *            window of the parent PCI-PCI bridge is disabled and the I/O
++ *            resources of that bridge are rebased to 0 as well.
++ * @dealloc:  when non-zero, the flags of each recorded BAR are cleared so
++ *            that the resource is not allocated.
++ */
++static void __devinit pci_iomul_fixup_ioresource(struct pci_dev *pdev,
++                                               struct pci_iomul_func *func,
++                                               int reassign, int dealloc)
++{
++      uint8_t i;
++      struct resource *r;
++
++      pr_info("PCI: deallocating io resource[%s]. io size 0x%lx\n",
++              pci_name(pdev), func->io_size);
++      for (i = 0; i < PCI_NUM_BARS; i++) {
++              r = &pdev->resource[i];
++              if (!(func->io_bar & (1 << i)))
++                      continue;
++
++              if (reassign) {
++                      r->end -= r->start;
++                      r->start = 0;
++                      pci_update_resource(pdev, i);
++                      func->resource[i] = *r;
++              }
++
++              if (dealloc)
++                      /* don't allocate this resource */
++                      pci_iomul_disable_resource(r);
++      }
++
++      /* parent PCI-PCI bridge */
++      if (!reassign)
++              return;
++      pdev = pdev->bus->self;
++      /*
++       * A root bus has no upstream bridge device (bus->self is NULL), so
++       * there is no bridge window to touch in that case.
++       */
++      if (pdev == NULL || (pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
++              return;
++      pci_disable_bridge_io_window(pdev);
++      for (i = 0; i < PCI_NUM_RESOURCES; i++) {
++              r = &pdev->resource[i];
++              if (!(r->flags & IORESOURCE_IO))
++                      continue;
++
++              r->end -= r->start;
++              r->start = 0;
++              /* only resources below PCI_BRIDGE_RESOURCES are written back */
++              if (i < PCI_BRIDGE_RESOURCES)
++                      pci_update_resource(pdev, i);
++      }
++}
++
++/*
++ * Scan the I/O BARs of @pdev into @slot and take them away from the
++ * normal resource assignment.
++ *
++ * At boot time (sw->io_base == 0) the function with the largest I/O size
++ * seen so far is remembered in sw->func: its BARs are rebased but stay
++ * allocatable, while the previous holder (if any) gets its BARs disabled.
++ * In every other case the BARs of @pdev are rebased and disabled.
++ *
++ * Nothing is done when the scan fails or @pdev has no I/O BAR.
++ */
++static void __devinit __quirk_iomul_dealloc_ioresource(
++      struct pci_iomul_switch *sw,
++      struct pci_dev *pdev, struct pci_iomul_slot *slot)
++{
++      struct pci_iomul_func *f;
++      struct pci_iomul_func *__f;
++
++      if (pci_iomul_func_scan(pdev, slot, PCI_FUNC(pdev->devfn)) != 0)
++              return;
++
++      /* NULL here means the function has no I/O BAR */
++      f = slot->func[PCI_FUNC(pdev->devfn)];
++      if (f == NULL)
++              return;
++
++      /* current holder of the largest I/O size under this switch */
++      __f = sw->func;
++      /* sw->io_base == 0 means that we are called at boot time.
++       * != 0 means that we are called by php after boot. */
++      if (sw->io_base == 0 &&
++          (__f == NULL || __f->io_size < f->io_size)) {
++              if (__f != NULL) {
++                      struct pci_bus *__pbus;
++                      struct pci_dev *__pdev;
++
++                      /* demote the previous holder: disable its BARs only */
++                      __pbus = pci_find_bus(__f->segment, __f->bus);
++                      BUG_ON(__pbus == NULL);
++                      __pdev = pci_get_slot(__pbus, __f->devfn);
++                      BUG_ON(__pdev == NULL);
++                      pci_iomul_fixup_ioresource(__pdev, __f, 0, 1);
++                      pci_dev_put(__pdev);
++              }
++
++              /* new holder: rebase, but leave the BARs allocatable */
++              pci_iomul_fixup_ioresource(pdev, f, 1, 0);
++              sw->func = f;
++      } else {
++              /* rebase and disable */
++              pci_iomul_fixup_ioresource(pdev, f, 1, 1);
++      }
++}
++
++/*
++ * Header fixup: take the I/O resources of an iomul device out of the
++ * normal PCI resource assignment.
++ *
++ * Only normal-header, non host-bridge devices selected by
++ * pci_is_iomul_dev_param() are handled.  The switch and slot bookkeeping
++ * entries for @pdev are looked up, or allocated and registered when
++ * missing, then the device is disabled and its I/O BARs are handed to
++ * __quirk_iomul_dealloc_ioresource() with sw->lock held.
++ */
++static void __devinit quirk_iomul_dealloc_ioresource(struct pci_dev *pdev)
++{
++      struct pci_iomul_switch *sw;
++      struct pci_iomul_slot *slot;
++
++      if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
++              return;
++      if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
++              return; /* PCI Host Bridge isn't a target device */
++      if (!pci_is_iomul_dev_param(pdev))
++              return;
++
++      mutex_lock(&switch_list_lock);
++      sw = pci_iomul_find_switch_locked(pci_domain_nr(pdev->bus),
++                                        pci_dev_switch_busnr(pdev));
++      if (sw == NULL) {
++              sw = pci_iomul_switch_alloc(pci_domain_nr(pdev->bus),
++                                          pci_dev_switch_busnr(pdev));
++              if (sw == NULL) {
++                      mutex_unlock(&switch_list_lock);
++                      /* terminate the message so log lines do not run on */
++                      pr_warn("PCI: can't allocate memory "
++                              "for sw of IO multiplexing %s\n",
++                              pci_name(pdev));
++                      return;
++              }
++              pci_iomul_switch_add_locked(sw);
++      }
++      /* hold a reference for the duration of this function */
++      pci_iomul_switch_get(sw);
++      mutex_unlock(&switch_list_lock);
++
++      mutex_lock(&sw->lock);
++      slot = pci_iomul_find_slot_locked(sw, pdev->bus->number,
++                                        PCI_SLOT(pdev->devfn));
++      if (slot == NULL) {
++              slot = pci_iomul_slot_alloc(pdev);
++              if (slot == NULL) {
++                      mutex_unlock(&sw->lock);
++                      pci_iomul_switch_put(sw);
++                      pr_warn("PCI: can't allocate memory "
++                              "for IO multiplexing %s\n", pci_name(pdev));
++                      return;
++              }
++              pci_iomul_slot_add_locked(sw, slot);
++      }
++
++      pr_info("PCI: disable device and release io resource[%s].\n",
++              pci_name(pdev));
++      pci_disable_device(pdev);
++
++      __quirk_iomul_dealloc_ioresource(sw, pdev, slot);
++
++      mutex_unlock(&sw->lock);
++      pci_iomul_switch_put(sw);
++}
++DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID,
++                       quirk_iomul_dealloc_ioresource);
++
++/*
++ * Read the I/O window of the bridge above sw->func and store it in
++ * sw->io_base / sw->io_limit as byte addresses (the limit has its low
++ * 12 bits set).
++ *
++ * NOTE(review): sw->func is dereferenced unconditionally, and the bridge
++ * is taken from pdev->bus->self without a NULL check - confirm that
++ * callers never get here with sw->func == NULL or for a device that sits
++ * directly on a root bus.
++ */
++static void __devinit pci_iomul_read_bridge_io(struct pci_iomul_switch *sw)
++{
++      struct pci_iomul_func *f = sw->func;
++
++      struct pci_bus *pbus;
++      struct pci_dev *pdev;
++      struct pci_dev *bridge;
++
++      uint16_t l;
++      uint16_t base_upper16;
++      uint16_t limit_upper16;
++      uint32_t io_base;
++      uint32_t io_limit;
++
++      pbus = pci_find_bus(f->segment, f->bus);
++      BUG_ON(pbus == NULL);
++
++      pdev = pci_get_slot(pbus, f->devfn);
++      BUG_ON(pdev == NULL);
++
++      bridge = pdev->bus->self;
++      /* l: low byte is the I/O base register, high byte the I/O limit */
++      pci_read_config_word(bridge, PCI_IO_BASE, &l);
++      pci_read_config_word(bridge, PCI_IO_BASE_UPPER16, &base_upper16);
++      pci_read_config_word(bridge, PCI_IO_LIMIT_UPPER16, &limit_upper16);
++
++      /* register bits 7:4 become address bits 15:12 after the shift */
++      io_base = (l & 0xf0) | ((uint32_t)base_upper16 << 8);
++      io_base <<= 8;
++      /*
++       * The low nibble of the limit byte is not masked here; after the
++       * shift it lands in bits 11:8, which the OR with 0xfff sets anyway.
++       */
++      io_limit = (l >> 8) | ((uint32_t)limit_upper16 << 8);
++      io_limit <<= 8;
++      io_limit |= 0xfff;
++
++      sw->io_base = io_base;
++      sw->io_limit = io_limit;
++
++      pci_dev_put(pdev);
++      pr_info("PCI: bridge %s base 0x%x limit 0x%x\n",
++              pci_name(bridge), sw->io_base, sw->io_limit);
++}
++
++/*
++ * Open the I/O window [@io_base, @io_limit] on @bridge and make sure I/O
++ * decoding is enabled in its command register.  Host bridges are left
++ * alone.
++ */
++static void __devinit pci_iomul_setup_brige(struct pci_dev *bridge,
++                                          uint32_t io_base,
++                                          uint32_t io_limit)
++{
++      uint16_t command;
++
++      if ((bridge->class >> 8) == PCI_CLASS_BRIDGE_HOST)
++              return;
++
++      pci_iomul_set_bridge_io_window(bridge, io_base, io_limit);
++
++      /* and forcibly enables IO */
++      pci_read_config_word(bridge, PCI_COMMAND, &command);
++      if (command & PCI_COMMAND_IO)
++              return;
++
++      pr_info("PCI: forcibly enabling IO %s\n", pci_name(bridge));
++      pci_write_config_word(bridge, PCI_COMMAND, command | PCI_COMMAND_IO);
++}
++
++/* Sort record used by pci_iomul_setup_dev(): one entry per I/O BAR. */
++struct __bar {
++      unsigned long size;     /* length of the BAR (pci_iomul_len()) */
++      uint8_t bar;            /* BAR index */
++};
++
++/*
++ * sort() comparator for struct __bar: descending order of size.
++ *
++ * The sizes are compared explicitly instead of returning their
++ * difference: the difference of two unsigned longs does not fit in the
++ * int return value, so a truncated result could carry the wrong sign.
++ */
++static int __devinit pci_iomul_bar_cmp(const void *lhs__, const void *rhs__)
++{
++      const struct __bar *lhs = lhs__;
++      const struct __bar *rhs = rhs__;
++
++      if (lhs->size < rhs->size)
++              return 1;
++      if (lhs->size > rhs->size)
++              return -1;
++      return 0;
++}
++
++/*
++ * Place the I/O BARs of @pdev (those recorded in @f->io_bar) back to
++ * back starting at @io_base, largest BAR first.
++ *
++ * Each BAR is expected to have been rebased to start at 0 beforehand
++ * (checked with BUG_ON()).  The new range is mirrored into
++ * @f->resource[], the saved flags are restored, the BAR is written with
++ * pci_update_resource() and the resource is parented under
++ * @f->dummy_parent.
++ *
++ * NOTE(review): the same @f->dummy_parent is re-initialised for every
++ * BAR of the function - confirm this is intended when a function has
++ * more than one I/O BAR.
++ */
++static void __devinit pci_iomul_setup_dev(struct pci_dev *pdev,
++                                        struct pci_iomul_func *f,
++                                        uint32_t io_base)
++{
++      struct __bar bars[PCI_NUM_BARS];
++      int i;
++      uint8_t num_bars = 0;
++      struct resource *r;
++
++      pr_info("PCI: Forcibly assign IO %s from 0x%x\n",
++              pci_name(pdev), io_base);
++
++      /* collect the I/O BARs together with their sizes */
++      for (i = 0; i < PCI_NUM_BARS; i++) {
++              if (!(f->io_bar & (1 << i)))
++                      continue;
++
++              r = &f->resource[i];
++              bars[num_bars].size = pci_iomul_len(r);
++              bars[num_bars].bar = i;
++
++              num_bars++;
++      }
++
++      /* largest first (pci_iomul_bar_cmp sorts in descending order) */
++      sort(bars, num_bars, sizeof(bars[0]), &pci_iomul_bar_cmp, NULL);
++
++      for (i = 0; i < num_bars; i++) {
++              struct resource *fr = &f->resource[bars[i].bar];
++              r = &pdev->resource[bars[i].bar];
++
++              BUG_ON(r->start != 0);
++              r->start += io_base;
++              r->end += io_base;
++
++              fr->start = r->start;
++              fr->end = r->end;
++
++              /* pci_update_resource() check flags. */
++              r->flags = fr->flags;
++              pci_update_resource(pdev, bars[i].bar);
++              pci_iomul_reenable_resource(&f->dummy_parent, r);
++
++              /* the next BAR follows immediately after this one */
++              io_base += bars[i].size;
++      }
++}
++
++/*
++ * Detach the I/O resources of @pdev and of its parent bridge from the
++ * resource tree and register the switch-wide range in their place.
++ *
++ * Every I/O BAR of @pdev that currently has a parent is copied into
++ * @f->resource[], released and parented under @f->dummy_parent.
++ * If the parent bridge is not a host bridge, each of its I/O bridge
++ * window resources that has a parent is released and parented under
++ * @slot->dummy_parent[], and @sw->io_resource, covering
++ * [sw->io_base, sw->io_limit], is requested from the former parent of
++ * the window.
++ *
++ * NOTE(review): pdev->bus->self is dereferenced without a NULL check -
++ * confirm this is never reached for a device directly on a root bus.
++ */
++static void __devinit pci_iomul_release_io_resource(
++      struct pci_dev *pdev, struct pci_iomul_switch *sw,
++      struct pci_iomul_slot *slot, struct pci_iomul_func *f)
++{
++      int i;
++      struct resource *r;
++
++      for (i = 0; i < PCI_NUM_BARS; i++) {
++              if (pci_resource_flags(pdev, i) & IORESOURCE_IO &&
++                  pdev->resource[i].parent != NULL) {
++                      r = &pdev->resource[i];
++                      f->resource[i] = *r;
++                      release_resource(r);
++                      pci_iomul_reenable_resource(&f->dummy_parent, r);
++              }
++      }
++
++      /* parent PCI-PCI bridge */
++      pdev = pdev->bus->self;
++      if ((pdev->class >> 8) != PCI_CLASS_BRIDGE_HOST) {
++              for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) {
++                      /* remember the parent before the window is released */
++                      struct resource *parent = pdev->resource[i].parent;
++
++                      if (pci_resource_flags(pdev, i) & IORESOURCE_IO &&
++                          parent != NULL) {
++                              r = &pdev->resource[i];
++
++                              sw->io_resource.flags = r->flags;
++                              sw->io_resource.start = sw->io_base;
++                              sw->io_resource.end = sw->io_limit;
++                              sw->io_resource.name = "PCI IO Multiplexer";
++
++                              release_resource(r);
++                              pci_iomul_reenable_resource(
++                                      &slot->dummy_parent[i - PCI_BRIDGE_RESOURCES], r);
++
++                              /* a failure here is only logged */
++                              if (request_resource(parent,
++                                                   &sw->io_resource))
++                                      pr_err("PCI IOMul: can't allocate "
++                                             "resource. [0x%x, 0x%x]",
++                                             sw->io_base, sw->io_limit);
++                      }
++              }
++      }
++}
++
++/*
++ * Final fixup (also invoked from the bus notifier on device add):
++ * give @pdev its share of the multiplexed I/O range.
++ *
++ * The switch I/O window is read from the bridge on first use
++ * (sw->io_base == 0).  For the function recorded as sw->func, the I/O
++ * resources are detached via pci_iomul_release_io_resource(); every
++ * other function gets its BARs forcibly assigned starting at
++ * sw->io_base.  For function 0 of a slot other than that of sw->func,
++ * the parent bridge window is programmed first.
++ *
++ * Nothing is done when @pdev has no switch/slot entry, when the switch
++ * has no I/O range allocated, or when the function has no I/O BAR.
++ */
++static void __devinit quirk_iomul_reassign_ioresource(struct pci_dev *pdev)
++{
++      struct pci_iomul_switch *sw;
++      struct pci_iomul_slot *slot;
++      struct pci_iomul_func *sf;
++      struct pci_iomul_func *f;
++
++      /*
++       * presumably returns with sw->lock held and a reference on sw when
++       * both are non-NULL (they are dropped at "out") - TODO confirm
++       */
++      pci_iomul_get_lock_switch(pdev, &sw, &slot);
++      if (sw == NULL || slot == NULL)
++              return;
++
++      if (sw->io_base == 0)
++              pci_iomul_read_bridge_io(sw);
++      if (!pci_iomul_switch_io_allocated(sw))
++              goto out;
++
++      sf = sw->func;
++      f = slot->func[PCI_FUNC(pdev->devfn)];
++      if (f == NULL)
++              /* (sf == NULL || f == NULL) case
++               * can happen when all the specified devices
++               * don't have io space
++               */
++              goto out;
++
++      /* function 0 of a slot other than the one holding sw->func */
++      if (sf != NULL &&
++          (pci_domain_nr(pdev->bus) != sf->segment ||
++           pdev->bus->number != sf->bus ||
++           PCI_SLOT(pdev->devfn) != PCI_SLOT(sf->devfn)) &&
++          PCI_FUNC(pdev->devfn) == 0) {
++              pci_iomul_setup_brige(pdev->bus->self,
++                                    sw->io_base, sw->io_limit);
++      }
++
++      /* the function must fit into the switch window */
++      BUG_ON(f->io_size > sw->io_limit - sw->io_base + 1);
++      if (/* f == sf */ sf != NULL &&
++          pci_domain_nr(pdev->bus) == sf->segment &&
++          pdev->bus->number == sf->bus &&
++          pdev->devfn == sf->devfn)
++              pci_iomul_release_io_resource(pdev, sw, slot, f);
++      else
++              pci_iomul_setup_dev(pdev, f, sw->io_base);
++
++out:
++      mutex_unlock(&sw->lock);
++      pci_iomul_switch_put(sw);
++}
++
++DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID,
++                      quirk_iomul_reassign_ioresource);
++
++/*****************************************************************************/
++#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
++/*
++ * Forget the bookkeeping of a removed iomul function.
++ *
++ * The pci_iomul_func of @pdev is freed (and sw->func cleared when it
++ * pointed at it).  When that was the last function of the slot, the slot
++ * is unlinked from the switch and its reference dropped.
++ * Always returns 0.
++ */
++static int __devinit __pci_iomul_notifier_del_device(struct pci_dev *pdev)
++{
++      const unsigned int fn = PCI_FUNC(pdev->devfn);
++      struct pci_iomul_switch *sw;
++      struct pci_iomul_slot *slot;
++      struct pci_iomul_func *gone;
++      int i;
++
++      pci_iomul_get_lock_switch(pdev, &sw, &slot);
++      if (!sw || !slot)
++              return 0;
++
++      gone = slot->func[fn];
++      if (sw->func == gone)
++              sw->func = NULL;
++      kfree(gone);
++      slot->func[fn] = NULL;
++
++      /* look for any function that is still registered on this slot */
++      for (i = 0; i < PCI_NUM_FUNC; i++) {
++              if (slot->func[i])
++                      break;
++      }
++
++      if (i == PCI_NUM_FUNC) {
++              pci_iomul_slot_del_locked(sw, slot);
++              pci_iomul_slot_put(slot);
++      }
++
++      mutex_unlock(&sw->lock);
++      pci_iomul_switch_put(sw);
++      return 0;
++}
++
++/*
++ * Tear down the iomul switch entry that belongs to a removed bridge.
++ *
++ * The switch is looked up by the segment and bus number of @pdev.  When
++ * there is none (the bridge was never an iomul switch) nothing is done.
++ * Otherwise the switch is unlinked from the switch list, its I/O
++ * resource is released if it was registered, io_base/io_limit are reset
++ * and one reference on the switch is dropped.
++ * Always returns 0.
++ */
++static int __devinit __pci_iomul_notifier_del_switch(struct pci_dev *pdev)
++{
++      struct pci_iomul_switch *sw;
++
++      mutex_lock(&switch_list_lock);
++      sw = pci_iomul_find_switch_locked(pci_domain_nr(pdev->bus),
++                                        pdev->bus->number);
++      if (sw == NULL) {
++              /*
++               * Not an iomul switch.  Do not fall through to
++               * pci_iomul_switch_put(): it would dereference NULL.
++               */
++              mutex_unlock(&switch_list_lock);
++              return 0;
++      }
++
++      pci_iomul_switch_del_locked(sw);
++
++      mutex_lock(&sw->lock);
++      if (sw->io_resource.parent)
++              release_resource(&sw->io_resource);
++      sw->io_base = 0;        /* to tell this switch is removed */
++      sw->io_limit = 0;
++      BUG_ON(!list_empty(&sw->slots));
++      mutex_unlock(&sw->lock);
++
++      mutex_unlock(&switch_list_lock);
++      /* presumably the reference held for the switch list - TODO confirm */
++      pci_iomul_switch_put(sw);
++      return 0;
++}
++
++/*
++ * Dispatch the removal of @pdev by header type: normal devices and
++ * bridges have their own teardown; any other header type is reported
++ * and rejected with -EIO.
++ */
++static int __devinit pci_iomul_notifier_del_device(struct pci_dev *pdev)
++{
++      switch (pdev->hdr_type) {
++      case PCI_HEADER_TYPE_NORMAL:
++              return __pci_iomul_notifier_del_device(pdev);
++      case PCI_HEADER_TYPE_BRIDGE:
++              return __pci_iomul_notifier_del_switch(pdev);
++      default:
++              break;
++      }
++
++      pr_warn("PCI IOMUL: device %s has unknown "
++              "header type %02x, ignoring.\n",
++              pci_name(pdev), pdev->hdr_type);
++      return -EIO;
++}
++
++/*
++ * PCI bus notifier: reassign I/O resources when a device is added and
++ * drop the iomul bookkeeping when one is removed.
++ *
++ * Always returns NOTIFY_DONE.  A raw negative errno must not be returned
++ * from a notifier callback: such a value has the NOTIFY_STOP_MASK bit
++ * set and would stop the remaining callbacks on the chain from running.
++ */
++static int __devinit pci_iomul_notifier(struct notifier_block *nb,
++                                      unsigned long action, void *data)
++{
++      struct device *dev = data;
++      struct pci_dev *pdev = to_pci_dev(dev);
++
++      switch (action) {
++      case BUS_NOTIFY_ADD_DEVICE:
++              quirk_iomul_reassign_ioresource(pdev);
++              break;
++      case BUS_NOTIFY_DEL_DEVICE:
++              /* the callee logs the only error it can return */
++              pci_iomul_notifier_del_device(pdev);
++              break;
++      default:
++              /* nothing */
++              break;
++      }
++
++      return NOTIFY_DONE;
++}
++
++/* Notifier block registered on pci_bus_type by pci_iomul_hotplug_init(). */
++static struct notifier_block __devinitdata pci_iomul_nb = {
++      .notifier_call = pci_iomul_notifier,
++};
++
++/*
++ * Register the iomul notifier on the PCI bus.
++ * Returns the result of bus_register_notifier() so that a registration
++ * failure is reported to the initcall machinery instead of being lost.
++ */
++static int __init pci_iomul_hotplug_init(void)
++{
++      return bus_register_notifier(&pci_bus_type, &pci_iomul_nb);
++}
++late_initcall(pci_iomul_hotplug_init);
++#endif
diff --cc drivers/pci/iomulti.h

index 0000000,0000000..511ef5f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/pci/iomulti.h
@@@ -1,0 -1,0 +1,122 @@@
++/*
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
++ *
++ * Copyright (c) 2009 Isaku Yamahata
++ *                    VA Linux Systems Japan K.K.
++ */
++
++#include <linux/kernel.h>
++#include <linux/list.h>
++#include <linux/pci.h>
++
++/* number of per-function BAR slots tracked (size of pci_iomul_func.resource[]) */
++#define PCI_NUM_BARS          6
++/* number of functions tracked per slot (size of pci_iomul_slot.func[]) */
++#define PCI_NUM_FUNC          8
++
++/* Per-function bookkeeping of an I/O multiplexed PCI function. */
++struct pci_iomul_func {
++      /* location of the function */
++      int             segment;
++      uint8_t         bus;
++      uint8_t         devfn;
++
++      /* only start and end are used */
++      /* NOTE(review): the remark above presumably refers to resource[] */
++      unsigned long   io_size;
++      /* bitmask of the BAR indices that are non-empty I/O BARs */
++      uint8_t         io_bar;
++      /* saved copies of the I/O BAR resources, indexed by BAR */
++      struct resource resource[PCI_NUM_BARS];
++      /* dummy parent resource for the BARs of this function */
++      struct resource dummy_parent;
++};
++
++/* Bookkeeping of one switch whose I/O range is shared by iomul functions. */
++struct pci_iomul_switch {
++      struct list_head        list;   /* switch_list_lock protects */
++
++      /*
++       * This lock protects the following entries and the
++       * pci_iomul_slot/pci_iomul_func hanging off this switch.
++       */
++      struct mutex            lock;
++      struct kref             kref;
++
++      /* resource covering [io_base, io_limit] */
++      struct resource         io_resource;
++      struct resource         *io_region;
++      unsigned int            count;
++      struct pci_dev          *current_pdev;
++
++      /* location of the switch */
++      int                     segment;
++      uint8_t                 bus;
++
++      /* shared I/O window; io_base == 0 means it is not set up */
++      uint32_t                io_base;
++      uint32_t                io_limit;
++
++      /* func which has the largest io size */
++      struct pci_iomul_func   *func;
++
++      /* list of pci_iomul_slot, linked through pci_iomul_slot.sibling */
++      struct list_head        slots;
++};
++
++/* Take a reference on @sw. */
++static inline void pci_iomul_switch_get(struct pci_iomul_switch *sw)
++{
++      struct kref *ref = &sw->kref;
++
++      kref_get(ref);
++}
++
++/* kref release callback: free the switch that embeds @kref. */
++static inline void pci_iomul_switch_release(struct kref *kref)
++{
++      kfree(container_of(kref, struct pci_iomul_switch, kref));
++}
++
++/* Drop a reference on @sw; the switch is freed with the last reference. */
++static inline void pci_iomul_switch_put(struct pci_iomul_switch *sw)
++{
++      struct kref *ref = &sw->kref;
++
++      kref_put(ref, &pci_iomul_switch_release);
++}
++
++/* Bookkeeping of one PCI slot (device) below an iomul switch. */
++struct pci_iomul_slot {
++      /* link in pci_iomul_switch.slots */
++      struct list_head        sibling;
++      struct kref             kref;
++      /*
++       * busnr
++       * when pcie, the primary busnr of the PCI-PCI bridge on which
++       * this device sits.
++       */
++      uint8_t                 switch_busnr;
++      /* dummy parents for the bridge window resources of the parent bridge */
++      struct resource         dummy_parent[PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES];
++
++      /* device */
++      int                     segment;
++      uint8_t                 bus;
++      uint8_t                 dev;
++
++      /* per-function state, indexed by function number; NULL when unused */
++      struct pci_iomul_func   *func[PCI_NUM_FUNC];
++};
++
++/* Take a reference on @slot. */
++static inline void pci_iomul_slot_get(struct pci_iomul_slot *slot)
++{
++      struct kref *ref = &slot->kref;
++
++      kref_get(ref);
++}
++
++/* kref release callback: free the slot that embeds @kref. */
++static inline void pci_iomul_slot_release(struct kref *kref)
++{
++      kfree(container_of(kref, struct pci_iomul_slot, kref));
++}
++
++/* Drop a reference on @slot; the slot is freed with the last reference. */
++static inline void pci_iomul_slot_put(struct pci_iomul_slot *slot)
++{
++      struct kref *ref = &slot->kref;
++
++      kref_put(ref, &pci_iomul_slot_release);
++}
++
++/*
++ * Used by callers as a boolean: non-zero when the switch has an I/O
++ * range they can use.
++ */
++int pci_iomul_switch_io_allocated(const struct pci_iomul_switch *);
++/*
++ * Look up the switch and slot of a device.  Callers treat a NULL switch
++ * or slot as "not an iomul device"; otherwise they unlock sw->lock and
++ * put the switch when done, so this presumably returns with the lock
++ * held and a reference taken - TODO confirm against the definition.
++ */
++void pci_iomul_get_lock_switch(struct pci_dev *, struct pci_iomul_switch **,
++                             struct pci_iomul_slot **);
diff --cc drivers/pci/msi-xen.c

index 0000000,0000000..c07aa8c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/pci/msi-xen.c
@@@ -1,0 -1,0 +1,794 @@@
++/*
++ * File:      msi.c
++ * Purpose:   PCI Message Signaled Interrupt (MSI)
++ *
++ * Copyright (C) 2003-2004 Intel
++ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
++ */
++
++#include <linux/err.h>
++#include <linux/mm.h>
++#include <linux/irq.h>
++#include <linux/interrupt.h>
++#include <linux/init.h>
++#include <linux/ioport.h>
++#include <linux/pci.h>
++#include <linux/proc_fs.h>
++#include <linux/msi.h>
++#include <linux/smp.h>
++#include <linux/errno.h>
++#include <linux/io.h>
++#include <linux/slab.h>
++
++#include <xen/evtchn.h>
++
++#include "pci.h"
++#include "msi.h"
++
++static int pci_msi_enable = 1;
++
++/* list of struct msi_dev_list, one entry per device; see msi_dev_lock */
++static LIST_HEAD(msi_dev_head);
++/* protects msi_dev_head */
++DEFINE_SPINLOCK(msi_dev_lock);
++
++/* Per-device MSI bookkeeping, linked on msi_dev_head. */
++struct msi_dev_list {
++      struct pci_dev *dev;
++      /* link in msi_dev_head */
++      struct list_head list;
++      /* protects pirq_list_head */
++      spinlock_t pirq_list_lock;
++      /* list of struct msi_pirq_entry */
++      struct list_head pirq_list_head;
++      /* Store default pre-assigned irq */
++      unsigned int default_irq;
++};
++
++/* One pirq/entry_nr pair, linked on msi_dev_list.pirq_list_head. */
++struct msi_pirq_entry {
++      struct list_head list;
++      int pirq;
++      /* key used by detach_pirq_entry() to find this entry */
++      int entry_nr;
++};
++
++/* Arch hooks */
++
++#ifndef arch_msi_check_device
++/*
++ * Default arch hook, compiled only when the architecture does not define
++ * arch_msi_check_device itself: always returns 0.
++ */
++int arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
++{
++      return 0;
++}
++#endif
++
++/*
++ * Set (@enable != 0) or clear the MSI enable bit in the MSI capability
++ * located at config offset @pos of @dev.  @pos must be non-zero.
++ */
++static void msi_set_enable(struct pci_dev *dev, int pos, int enable)
++{
++      u16 msgctl;
++
++      BUG_ON(!pos);
++
++      pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
++      if (enable)
++              msgctl |= PCI_MSI_FLAGS_ENABLE;
++      else
++              msgctl &= ~PCI_MSI_FLAGS_ENABLE;
++      pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl);
++}
++
++/*
++ * Set (@enable != 0) or clear the MSI-X enable bit of @dev.
++ * Nothing is done when the device has no MSI-X capability.
++ */
++static void msix_set_enable(struct pci_dev *dev, int enable)
++{
++      u16 msgctl;
++      int pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
++
++      if (!pos)
++              return;
++
++      pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &msgctl);
++      if (enable)
++              msgctl |= PCI_MSIX_FLAGS_ENABLE;
++      else
++              msgctl &= ~PCI_MSIX_FLAGS_ENABLE;
++      pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, msgctl);
++}
++
++/*
++ * Return the msi_dev_list entry of @dev, creating and registering a new
++ * one on msi_dev_head when none exists yet.  The lookup and the
++ * insertion happen under msi_dev_lock.
++ * Returns NULL when a new entry cannot be allocated.
++ */
++static struct msi_dev_list *get_msi_dev_pirq_list(struct pci_dev *dev)
++{
++      struct msi_dev_list *cur, *found = NULL;
++      unsigned long flags;
++
++      spin_lock_irqsave(&msi_dev_lock, flags);
++
++      list_for_each_entry(cur, &msi_dev_head, list) {
++              if (cur->dev == dev)
++                      found = cur;
++      }
++
++      if (!found) {
++              /* Has not allocate msi_dev until now. */
++              found = kzalloc(sizeof(*found), GFP_ATOMIC);
++              if (found) {
++                      found->dev = dev;
++                      spin_lock_init(&found->pirq_list_lock);
++                      INIT_LIST_HEAD(&found->pirq_list_head);
++                      list_add_tail(&found->list, &msi_dev_head);
++              }
++      }
++
++      spin_unlock_irqrestore(&msi_dev_lock, flags);
++      return found;
++}
++
++/*
++ * Record the (@pirq, @entry_nr) pair at the tail of the pirq list of
++ * @msi_dev_entry.  Returns 0 on success, -ENOMEM on allocation failure.
++ */
++static int attach_pirq_entry(int pirq, int entry_nr,
++                             struct msi_dev_list *msi_dev_entry)
++{
++      struct msi_pirq_entry *new_entry;
++      unsigned long flags;
++
++      new_entry = kmalloc(sizeof(*new_entry), GFP_ATOMIC);
++      if (new_entry == NULL)
++              return -ENOMEM;
++
++      new_entry->pirq = pirq;
++      new_entry->entry_nr = entry_nr;
++
++      spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
++      list_add_tail(&new_entry->list, &msi_dev_entry->pirq_list_head);
++      spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
++      return 0;
++}
++
++/*
++ * Remove and free the first pirq entry of @msi_dev_entry whose entry_nr
++ * equals @entry_nr.  Nothing happens when no such entry exists.
++ *
++ * The whole list walk is done under pirq_list_lock, so the list cannot
++ * be modified by a concurrent attach/detach while it is being traversed.
++ * The entry is freed after the lock has been dropped.
++ */
++static void detach_pirq_entry(int entry_nr,
++                            struct msi_dev_list *msi_dev_entry)
++{
++      unsigned long flags;
++      struct msi_pirq_entry *pirq_entry;
++      struct msi_pirq_entry *found = NULL;
++
++      spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
++      list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
++              if (pirq_entry->entry_nr == entry_nr) {
++                      list_del(&pirq_entry->list);
++                      found = pirq_entry;
++                      break;
++              }
++      }
++      spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
++
++      /* kfree(NULL) is a no-op */
++      kfree(found);
++}
++
++/*
++ * pciback will provide device's owner
++ */
++static int (*get_owner)(struct pci_dev *dev);
++
++/*
++ * Install @func as the get_owner callback.
++ * Returns 0 on success, -EEXIST when a callback is already installed.
++ */
++int register_msi_get_owner(int (*func)(struct pci_dev *dev))
++{
++      if (get_owner == NULL) {
++              get_owner = func;
++              return 0;
++      }
++
++      printk(KERN_WARNING "register msi_get_owner again\n");
++      return -EEXIST;
++}
++EXPORT_SYMBOL(register_msi_get_owner);
++
++/*
++ * Remove the get_owner callback.
++ * Returns 0 on success, -EINVAL when @func is not the installed callback.
++ */
++int unregister_msi_get_owner(int (*func)(struct pci_dev *dev))
++{
++      if (get_owner == func) {
++              get_owner = NULL;
++              return 0;
++      }
++
++      return -EINVAL;
++}
++EXPORT_SYMBOL(unregister_msi_get_owner);
++
++/*
++ * Return the owner of @dev as reported by the registered get_owner
++ * callback, or DOMID_SELF when no callback is registered or it returns
++ * a negative value.  Must only be called in the initial Xen domain.
++ */
++static int msi_get_dev_owner(struct pci_dev *dev)
++{
++      int owner;
++
++      BUG_ON(!is_initial_xendomain());
++      if (!get_owner)
++              return DOMID_SELF;
++
++      owner = get_owner(dev);
++      if (owner < 0)
++              return DOMID_SELF;
++
++      dev_info(&dev->dev, "get owner: %x \n", owner);
++      return owner;
++}
++
++/*
++ * Ask the hypervisor to unmap @pirq of @dev.
++ *
++ * Returns 0 on success or the negative value returned by the hypercall.
++ * For a device owned by this domain the irq-to-pirq mapping is cleared
++ * afterwards.
++ */
++static int msi_unmap_pirq(struct pci_dev *dev, int pirq)
++{
++      struct physdev_unmap_pirq unmap;
++      int rc;
++
++      unmap.domid = msi_get_dev_owner(dev);
++      /* See comments in msi_map_vector, input parameter pirq means
++       * irq number only if the device belongs to dom0 itself.
++       */
++      if (unmap.domid == DOMID_SELF)
++              unmap.pirq = evtchn_get_xen_pirq(pirq);
++      else
++              unmap.pirq = pirq;
++
++      rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap);
++      if (rc)
++              dev_warn(&dev->dev, "unmap irq %d failed\n", pirq);
++
++      if (rc < 0)
++              return rc;
++
++      if (unmap.domid == DOMID_SELF)
++              evtchn_map_pirq(pirq, 0);
++
++      return 0;
++}
++
++/*
++ * Return the start address of the BAR selected by the MSI-X table
++ * offset register of the capability at @pos, or 0 when that resource is
++ * flagged disabled, unset or busy.
++ */
++static u64 find_table_base(struct pci_dev *dev, int pos)
++{
++      u32 reg;
++      u8 bir;
++      unsigned long res_flags;
++
++      pci_read_config_dword(dev, msix_table_offset_reg(pos), &reg);
++      bir = reg & PCI_MSIX_FLAGS_BIRMASK;
++
++      res_flags = pci_resource_flags(dev, bir);
++      if (!(res_flags &
++            (IORESOURCE_DISABLED | IORESOURCE_UNSET | IORESOURCE_BUSY)))
++              return pci_resource_start(dev, bir);
++
++      return 0;
++}
++
++/*
++ * Protected by msi_lock
++ *
++ * Ask the hypervisor to map an MSI/MSI-X pirq for entry @entry_nr of
++ * @dev.  Returns a negative value on failure.  On success the return
++ * value is the result of evtchn_map_pirq() when the device belongs to
++ * this domain, and the Xen pirq otherwise.
++ */
++static int msi_map_vector(struct pci_dev *dev, int entry_nr, u64 table_base)
++{
++      struct physdev_map_pirq map_irq;
++      domid_t domid = msi_get_dev_owner(dev);
++      int rc;
++
++      map_irq.domid = domid;
++      map_irq.type = MAP_PIRQ_TYPE_MSI;
++      map_irq.index = -1;
++      map_irq.pirq = -1;
++      map_irq.bus = dev->bus->number;
++      map_irq.devfn = dev->devfn;
++      map_irq.entry_nr = entry_nr;
++      map_irq.table_base = table_base;
++
++      rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
++      if (rc)
++              dev_warn(&dev->dev, "map irq failed\n");
++
++      if (rc < 0)
++              return rc;
++      /* This happens when MSI support is not enabled in older Xen. */
++      if (rc == 0 && map_irq.pirq < 0)
++              return -ENOSYS;
++
++      BUG_ON(map_irq.pirq <= 0);
++
++      /* If mapping of this particular MSI is on behalf of another domain,
++       * we do not need to get an irq in dom0. This also implies:
++       * dev->irq in dom0 will be 'Xen pirq' if this device belongs to
++       * to another domain, and will be 'Linux irq' if it belongs to dom0.
++       */
++      if (domid != DOMID_SELF) {
++              dev_printk(KERN_DEBUG, &dev->dev,
++                         "irq %d for dom%d MSI/MSI-X\n",
++                         map_irq.pirq, domid);
++              return map_irq.pirq;
++      }
++
++      rc = evtchn_map_pirq(-1, map_irq.pirq);
++      dev_printk(KERN_DEBUG, &dev->dev,
++                 "irq %d (%d) for MSI/MSI-X\n",
++                 rc, map_irq.pirq);
++      return rc;
++}
++
++/*
++ * Switch legacy INTx on or off for @dev, except on devices flagged with
++ * PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG, which are left untouched.
++ */
++static void pci_intx_for_msi(struct pci_dev *dev, int enable)
++{
++      if (dev->dev_flags & PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG)
++              return;
++
++      pci_intx(dev, enable);
++}
++
++/*
++ * Restore the MSI/MSI-X state of @dev through the hypervisor.
++ *
++ * Does nothing unless MSI or MSI-X is marked enabled on the device.
++ * Otherwise INTx is turned off, the MSI/MSI-X enable bits are cleared and
++ * PHYSDEVOP_restore_msi is issued for the device's bus/devfn.  Any failure
++ * other than -ENOSYS triggers a WARN.
++ */
++void pci_restore_msi_state(struct pci_dev *dev)
++{
++      struct physdev_restore_msi restore;
++      int rc;
++
++      if (!(dev->msi_enabled || dev->msix_enabled))
++              return;
++
++      pci_intx_for_msi(dev, 0);
++
++      if (dev->msi_enabled) {
++              int msi_pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
++
++              msi_set_enable(dev, msi_pos, 0);
++      }
++
++      if (dev->msix_enabled)
++              msix_set_enable(dev, 0);
++
++      restore.bus = dev->bus->number;
++      restore.devfn = dev->devfn;
++
++      rc = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore);
++      WARN(rc && rc != -ENOSYS, "restore_msi -> %d\n", rc);
++}
++EXPORT_SYMBOL_GPL(pci_restore_msi_state);
++
++/**
++ * msi_capability_init - configure device's MSI capability structure
++ * @dev: pointer to the pci_dev data structure of MSI device function
++ * @nvec: number of interrupts to allocate
++ *
++ * Maps a single MSI vector for the device (a request for more than one
++ * only triggers a WARN_ON), enables MSI and stores the resulting irq in
++ * @dev->irq.  Returns zero on success, or -EBUSY if the vector could not
++ * be mapped.
++ */
++static int msi_capability_init(struct pci_dev *dev, int nvec)
++{
++      const int cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
++      u16 msgctl;
++      int irq;
++
++      /* Disable MSI during set up */
++      msi_set_enable(dev, cap, 0);
++
++      pci_read_config_word(dev, msi_control_reg(cap), &msgctl);
++
++      WARN_ON(nvec > 1); /* XXX */
++      irq = msi_map_vector(dev, 0, 0);
++      if (irq < 0)
++              return -EBUSY;
++
++      /* Set MSI enabled bits  */
++      pci_intx_for_msi(dev, 0);
++      msi_set_enable(dev, cap, 1);
++      dev->msi_enabled = 1;
++      dev->irq = irq;
++
++      return 0;
++}
++
++/**
++ * msix_capability_init - configure device's MSI-X capability
++ * @dev: pointer to the pci_dev data structure of MSI-X device function
++ * @entries: pointer to an array of struct msix_entry entries
++ * @nvec: number of @entries
++ *
++ * Setup the MSI-X capability structure of device function with a
++ * single MSI-X irq. A return of zero indicates the successful setup of
++ * requested MSI-X entries with allocated irqs or non-zero for otherwise.
++ *
++ * Return values as implemented:
++ *   0        all @nvec entries were mapped; each entries[i].vector is set.
++ *   -ENOMEM  no per-device pirq list could be obtained.
++ *   -ENODEV  the BAR holding the MSI-X table is unusable.
++ *   > 0      mapping failed at entry index i; the value is i - 1.
++ *   -EBUSY   mapping failed and i - 1 is not positive.
++ *
++ * On a mapping failure every entry handled so far is unmapped and detached
++ * again, including entries that were found already mapped in the list.
++ *
++ * NOTE(review): on that failure path the control word written before the
++ * loop (ENABLE | MASKALL) is not rewritten, so the function appears to
++ * return with MSI-X still enabled and fully masked - confirm this is
++ * intended.
++ * NOTE(review): i entries succeeded before the failure yet i - 1 is
++ * reported; verify whether the off-by-one is deliberate.
++ **/
++static int msix_capability_init(struct pci_dev *dev,
++                              struct msix_entry *entries, int nvec)
++{
++      u64 table_base;
++      int pirq, i, j, mapped, pos;
++      u16 control;
++      struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
++      struct msi_pirq_entry *pirq_entry;
++
++      if (!msi_dev_entry)
++              return -ENOMEM;
++
++      msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
++
++      pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
++      pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control);
++
++      /* Ensure MSI-X is disabled while it is set up */
++      control &= ~PCI_MSIX_FLAGS_ENABLE;
++      pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
++
++      table_base = find_table_base(dev, pos);
++      if (!table_base)
++              return -ENODEV;
++
++      /*
++       * Some devices require MSI-X to be enabled before we can touch the
++       * MSI-X registers.  We need to mask all the vectors to prevent
++       * interrupts coming in before they're fully set up.
++       */
++      control |= PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE;
++      pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
++
++      for (i = 0; i < nvec; i++) {
++              mapped = 0;
++              list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
++                      if (pirq_entry->entry_nr == entries[i].entry) {
++                              dev_warn(&dev->dev,
++                                       "msix entry %d was not freed\n",
++                                       entries[i].entry);
++                              (entries + i)->vector = pirq_entry->pirq;
++                              mapped = 1;
++                              break;
++                      }
++              }
++              if (mapped)
++                      continue;
++              pirq = msi_map_vector(dev, entries[i].entry, table_base);
++              if (pirq < 0)
++                      break;
++              attach_pirq_entry(pirq, entries[i].entry, msi_dev_entry);
++              (entries + i)->vector = pirq;
++      }
++
++      if (i != nvec) {
++              int avail = i - 1;
++              for (j = --i; j >= 0; j--) {
++                      msi_unmap_pirq(dev, entries[j].vector);
++                      detach_pirq_entry(entries[j].entry, msi_dev_entry);
++                      entries[j].vector = 0;
++              }
++              /* If we had some success report the number of irqs
++               * we succeeded in setting up.
++               */
++              if (avail <= 0)
++                      avail = -EBUSY;
++              return avail;
++      }
++
++      /* Set MSI-X enabled bits and unmask the function */
++      pci_intx_for_msi(dev, 0);
++      dev->msix_enabled = 1;
++
++      control &= ~PCI_MSIX_FLAGS_MASKALL;
++      pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
++
++      return 0;
++}
++
++/**
++ * pci_msi_check_device - check whether MSI may be enabled on a device
++ * @dev: pointer to the pci_dev data structure of MSI device function
++ * @nvec: how many MSIs have been requested ?
++ * @type: are we checking for MSI or MSI-X ?
++ *
++ * Checks, in order: the global MSI switch and the device's own no_msi
++ * flag, that @nvec is at least 1, that no bus between the device and the
++ * root carries PCI_BUS_FLAGS_NO_MSI, the architecture hook, and finally
++ * that the device exposes the capability named by @type.
++ * Returns 0 if MSI/-X may be used, else a negative error code.
++ **/
++static int pci_msi_check_device(struct pci_dev *dev, int nvec, int type)
++{
++      struct pci_bus *bus;
++      int ret;
++
++      /* MSI must be globally enabled and supported by the device */
++      if (!(pci_msi_enable && dev && !dev->no_msi))
++              return -EINVAL;
++
++      /*
++       * Zero or fewer vectors makes no sense, and the list manipulation
++       * code assumes nvec >= 1.
++       */
++      if (nvec < 1)
++              return -ERANGE;
++
++      /*
++       * A bridge that does NOT forward MSI transactions from its
++       * secondary to its primary bus must set NO_MSI on the secondary
++       * pci_bus.  Only arch-specific host bridge drivers or quirks for
++       * specific PCI bridges are expected to set it.
++       */
++      bus = dev->bus;
++      while (bus) {
++              if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
++                      return -EINVAL;
++              bus = bus->parent;
++      }
++
++      ret = arch_msi_check_device(dev, nvec, type);
++      if (ret)
++              return ret;
++
++      return pci_find_capability(dev, type) ? 0 : -EINVAL;
++}
++
++/**
++ * pci_enable_msi_block - configure device's MSI capability structure
++ * @dev: device to configure
++ * @nvec: number of interrupts to configure
++ *
++ * Allocate IRQs for a device with the MSI capability.
++ * This function returns a negative errno if an error occurs.  If it
++ * is unable to allocate the number of interrupts requested, it returns
++ * the number of interrupts it might be able to allocate.  If it successfully
++ * allocates at least the number of interrupts requested, it returns 0 and
++ * updates the @dev's irq member to the lowest new interrupt number; the
++ * other interrupt numbers allocated to this device are consecutive.
++ */
++extern int pci_frontend_enable_msi(struct pci_dev *dev);
++int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
++{
++      int temp, status, pos, maxvec;
++      u16 msgctl;
++      struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
++
++      pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
++      if (!pos)
++              return -EINVAL;
++      pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
++      maxvec = 1 << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1);
++      if (nvec > maxvec)
++              return maxvec;
++
++      status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI);
++      if (status)
++              return status;
++
++#ifdef CONFIG_XEN_PCIDEV_FRONTEND
++      if (!is_initial_xendomain())
++      {
++              int ret;
++
++              temp = dev->irq;
++              WARN_ON(nvec > 1); /* XXX */
++              ret = pci_frontend_enable_msi(dev);
++              if (ret)
++                      return ret;
++
++              dev->irq = evtchn_map_pirq(-1, dev->irq);
++              dev->msi_enabled = 1;
++              msi_dev_entry->default_irq = temp;
++
++              return ret;
++      }
++#endif
++
++      temp = dev->irq;
++
++      /* Check whether driver already requested MSI-X irqs */
++      if (dev->msix_enabled) {
++              dev_info(&dev->dev, "can't enable MSI "
++                       "(MSI-X already enabled)\n");
++              return -EINVAL;
++      }
++
++      status = msi_capability_init(dev, nvec);
++      if ( !status )
++              msi_dev_entry->default_irq = temp;
++
++      return status;
++}
++EXPORT_SYMBOL(pci_enable_msi_block);
++
++extern void pci_frontend_disable_msi(struct pci_dev* dev);
++/*
++ * Turn MSI off for @dev and restore @dev->irq to the default irq that was
++ * recorded when MSI was enabled.  Does nothing if MSI is globally disabled,
++ * @dev is NULL, or MSI is not enabled on the device.
++ */
++void pci_msi_shutdown(struct pci_dev *dev)
++{
++      int pirq, pos;
++      struct msi_dev_list *msi_dev_entry;
++
++      if (!pci_msi_enable || !dev || !dev->msi_enabled)
++              return;
++
++      /* Look the list entry up only once @dev is known to be usable. */
++      msi_dev_entry = get_msi_dev_pirq_list(dev);
++
++#ifdef CONFIG_XEN_PCIDEV_FRONTEND
++      if (!is_initial_xendomain()) {
++              evtchn_map_pirq(dev->irq, 0);
++              pci_frontend_disable_msi(dev);
++              dev->irq = msi_dev_entry->default_irq;
++              dev->msi_enabled = 0;
++              return;
++      }
++#endif
++
++      pirq = dev->irq;
++      /* Restore dev->irq to its default pin-assertion vector */
++      dev->irq = msi_dev_entry->default_irq;
++      msi_unmap_pirq(dev, pirq);
++
++      /* Disable MSI mode */
++      pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
++      msi_set_enable(dev, pos, 0);
++      pci_intx_for_msi(dev, 1);
++      dev->msi_enabled = 0;
++}
++
++/* Exported entry point for disabling MSI; simply calls pci_msi_shutdown(). */
++void pci_disable_msi(struct pci_dev *dev)
++{
++      pci_msi_shutdown(dev);
++}
++EXPORT_SYMBOL(pci_disable_msi);
++
++/**
++ * pci_msix_table_size - return the number of device's MSI-X table entries
++ * @dev: pointer to the pci_dev data structure of MSI-X device function
++ *
++ * Returns 0 if the device has no MSI-X capability; otherwise the count
++ * derived from the capability's control word.
++ */
++int pci_msix_table_size(struct pci_dev *dev)
++{
++      const int cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
++      u16 msgctl;
++
++      if (cap == 0)
++              return 0;
++
++      pci_read_config_word(dev, msi_control_reg(cap), &msgctl);
++      return multi_msix_capable(msgctl);
++}
++
++/**
++ * pci_enable_msix - configure device's MSI-X capability structure
++ * @dev: pointer to the pci_dev data structure of MSI-X device function
++ * @entries: pointer to an array of MSI-X entries
++ * @nvec: number of MSI-X irqs requested for allocation by device driver
++ *
++ * Setup the MSI-X capability structure of device function with the number
++ * of requested irqs upon its software driver call to request for
++ * MSI-X mode enabled on its hardware device function. A return of zero
++ * indicates the successful configuration of MSI-X capability structure
++ * with new allocated MSI-X irqs. A return of < 0 indicates a failure.
++ * Or a return of > 0 indicates that driver request is exceeding the number
++ * of irqs or MSI-X vectors available. Driver should use the returned value to
++ * re-send its request.
++ **/
++extern int pci_frontend_enable_msix(struct pci_dev *dev,
++              struct msix_entry *entries, int nvec);
++int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
++{
++      int status, nr_entries;
++      int i, j, temp;
++      struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
++
++      if (!entries)
++              return -EINVAL;
++
++#ifdef CONFIG_XEN_PCIDEV_FRONTEND
++      if (!is_initial_xendomain()) {
++              struct msi_pirq_entry *pirq_entry;
++              int ret, irq;
++
++              temp = dev->irq;
++              ret = pci_frontend_enable_msix(dev, entries, nvec);
++              if (ret) {
++                      dev_warn(&dev->dev,
++                               "got %x from frontend_enable_msix\n", ret);
++                      return ret;
++              }
++              dev->msix_enabled = 1;
++              msi_dev_entry->default_irq = temp;
++
++              for (i = 0; i < nvec; i++) {
++                      int mapped = 0;
++
++                      list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) {
++                              if (pirq_entry->entry_nr == entries[i].entry) {
++                                      irq = pirq_entry->pirq;
++                                      BUG_ON(entries[i].vector != evtchn_get_xen_pirq(irq));
++                                      entries[i].vector = irq;
++                                      mapped = 1;
++                                      break;
++                              }
++                      }
++                      if (mapped)
++                              continue;
++                      irq = evtchn_map_pirq(-1, entries[i].vector);
++                      attach_pirq_entry(irq, entries[i].entry, msi_dev_entry);
++                      entries[i].vector = irq;
++              }
++        return 0;
++      }
++#endif
++
++      status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSIX);
++      if (status)
++              return status;
++
++      nr_entries = pci_msix_table_size(dev);
++      if (nvec > nr_entries)
++              return nr_entries;
++
++      /* Check for any invalid entries */
++      for (i = 0; i < nvec; i++) {
++              if (entries[i].entry >= nr_entries)
++                      return -EINVAL;         /* invalid entry */
++              for (j = i + 1; j < nvec; j++) {
++                      if (entries[i].entry == entries[j].entry)
++                              return -EINVAL; /* duplicate entry */
++              }
++      }
++
++      temp = dev->irq;
++      /* Check whether driver already requested for MSI vector */
++      if (dev->msi_enabled) {
++              dev_info(&dev->dev, "can't enable MSI-X "
++                     "(MSI IRQ already assigned)\n");
++              return -EINVAL;
++      }
++
++      status = msix_capability_init(dev, entries, nvec);
++
++      if ( !status )
++              msi_dev_entry->default_irq = temp;
++
++      return status;
++}
++EXPORT_SYMBOL(pci_enable_msix);
++
++extern void pci_frontend_disable_msix(struct pci_dev* dev);
++/*
++ * Turn MSI-X off for @dev.  Does nothing if MSI is globally disabled, @dev
++ * is NULL, or MSI-X is not enabled on the device.
++ *
++ * In a non-initial domain with the PCI frontend configured, the frontend
++ * is asked to disable MSI-X, every recorded pirq entry is unmapped and
++ * freed, and @dev->irq is reset to the recorded default.  Otherwise the
++ * vectors are reclaimed via msi_remove_pci_irq_vectors() and INTx is
++ * re-enabled.
++ */
++void pci_msix_shutdown(struct pci_dev *dev)
++{
++      if (!(pci_msi_enable && dev && dev->msix_enabled))
++              return;
++
++#ifdef CONFIG_XEN_PCIDEV_FRONTEND
++      if (!is_initial_xendomain()) {
++              struct msi_pirq_entry *cur, *next;
++              struct msi_dev_list *dev_list;
++
++              pci_frontend_disable_msix(dev);
++
++              dev_list = get_msi_dev_pirq_list(dev);
++              list_for_each_entry_safe(cur, next,
++                                       &dev_list->pirq_list_head, list) {
++                      evtchn_map_pirq(cur->pirq, 0);
++                      list_del(&cur->list);
++                      kfree(cur);
++              }
++
++              dev->irq = dev_list->default_irq;
++              dev->msix_enabled = 0;
++              return;
++      }
++#endif
++
++      msi_remove_pci_irq_vectors(dev);
++
++      /* Disable MSI mode */
++      msix_set_enable(dev, 0);
++      pci_intx_for_msi(dev, 1);
++      dev->msix_enabled = 0;
++}
++
++/* Exported entry point for disabling MSI-X; simply calls pci_msix_shutdown(). */
++void pci_disable_msix(struct pci_dev *dev)
++{
++      pci_msix_shutdown(dev);
++}
++EXPORT_SYMBOL(pci_disable_msix);
++
++/**
++ * msi_remove_pci_irq_vectors - reclaim MSI(X) irqs to unused state
++ * @dev: pointer to the pci_dev data structure of MSI(X) device function
++ *
++ * Being called during hotplug remove, from which the device function
++ * is hot-removed. All previous assigned MSI/MSI-X irqs, if
++ * allocated for this device function, are reclaimed to unused state,
++ * which may be used later on.
++ *
++ * Every pirq recorded for the device is unmapped and its list entry freed
++ * under the per-device list lock, then @dev->irq is reset to the recorded
++ * default irq.  Does nothing if MSI is globally disabled, @dev is NULL, or
++ * no per-device list entry can be obtained.
++ **/
++void msi_remove_pci_irq_vectors(struct pci_dev *dev)
++{
++      unsigned long flags;
++      struct msi_dev_list *msi_dev_entry;
++      struct msi_pirq_entry *pirq_entry, *tmp;
++
++      if (!pci_msi_enable || !dev)
++              return;
++
++      msi_dev_entry = get_msi_dev_pirq_list(dev);
++      /* The lookup can fail; there is nothing to reclaim in that case. */
++      if (!msi_dev_entry)
++              return;
++
++      spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
++      /* The _safe iterator copes with an empty list and with deletion. */
++      list_for_each_entry_safe(pirq_entry, tmp,
++                               &msi_dev_entry->pirq_list_head, list) {
++              msi_unmap_pirq(dev, pirq_entry->pirq);
++              list_del(&pirq_entry->list);
++              kfree(pirq_entry);
++      }
++      spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
++      dev->irq = msi_dev_entry->default_irq;
++}
++
++/* Clear the global pci_msi_enable flag that the MSI/MSI-X paths test. */
++void pci_no_msi(void)
++{
++      pci_msi_enable = 0;
++}
++
++/**
++ * pci_msi_enabled - is MSI enabled?
++ *
++ * Returns true if MSI has not been disabled by the command-line option
++ * pci=nomsi.
++ *
++ * The value returned is the global pci_msi_enable flag, which
++ * pci_no_msi() clears.
++ **/
++int pci_msi_enabled(void)
++{
++      return pci_msi_enable;
++}
++EXPORT_SYMBOL(pci_msi_enabled);
++
++/*
++ * Per-device MSI initialisation hook.  Initialises @dev->msi_list only
++ * when CONFIG_XEN is not set; with CONFIG_XEN this is a no-op.
++ */
++void pci_msi_init_pci_dev(struct pci_dev *dev)
++{
++#ifndef CONFIG_XEN
++      INIT_LIST_HEAD(&dev->msi_list);
++#endif
++}
diff --cc drivers/pci/pci-iomul.c

index 0000000,0000000..5733387

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/pci/pci-iomul.c
@@@ -1,0 -1,0 +1,437 @@@
++/*
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
++ *
++ * Copyright (c) 2009 Isaku Yamahata
++ *                    VA Linux Systems Japan K.K.
++ */
++
++#include "iomulti.h"
++#include <linux/fs.h>
++#include <linux/miscdevice.h>
++#include <linux/module.h>
++#include <asm/uaccess.h>
++#include <xen/public/iomulti.h>
++
++/*
++ * Per-open-file state of the multiplexer device; allocated in
++ * pci_iomul_open() and stored in filp->private_data.  pdev, sw, slot and
++ * func stay NULL until PCI_IOMUL_SETUP binds the file to a function.
++ */
++struct pci_iomul_data {
++      struct mutex lock;              /* taken before the fields below are used */
++
++      struct pci_dev *pdev;           /* reference dropped in release */
++      struct pci_iomul_switch *sw;    /* reference dropped in release */
++      struct pci_iomul_slot *slot;    /* slot::kref */
++      struct pci_iomul_func **func;   /* when dereferencing,
++                                         sw->lock is necessary */
++};
++
++/*
++ * Translate (@bar, @offset) of @func into an I/O port number.
++ *
++ * Returns 0 and stores the port in *@port on success.  Returns -EINVAL if
++ * @bar is not flagged in the function's io_bar mask or @offset lies
++ * outside that BAR's resource; *@port is left untouched in that case.
++ */
++static int pci_iomul_func_ioport(struct pci_iomul_func *func,
++                               uint8_t bar, uint64_t offset, int *port)
++{
++      /*
++       * @bar comes straight from the ioctl argument and may be as large
++       * as 255; shifting by the width of the operand or more is undefined.
++       */
++      if (bar >= 8 * sizeof(unsigned int))
++              return -EINVAL;
++      if (!(func->io_bar & (1U << bar)))
++              return -EINVAL;
++
++      /*
++       * Range-check the offset before adding it, so a huge 64-bit offset
++       * cannot wrap around, or be truncated into the int, and land back
++       * inside the window.
++       */
++      if (func->resource[bar].end < func->resource[bar].start ||
++          offset > func->resource[bar].end - func->resource[bar].start)
++              return -EINVAL;
++
++      *port = func->resource[bar].start + offset;
++      return 0;
++}
++
++/*
++ * Report whether @iomul can currently be used for port access: its switch
++ * must have I/O allocated and the bound function slot must be non-NULL.
++ * Both iomul->lock and iomul->sw->lock must be held (enforced by BUG_ON).
++ */
++static inline int pci_iomul_valid(struct pci_iomul_data *iomul)
++{
++      BUG_ON(!mutex_is_locked(&iomul->lock));
++      BUG_ON(!mutex_is_locked(&iomul->sw->lock));
++
++      if (!pci_iomul_switch_io_allocated(iomul->sw))
++              return 0;
++
++      return *iomul->func != NULL;
++}
++
++/*
++ * Take a reference on @pdev and set PCI_COMMAND_IO in its command
++ * register.  The reference is dropped again by __pci_iomul_disable_io().
++ */
++static void __pci_iomul_enable_io(struct pci_dev *pdev)
++{
++      uint16_t command;
++
++      pci_dev_get(pdev);
++
++      pci_read_config_word(pdev, PCI_COMMAND, &command);
++      pci_write_config_word(pdev, PCI_COMMAND, command | PCI_COMMAND_IO);
++}
++
++/*
++ * Clear PCI_COMMAND_IO on @pdev and drop the reference taken by
++ * __pci_iomul_enable_io().  Nothing is done when @iomul is not valid.
++ *
++ * NOTE(review): when @iomul is not valid the device reference is not
++ * dropped either - confirm that this cannot leak a reference.
++ */
++static void __pci_iomul_disable_io(struct pci_iomul_data *iomul,
++                                 struct pci_dev *pdev)
++{
++      uint16_t command;
++
++      if (pci_iomul_valid(iomul)) {
++              pci_read_config_word(pdev, PCI_COMMAND, &command);
++              pci_write_config_word(pdev, PCI_COMMAND,
++                                    command & ~PCI_COMMAND_IO);
++              pci_dev_put(pdev);
++      }
++}
++
++/*
++ * open() handler: allocate an unbound pci_iomul_data, attach it to the
++ * file and mark the file non-seekable.  Returns -ENOMEM if the allocation
++ * fails.
++ */
++static int pci_iomul_open(struct inode *inode, struct file *filp)
++{
++      struct pci_iomul_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
++
++      if (!data)
++              return -ENOMEM;
++
++      /* Nothing is bound until PCI_IOMUL_SETUP is issued. */
++      mutex_init(&data->lock);
++      data->pdev = NULL;
++      data->sw = NULL;
++      data->slot = NULL;
++      data->func = NULL;
++
++      filp->private_data = data;
++      return nonseekable_open(inode, filp);
++}
++
++/*
++ * release() handler: undo whatever PCI_IOMUL_SETUP established.
++ *
++ * If this file's device is the one currently routed by the switch, its I/O
++ * is disabled.  The switch user count is decremented and the I/O region is
++ * released when it reaches zero.  Finally the device, slot and switch
++ * references are dropped and the per-file data is freed.
++ */
++static int pci_iomul_release(struct inode *inode, struct file *filp)
++{
++      struct pci_iomul_data *iomul = filp->private_data;
++      struct pci_iomul_switch *sw;
++      struct pci_iomul_slot *slot;
++
++      mutex_lock(&iomul->lock);
++      sw = iomul->sw;
++      slot = iomul->slot;
++
++      if (iomul->pdev == NULL)
++              goto unlock;
++
++      if (sw != NULL) {
++              mutex_lock(&sw->lock);
++              if (sw->current_pdev == iomul->pdev) {
++                      __pci_iomul_disable_io(iomul, sw->current_pdev);
++                      sw->current_pdev = NULL;
++              }
++              sw->count--;
++              if (sw->count == 0) {
++                      release_region(sw->io_region->start,
++                                     sw->io_region->end -
++                                     sw->io_region->start + 1);
++                      sw->io_region = NULL;
++              }
++              mutex_unlock(&sw->lock);
++      }
++      pci_dev_put(iomul->pdev);
++
++unlock:
++      mutex_unlock(&iomul->lock);
++
++      if (slot != NULL)
++              pci_iomul_slot_put(slot);
++      if (sw != NULL)
++              pci_iomul_switch_put(sw);
++      kfree(iomul);
++      return 0;
++}
++
++/*
++ * PCI_IOMUL_SETUP: bind this open file to the PCI function described by
++ * the user-supplied pci_iomul_setup structure.
++ *
++ * Returns 0 on success, -EFAULT if @arg cannot be read, -ENODEV if the
++ * bus, device, switch, slot or function cannot be found or the switch has
++ * no I/O allocated, and -EBUSY if the file is already bound or the
++ * switch's I/O region cannot be reserved.
++ *
++ * On success the references on the device, switch and slot are kept and
++ * dropped later by pci_iomul_release().  On failure the device reference
++ * (and the switch reference, if one was obtained) is dropped here.
++ */
++static long pci_iomul_setup(struct pci_iomul_data *iomul,
++                          struct pci_iomul_setup __user *arg)
++{
++      long error = 0;
++      struct pci_iomul_setup setup;
++      struct pci_iomul_switch *sw = NULL;
++      struct pci_iomul_slot *slot;
++      struct pci_bus *pbus;
++      struct pci_dev *pdev;
++
++      if (copy_from_user(&setup, arg, sizeof(setup)))
++              return -EFAULT;
++
++      pbus = pci_find_bus(setup.segment, setup.bus);
++      if (pbus == NULL)
++              return -ENODEV;
++      pdev = pci_get_slot(pbus, setup.dev);
++      if (pdev == NULL)
++              return -ENODEV;
++
++      mutex_lock(&iomul->lock);
++      if (iomul->sw != NULL) {
++              error = -EBUSY;
++              goto out0;
++      }
++
++      pci_iomul_get_lock_switch(pdev, &sw, &slot);
++      if (sw == NULL || slot == NULL) {
++              error = -ENODEV;
++              goto out0;
++      }
++      if (!pci_iomul_switch_io_allocated(sw)) {
++              error = -ENODEV;
++              goto out;
++      }
++
++      /*
++       * NOTE(review): setup.func is user-supplied and indexes slot->func[]
++       * without a visible bounds check - confirm the field's range against
++       * the array size.
++       */
++      if (slot->func[setup.func] == NULL) {
++              error = -ENODEV;
++              goto out;
++      }
++
++      /* The first user of the switch reserves its I/O window. */
++      if (sw->count == 0) {
++              BUG_ON(sw->io_region != NULL);
++              sw->io_region =
++                      request_region(sw->io_base,
++                                     sw->io_limit - sw->io_base + 1,
++                                     "PCI IO Multiplexer driver");
++              if (sw->io_region == NULL) {
++                      /* sw->lock is released exactly once, at "out". */
++                      error = -EBUSY;
++                      goto out;
++              }
++      }
++      sw->count++;
++      pci_iomul_slot_get(slot);
++
++      iomul->pdev = pdev;
++      iomul->sw = sw;
++      iomul->slot = slot;
++      iomul->func = &slot->func[setup.func];
++
++out:
++      mutex_unlock(&sw->lock);
++out0:
++      mutex_unlock(&iomul->lock);
++      if (error != 0) {
++              if (sw != NULL)
++                      pci_iomul_switch_put(sw);
++              pci_dev_put(pdev);
++      }
++      return error;
++}
++
++/*
++ * Lock @iomul and its switch, and hand back the switch and the bound
++ * function through @sw and @func.
++ *
++ * Returns 0 with BOTH iomul->lock and (*sw)->lock held; the caller must
++ * release them.  Returns -ENODEV with no lock held if the file is unbound
++ * or no longer valid.
++ */
++static int pci_iomul_lock(struct pci_iomul_data *iomul,
++                        struct pci_iomul_switch **sw,
++                        struct pci_iomul_func **func)
++{
++      mutex_lock(&iomul->lock);
++
++      *sw = iomul->sw;
++      if (*sw == NULL)
++              goto fail_unlock_iomul;
++
++      mutex_lock(&(*sw)->lock);
++      if (!pci_iomul_valid(iomul))
++              goto fail_unlock_sw;
++
++      *func = *iomul->func;
++      return 0;
++
++fail_unlock_sw:
++      mutex_unlock(&(*sw)->lock);
++fail_unlock_iomul:
++      mutex_unlock(&iomul->lock);
++      return -ENODEV;
++}
++
++/*
++ * PCI_IOMUL_DISABLE_IO: if this file's device is the one currently routed
++ * by the switch, disable its I/O and clear the switch's current device.
++ * Returns -ENODEV if the file cannot be locked or has no device bound.
++ */
++static long pci_iomul_disable_io(struct pci_iomul_data *iomul)
++{
++      struct pci_iomul_switch *sw;
++      struct pci_iomul_func *unused;
++      struct pci_dev *pdev;
++      long error = 0;
++
++      if (pci_iomul_lock(iomul, &sw, &unused) < 0)
++              return -ENODEV;
++
++      pdev = iomul->pdev;
++      if (pdev == NULL) {
++              error = -ENODEV;
++      } else if (sw->current_pdev == pdev) {
++              __pci_iomul_disable_io(iomul, pdev);
++              sw->current_pdev = NULL;
++      }
++
++      mutex_unlock(&sw->lock);
++      mutex_unlock(&iomul->lock);
++      return error;
++}
++
++/*
++ * Make @next_pdev the device whose I/O is enabled behind @sw: disable I/O
++ * on the currently routed device (if any) and enable it on @next_pdev.
++ * A no-op when @next_pdev is already current.
++ */
++static void pci_iomul_switch_to(
++      struct pci_iomul_data *iomul, struct pci_iomul_switch *sw,
++      struct pci_dev *next_pdev)
++{
++      struct pci_dev *prev_pdev = sw->current_pdev;
++
++      if (prev_pdev != next_pdev) {
++              if (prev_pdev != NULL)
++                      __pci_iomul_disable_io(iomul, prev_pdev);
++
++              __pci_iomul_enable_io(next_pdev);
++              sw->current_pdev = next_pdev;
++      }
++}
++
++/*
++ * PCI_IOMUL_IN: read 1, 2 or 4 bytes from an I/O port of the bound
++ * function and store the result in @arg->value.
++ *
++ * Returns -EFAULT if @arg cannot be read or written, -ENODEV if the file
++ * is not usable, and -EINVAL for a bad BAR/offset or an unsupported size.
++ */
++static long pci_iomul_in(struct pci_iomul_data *iomul,
++                       struct pci_iomul_in __user *arg)
++{
++      struct pci_iomul_switch *sw;
++      struct pci_iomul_func *func;
++      struct pci_iomul_in req;
++      uint32_t result = 0;
++      long error;
++      int port;
++
++      if (copy_from_user(&req, arg, sizeof(req)))
++              return -EFAULT;
++
++      if (pci_iomul_lock(iomul, &sw, &func) < 0)
++              return -ENODEV;
++
++      error = pci_iomul_func_ioport(func, req.bar, req.offset, &port);
++      if (error == 0) {
++              /* Route the switch to our device before touching the port. */
++              pci_iomul_switch_to(iomul, sw, iomul->pdev);
++              switch (req.size) {
++              case 1:
++                      result = inb(port);
++                      break;
++              case 2:
++                      result = inw(port);
++                      break;
++              case 4:
++                      result = inl(port);
++                      break;
++              default:
++                      error = -EINVAL;
++                      break;
++              }
++      }
++
++      mutex_unlock(&sw->lock);
++      mutex_unlock(&iomul->lock);
++
++      if (error != 0)
++              return error;
++      return put_user(result, &arg->value) ? -EFAULT : 0;
++}
++
++/*
++ * PCI_IOMUL_OUT: write 1, 2 or 4 bytes of @arg->value to an I/O port of
++ * the bound function.
++ *
++ * Returns -EFAULT if @arg cannot be read, -ENODEV if the file is not
++ * usable, and -EINVAL for a bad BAR/offset or an unsupported size.
++ */
++static long pci_iomul_out(struct pci_iomul_data *iomul,
++                        struct pci_iomul_out __user *arg)
++{
++      /* The request is a pci_iomul_out, matching the type of @arg. */
++      struct pci_iomul_out out;
++      struct pci_iomul_switch *sw;
++      struct pci_iomul_func *func;
++
++      long error = 0;
++      int port;
++
++      if (copy_from_user(&out, arg, sizeof(out)))
++              return -EFAULT;
++
++      if (pci_iomul_lock(iomul, &sw, &func) < 0)
++              return -ENODEV;
++
++      error = pci_iomul_func_ioport(func, out.bar, out.offset, &port);
++      if (error)
++              goto unlock;
++
++      /* Route the switch to our device before touching the port. */
++      pci_iomul_switch_to(iomul, sw, iomul->pdev);
++      switch (out.size) {
++      case 4:
++              outl(out.value, port);
++              break;
++      case 2:
++              outw(out.value, port);
++              break;
++      case 1:
++              outb(out.value, port);
++              break;
++      default:
++              error = -EINVAL;
++              break;
++      }
++
++unlock:
++      mutex_unlock(&sw->lock);
++      mutex_unlock(&iomul->lock);
++      return error;
++}
++
++/*
++ * ioctl dispatcher.  The caller needs both CAP_SYS_ADMIN and
++ * CAP_SYS_RAWIO, otherwise -EPERM is returned.  Unknown commands yield
++ * -ENOSYS.
++ */
++static long pci_iomul_ioctl(struct file *filp,
++                          unsigned int cmd, unsigned long arg)
++{
++      struct pci_iomul_data *iomul = filp->private_data;
++
++      if (!(capable(CAP_SYS_ADMIN) && capable(CAP_SYS_RAWIO)))
++              return -EPERM;
++
++      switch (cmd) {
++      case PCI_IOMUL_SETUP:
++              return pci_iomul_setup(iomul,
++                                     (struct pci_iomul_setup __user *)arg);
++      case PCI_IOMUL_DISABLE_IO:
++              return pci_iomul_disable_io(iomul);
++      case PCI_IOMUL_IN:
++              return pci_iomul_in(iomul, (struct pci_iomul_in __user *)arg);
++      case PCI_IOMUL_OUT:
++              return pci_iomul_out(iomul,
++                                   (struct pci_iomul_out __user *)arg);
++      default:
++              return -ENOSYS;
++      }
++}
++
++/* File operations of the multiplexer device: open, release and ioctl only. */
++static const struct file_operations pci_iomul_fops = {
++      .owner = THIS_MODULE,
++
++      .open = pci_iomul_open,
++      .release = pci_iomul_release,
++
++      .unlocked_ioctl = pci_iomul_ioctl,
++};
++
++/* Misc device registered by pci_iomul_init(); the minor is assigned dynamically. */
++static struct miscdevice pci_iomul_miscdev = {
++      .minor = MISC_DYNAMIC_MINOR,
++      .name = "pci_iomul",
++      .nodename = "xen/pci_iomul",
++      .fops = &pci_iomul_fops,
++};
++
++/*
++ * Register the multiplexer misc device.  Returns 0 on success or the
++ * error reported by misc_register().
++ */
++static int __init pci_iomul_init(void)
++{
++      int error;
++
++      error = misc_register(&pci_iomul_miscdev);
++      if (error != 0) {
++              /* Terminate the message so it is not merged with the next one. */
++              pr_alert("Couldn't register /dev/xen/pci_iomul\n");
++              return error;
++      }
++      pr_info("PCI IO multiplexer device installed\n");
++      return 0;
++}
++
++#ifdef MODULE
++/* Module unload (built only when MODULE is defined): unregister the misc device. */
++static void __exit pci_iomul_cleanup(void)
++{
++      misc_deregister(&pci_iomul_miscdev);
++}
++module_exit(pci_iomul_cleanup);
++#endif
++
++/*
++ * This must be called after pci fixup final which is called by
++ * device_initcall(pci_init).
++ */
++late_initcall(pci_iomul_init);
++
++MODULE_ALIAS("devname:xen/pci_iomul");
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("Isaku Yamahata <yamahata@valinux.co.jp>");
++MODULE_DESCRIPTION("PCI IO space multiplexing driver");
diff --cc drivers/pci/pci.c

index 56098b3,2472e71..31faf34
--- 1/drivers/pci/pci.c
--- 2/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@@ -471,7 -471,7 +471,12 @@@ pci_find_parent_resource(const struct p
    * Restore the BAR values for a given device, so as to make it
    * accessible by its driver.
    */
++#ifndef CONFIG_XEN
   static void
++#else
++EXPORT_SYMBOL_GPL(pci_restore_bars);
++void
++#endif
   pci_restore_bars(struct pci_dev *dev)
   {
         int i;
@@@ -3402,6 -2990,6 +2995,13 @@@ resource_size_t pci_specified_resource_
    */
   int pci_is_reassigndev(struct pci_dev *dev)
   {
++#ifdef CONFIG_PCI_GUESTDEV
++      int result;
++
++      result = pci_is_guestdev_to_reassign(dev);
++      if (result)
++              return result;
++#endif /* CONFIG_PCI_GUESTDEV */
         return (pci_specified_resource_alignment(dev) != 0);
   }
   
diff --cc drivers/pci/pci.h

index 731e202,a6ec200..5c453ce
--- 1/drivers/pci/pci.h
--- 2/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@@ -312,4 -350,4 +350,26 @@@ static inline int pci_dev_specific_rese
   }
   #endif
   
++#ifdef CONFIG_PCI_GUESTDEV
++extern int pci_is_guestdev_to_reassign(struct pci_dev *dev);
++extern int pci_is_iomuldev(struct pci_dev *dev);
++#else
++#define pci_is_iomuldev(dev)  0
++#endif
++
++#ifdef CONFIG_PCI_RESERVE
++unsigned long pci_reserve_size_io(struct pci_bus *bus);
++unsigned long pci_reserve_size_mem(struct pci_bus *bus);
++#else
++static inline unsigned long pci_reserve_size_io(struct pci_bus *bus)
++{
++      return 0;
++}
++
++static inline unsigned long pci_reserve_size_mem(struct pci_bus *bus)
++{
++      return 0;
++}
++#endif /* CONFIG_PCI_RESERVE */
++
   #endif /* DRIVERS_PCI_H */
diff --cc drivers/pci/probe.c

index 48849ff,44cbbba..afa34b1
--- 1/drivers/pci/probe.c
--- 2/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@@ -1180,6 -1216,6 +1216,11 @@@ static void pci_init_capabilities(struc
         /* Vital Product Data */
         pci_vpd_pci22_init(dev);
   
++#ifdef CONFIG_XEN
++      if (!is_initial_xendomain())
++              return;
++#endif
++
         /* Alternative Routing-ID Forwarding */
         pci_enable_ari(dev);
   
@@@ -1301,13 -1337,13 +1342,20 @@@ int pci_scan_slot(struct pci_bus *bus, 
                 return 0; /* Already scanned the entire slot */
   
         dev = pci_scan_single_device(bus, devfn);
--      if (!dev)
++      if (!dev) {
++#ifdef pcibios_scan_all_fns
++              if (!pcibios_scan_all_fns(bus, devfn))
++#endif
                 return 0;
--      if (!dev->is_added)
++      } else if (!dev->is_added)
                 nr++;
   
         if (pci_ari_enabled(bus))
                 next_fn = next_ari_fn;
++#ifdef pcibios_scan_all_fns
++      else if (pcibios_scan_all_fns(bus, devfn))
++              next_fn = next_trad_fn;
++#endif
         else if (dev->multifunction)
                 next_fn = next_trad_fn;
   
diff --cc drivers/pci/reserve.c

index 0000000,0000000..97b23fb

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/pci/reserve.c
@@@ -1,0 -1,0 +1,137 @@@
++/*
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
++ *
++ * Copyright (c) 2009 Isaku Yamahata
++ *                    VA Linux Systems Japan K.K.
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++
++#include <asm/setup.h>
++
++static char pci_reserve_param[COMMAND_LINE_SIZE];
++
++/* pci_reserve=       [PCI]
++ * Format: [<sbdf>[+IO<size>][+MEM<size>]][,<sbdf>...]
++ * Format of sbdf: [<segment>:]<bus>:<dev>.<func>
++ */
++/*
++ * Parse one size token of the form "io<hex>"/"IO<hex>" or
++ * "mem<hex>"/"MEM<hex>".
++ *
++ * On a match the value is stored through @io_size or @mem_size
++ * respectively and 0 is returned; otherwise -EINVAL is returned.
++ */
++static int pci_reserve_parse_size(const char *str,
++                                unsigned long *io_size,
++                                unsigned long *mem_size)
++{
++      /* I/O size, lower case first, then upper case. */
++      if (sscanf(str, "io%lx", io_size) == 1)
++              return 0;
++      if (sscanf(str, "IO%lx", io_size) == 1)
++              return 0;
++
++      /* Memory size, same order. */
++      if (sscanf(str, "mem%lx", mem_size) == 1)
++              return 0;
++      if (sscanf(str, "MEM%lx", mem_size) == 1)
++              return 0;
++
++      return -EINVAL;
++}
++
++/*
++ * Parse a single "pci_reserve=" entry: an sbdf followed by one or two
++ * '+'-prefixed size tokens.
++ *
++ * @seg, @bus, @dev and @func receive the address; @seg is set to 0 when
++ * the entry has no segment part.  @io_size and @mem_size are zeroed
++ * first and then filled from the size tokens that are present.
++ *
++ * Returns 0 on success, -EINVAL if the address cannot be parsed, if no
++ * size token follows it, or if a size token is malformed.
++ */
++static int pci_reserve_parse_one(const char *str,
++                               int *seg, int *bus, int *dev, int *func,
++                               unsigned long *io_size,
++                               unsigned long *mem_size)
++{
++      char *plus;
++
++      *io_size = 0;
++      *mem_size = 0;
++
++      /* Try the full form first, then fall back to the segment-less one. */
++      if (sscanf(str, "%x:%x:%x.%x", seg, bus, dev, func) != 4) {
++              *seg = 0;
++              if (sscanf(str, "%x:%x.%x", bus, dev, func) != 3)
++                      return -EINVAL;
++      }
++
++      /* The first size token is mandatory. */
++      plus = strchr(str, '+');
++      if (plus == NULL)
++              return -EINVAL;
++      plus++;
++      if (pci_reserve_parse_size(plus, io_size, mem_size))
++              return -EINVAL;
++
++      /* The second size token is optional. */
++      plus = strchr(plus, '+');
++      if (plus == NULL)
++              return 0;
++      return pci_reserve_parse_size(plus + 1, io_size, mem_size);
++}
++
++/*
++ * Look up the size requested on the "pci_reserve=" command line for
++ * @pbus.
++ *
++ * An entry matches when its segment equals pci_domain_nr(pbus), its bus
++ * equals pbus->number, and its dev/func equal the slot and function of
++ * pbus->self->devfn.
++ *
++ * @flags selects what is reported for the first matching entry:
++ * IORESOURCE_IO returns its I/O size, IORESOURCE_MEM its memory size.
++ * Returns 0 when nothing matches, when @flags is neither of the two, or
++ * when @pbus has no pbus->self device to compare against.
++ *
++ * Each comma-separated entry is NUL-terminated in place while it is
++ * parsed.  The comma is always put back before the function returns;
++ * otherwise the entries after a matched one would be invisible to every
++ * later lookup.
++ */
++static unsigned long pci_reserve_size(struct pci_bus *pbus, int flags)
++{
++      char *sp;
++      char *ep;
++
++      int seg;
++      int bus;
++      int dev;
++      int func;
++
++      unsigned long io_size;
++      unsigned long mem_size;
++      unsigned long size = 0;
++      int found = 0;
++
++      /* Matching needs pbus->self->devfn; bail out if there is no device. */
++      if (!pbus->self)
++              return 0;
++
++      sp = pci_reserve_param;
++
++      do {
++              ep = strchr(sp, ',');
++              if (ep)
++                      *ep = '\0';     /* chomp */
++
++              if (pci_reserve_parse_one(sp, &seg, &bus, &dev, &func,
++                                        &io_size, &mem_size) == 0 &&
++                  pci_domain_nr(pbus) == seg &&
++                  pbus->number == bus &&
++                  PCI_SLOT(pbus->self->devfn) == dev &&
++                  PCI_FUNC(pbus->self->devfn) == func) {
++                      switch (flags) {
++                      case IORESOURCE_IO:
++                              size = io_size;
++                              found = 1;
++                              break;
++                      case IORESOURCE_MEM:
++                              size = mem_size;
++                              found = 1;
++                              break;
++                      default:
++                              break;
++                      }
++              }
++
++              if (ep) {
++                      *ep = ',';      /* restore chomp'ed ',' for later */
++                      ep++;
++              }
++
++              /* Only return once the parameter string is intact again. */
++              if (found)
++                      return size;
++
++              sp = ep;
++      } while (ep);
++
++      return 0;
++}
++
++unsigned long pci_reserve_size_io(struct pci_bus *pbus)
++{
++      return pci_reserve_size(pbus, IORESOURCE_IO);
++}
++
++unsigned long pci_reserve_size_mem(struct pci_bus *pbus)
++{
++      return pci_reserve_size(pbus, IORESOURCE_MEM);
++}
++
++static int __init pci_reserve_setup(char *str)
++{
++      if (strlen(str) >= sizeof(pci_reserve_param))
++              return 0;
++      strlcpy(pci_reserve_param, str, sizeof(pci_reserve_param));
++      return 1;
++}
++__setup("pci_reserve=", pci_reserve_setup);
diff --cc drivers/pci/setup-bus.c

index 1e9e5a5,a806cb3..55077c4
--- 1/drivers/pci/setup-bus.c
--- 2/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@@ -554,7 -554,7 +554,7 @@@ static void pbus_size_io(struct pci_bu
   {
         struct pci_dev *dev;
         struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
--      unsigned long size = 0, size0 = 0, size1 = 0;
++      unsigned long size = 0, size0 = 0, size1 = 0, res_size;
   
         if (!b_res)
                 return;
@@@ -582,6 -582,6 +582,11 @@@
         size1 = (!add_head || (add_head && !add_size)) ? size0 :
                 calculate_iosize(size, min_size+add_size, size1,
                         resource_size(b_res), 4096);
++      res_size = pci_reserve_size_io(bus);
++      if (size0 < res_size)
++              size0 = ALIGN(res_size, 4096);
++      if (size1 < res_size)
++              size1 = ALIGN(res_size, 4096);
         if (!size0 && !size1) {
                 if (b_res->start || b_res->end)
                         dev_info(&bus->self->dev, "disabling bridge window "
@@@ -676,6 -676,6 +681,7 @@@ static int pbus_size_mem(struct pci_bu
                         min_align = align1 >> 1;
                 align += aligns[order];
         }
++      size = max(size, (resource_size_t)pci_reserve_size_mem(bus));
         size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align);
         size1 = (!add_head || (add_head && !add_size)) ? size0 :
                 calculate_memsize(size, min_size+add_size, 0,
diff --cc drivers/pci/xen-pcifront.c

index 492b7d8,492b7d8..0c2acbc
--- 1/drivers/pci/xen-pcifront.c
--- 2/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@@ -1129,7 -1129,7 +1129,6 @@@ static const struct xenbus_device_id xe
   
   static struct xenbus_driver xenbus_pcifront_driver = {
         .name                   = "pcifront",
--      .owner                  = THIS_MODULE,
         .ids                    = xenpci_ids,
         .probe                  = pcifront_xenbus_probe,
         .remove                 = pcifront_xenbus_remove,
diff --cc drivers/scsi/Kconfig

index 4a1f029,4a1f029..3019056
--- 1/drivers/scsi/Kconfig
--- 2/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@@ -660,7 -660,7 +660,7 @@@ config SCSI_FLASHPOIN
   
   config VMWARE_PVSCSI
         tristate "VMware PVSCSI driver support"
--      depends on PCI && SCSI && X86
++      depends on PCI && SCSI && !XEN && X86
         help
           This driver supports VMware's para virtualized SCSI HBA.
           To compile this driver as a module, choose M here: the
diff --cc drivers/scsi/arcmsr/arcmsr.h

index 77b26f5,77b26f5..302efaf
--- 1/drivers/scsi/arcmsr/arcmsr.h
--- 2/drivers/scsi/arcmsr/arcmsr.h
+++ b/drivers/scsi/arcmsr/arcmsr.h
@@@ -46,7 -46,7 +46,7 @@@
   struct device_attribute;
   /*The limit of outstanding scsi command that firmware can handle*/
   #define ARCMSR_MAX_OUTSTANDING_CMD                                            256
--#ifdef CONFIG_XEN
++#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
         #define ARCMSR_MAX_FREECCB_NUM  160
   #else
         #define ARCMSR_MAX_FREECCB_NUM  320
diff --cc drivers/scsi/device_handler/scsi_dh_alua.c
Simple merge
diff --cc drivers/scsi/device_handler/scsi_dh_rdac.c
Simple merge
diff --cc drivers/scsi/ibmvscsi/ibmvscsi.c
Simple merge
diff --cc drivers/scsi/megaraid/megaraid_mbox.c
Simple merge
diff --cc drivers/scsi/scsi_error.c
Simple merge
diff --cc drivers/scsi/scsi_proc.c

index ad747dc,c99da92..cd18b60
--- 1/drivers/scsi/scsi_proc.c
--- 2/drivers/scsi/scsi_proc.c
+++ b/drivers/scsi/scsi_proc.c
@@@ -381,59 -381,18 +381,64 @@@ static ssize_t proc_scsi_write(struct f
         return err;
   }
   
+ /**
+  * proc_scsi_show - show contents of /proc/scsi/scsi (attached devices)
+  * @s: output goes here
+  * @p: not used
+  */
- -static int proc_scsi_show(struct seq_file *s, void *p)
+ +static int always_match(struct device *dev, void *data)
   {
- -      seq_printf(s, "Attached devices:\n");
- -      bus_for_each_dev(&scsi_bus_type, NULL, s, proc_print_scsidevice);
- -      return 0;
+ +      return 1;
+ +}
+ +
+ +static inline struct device *next_scsi_device(struct device *start)
+ +{
+ +      struct device *next = bus_find_device(&scsi_bus_type, start, NULL,
+ +                                            always_match);
+ +      put_device(start);
+ +      return next;
   }
   
+ +static void *scsi_seq_start(struct seq_file *sfile, loff_t *pos)
+ +{
+ +      struct device *dev = NULL;
+ +      loff_t n = *pos;
+ +
+ +      while ((dev = next_scsi_device(dev))) {
+ +              if (!n--)
+ +                      break;
+ +              sfile->private++;
+ +      }
+ +      return dev;
+ +}
+ +
+ +static void *scsi_seq_next(struct seq_file *sfile, void *v, loff_t *pos)
+ +{
+ +      (*pos)++;
+ +      sfile->private++;
+ +      return next_scsi_device(v);
+ +}
+ +
+ +static void scsi_seq_stop(struct seq_file *sfile, void *v)
+ +{
+ +      put_device(v);
+ +}
+ +
+ +static int scsi_seq_show(struct seq_file *sfile, void *dev)
+ +{
+ +      if (!sfile->private)
+ +              seq_puts(sfile, "Attached devices:\n");
+ +
+ +      return proc_print_scsidevice(dev, sfile);
+ +}
+ +
- static const struct seq_operations scsi_seq_ops = {
++static struct seq_operations scsi_seq_ops = {
+ +      .start  = scsi_seq_start,
+ +      .next   = scsi_seq_next,
+ +      .stop   = scsi_seq_stop,
+ +      .show   = scsi_seq_show
+ +};
+ +
   /**
    * proc_scsi_open - glue function
    * @inode: not used
diff --cc drivers/scsi/sd.c
Simple merge
diff --cc drivers/sfi/sfi_core.c

index 1e824fb,1e824fb..5d34c8b
--- 1/drivers/sfi/sfi_core.c
--- 2/drivers/sfi/sfi_core.c
+++ b/drivers/sfi/sfi_core.c
@@@ -486,6 -486,6 +486,11 @@@ void __init sfi_init(void
         if (!acpi_disabled)
                 disable_sfi();
   
++#ifdef CONFIG_XEN
++      if (!is_initial_xendomain())
++              disable_sfi();
++#endif
++
         if (sfi_disabled)
                 return;
   
diff --cc drivers/staging/hv/Kconfig

index 5e0c9f6,d41f380..745f69e
--- 1/drivers/staging/hv/Kconfig
--- 2/drivers/staging/hv/Kconfig
+++ b/drivers/staging/hv/Kconfig
@@@ -1,6 -1,6 +1,6 @@@
   config HYPERV
         tristate "Microsoft Hyper-V client drivers"
-       depends on X86 && ACPI && PCI && m
- -      depends on X86 && m
++      depends on X86 && !XEN && m
         default n
         help
           Select this option to run Linux as a Hyper-V client operating
diff --cc drivers/staging/vt6655/ttype.h

index be223bd,be223bd..cf620dc
--- 1/drivers/staging/vt6655/ttype.h
--- 2/drivers/staging/vt6655/ttype.h
+++ b/drivers/staging/vt6655/ttype.h
@@@ -30,6 -30,6 +30,9 @@@
   #ifndef __TTYPE_H__
   #define __TTYPE_H__
   
++#ifdef CONFIG_XEN
++#include <asm/hypervisor.h>
++#endif
   
   /******* Common definitions and typedefs ***********************************/
   
diff --cc drivers/staging/vt6656/ttype.h

index 8e9450e,8e9450e..07e023b
--- 1/drivers/staging/vt6656/ttype.h
--- 2/drivers/staging/vt6656/ttype.h
+++ b/drivers/staging/vt6656/ttype.h
@@@ -29,6 -29,6 +29,10 @@@
   #ifndef __TTYPE_H__
   #define __TTYPE_H__
   
++#ifdef CONFIG_XEN
++#include <asm/hypervisor.h>
++#endif
++
   /******* Common definitions and typedefs ***********************************/
   
   typedef int             BOOL;
diff --cc drivers/tty/hvc/Kconfig

index 6f2c980,6f2c980..ab625cb
--- 1/drivers/tty/hvc/Kconfig
--- 2/drivers/tty/hvc/Kconfig
+++ b/drivers/tty/hvc/Kconfig
@@@ -55,7 -55,7 +55,7 @@@ config HVC_IUC
   
   config HVC_XEN
         bool "Xen Hypervisor Console support"
--      depends on XEN
++      depends on PARAVIRT_XEN
         select HVC_DRIVER
         select HVC_IRQ
         default y
diff --cc drivers/tty/n_tty.c
Simple merge
diff --cc drivers/tty/serial/8250.c
Simple merge
diff --cc drivers/tty/serial/Kconfig

index 636144c,80484af..4976976
--- 1/drivers/tty/serial/Kconfig
--- 2/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@@ -9,6 -9,6 +9,7 @@@ menu "Serial drivers
   # The new 8250/16550 serial drivers
   config SERIAL_8250
         tristate "8250/16550 and compatible serial support"
++      depends on !XEN_DISABLE_SERIAL
         select SERIAL_CORE
         ---help---
           This selects whether you want to include the driver for the standard
diff --cc drivers/tty/tty_io.c

index 6556f74,d7d50b4..967543b
--- 1/drivers/tty/tty_io.c
--- 2/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@@ -137,6 -139,6 +139,8 @@@ EXPORT_SYMBOL(tty_mutex)
   /* Spinlock to protect the tty->tty_files list */
   DEFINE_SPINLOCK(tty_files_lock);
   
++int console_use_vt = 1;
++
   static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *);
   static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *);
   ssize_t redirected_tty_write(struct file *, const char __user *,
@@@ -1835,7 -1833,7 +1835,7 @@@ retry_open
                 goto got_driver;
         }
   #ifdef CONFIG_VT
--      if (device == MKDEV(TTY_MAJOR, 0)) {
++      if (console_use_vt && device == MKDEV(TTY_MAJOR, 0)) {
                 extern struct tty_driver *console_driver;
                 driver = tty_driver_kref_get(console_driver);
                 index = fg_console;
@@@ -3327,7 -3312,7 +3314,8 @@@ int __init tty_init(void
                 WARN_ON(device_create_file(consdev, &dev_attr_active) < 0);
   
   #ifdef CONFIG_VT
--      vty_init(&console_fops);
++      if (console_use_vt)
++              vty_init(&console_fops);
   #endif
         return 0;
   }
diff --cc drivers/tty/vt/keyboard.c
Simple merge
diff --cc drivers/tty/vt/vt.c
Simple merge
diff --cc drivers/video/Kconfig

index ad9452d,e6a8d8c..30529d6
--- 1/drivers/video/Kconfig
--- 2/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@@ -2237,7 -2209,7 +2209,7 @@@ config FB_VIRTUA
   
   config XEN_FBDEV_FRONTEND
         tristate "Xen virtual frame buffer support"
--      depends on FB && XEN
++      depends on FB && PARAVIRT_XEN
         select FB_SYS_FILLRECT
         select FB_SYS_COPYAREA
         select FB_SYS_IMAGEBLIT
diff --cc drivers/video/Makefile
Simple merge
diff --cc drivers/video/xen-fbfront.c

index a20218c,a20218c..68f7806
--- 1/drivers/video/xen-fbfront.c
--- 2/drivers/video/xen-fbfront.c
+++ b/drivers/video/xen-fbfront.c
@@@ -679,7 -679,7 +679,6 @@@ static struct xenbus_device_id xenfb_id
   
   static struct xenbus_driver xenfb_driver = {
         .name = "vfb",
--      .owner = THIS_MODULE,
         .ids = xenfb_ids,
         .probe = xenfb_probe,
         .remove = xenfb_remove,
diff --cc drivers/watchdog/Kconfig

index 022f9eb,1b0f98b..edee30b
--- 1/drivers/watchdog/Kconfig
--- 2/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@@ -1139,7 -1133,7 +1133,7 @@@ config WATCHDOG_RI
   
   config XEN_WDT
         tristate "Xen Watchdog support"
--      depends on XEN
++      depends on XEN || PARAVIRT_XEN
         help
           Say Y here to support the hypervisor watchdog capability provided
           by Xen 4.0 and newer.  The watchdog timeout period is normally one
diff --cc drivers/watchdog/xen_wdt.c

index 49bd9d3,49bd9d3..0c619dd
--- 1/drivers/watchdog/xen_wdt.c
--- 2/drivers/watchdog/xen_wdt.c
+++ b/drivers/watchdog/xen_wdt.c
@@@ -1,7 -1,7 +1,7 @@@
   /*
    *    Xen Watchdog Driver
    *
-- *    (c) Copyright 2010 Novell, Inc.
++ *    (c) Copyright 2010,2011 Novell, Inc.
    *
    *    This program is free software; you can redistribute it and/or
    *    modify it under the terms of the GNU General Public License
@@@ -11,7 -11,7 +11,7 @@@
   
   #define DRV_NAME      "wdt"
   #define DRV_VERSION   "0.01"
--#define PFX           DRV_NAME ": "
++#define pr_fmt(fmt)   KBUILD_MODNAME ": " fmt
   
   #include <linux/bug.h>
   #include <linux/errno.h>
@@@ -27,8 -27,8 +27,10 @@@
   #include <linux/spinlock.h>
   #include <linux/uaccess.h>
   #include <linux/watchdog.h>
++#ifdef CONFIG_PARAVIRT_XEN
   #include <xen/xen.h>
   #include <asm/xen/hypercall.h>
++#endif
   #include <xen/interface/sched.h>
   
   static struct platform_device *platform_device;
@@@ -134,8 -134,8 +136,7 @@@ static int xen_wdt_release(struct inod
         if (expect_release)
                 xen_wdt_stop();
         else {
--              printk(KERN_CRIT PFX
--                     "unexpected close, not stopping watchdog!\n");
++              pr_crit("unexpected close, not stopping watchdog!\n");
                 xen_wdt_kick();
         }
         is_active = false;
@@@ -251,30 -251,30 +252,27 @@@ static int __devinit xen_wdt_probe(stru
         case -EINVAL:
                 if (!timeout) {
                         timeout = WATCHDOG_TIMEOUT;
--                      printk(KERN_INFO PFX
--                             "timeout value invalid, using %d\n", timeout);
++                      pr_info("timeout value invalid, using %d\n", timeout);
                 }
   
                 ret = misc_register(&xen_wdt_miscdev);
                 if (ret) {
--                      printk(KERN_ERR PFX
--                             "cannot register miscdev on minor=%d (%d)\n",
++                      pr_err("cannot register miscdev on minor=%d (%d)\n",
                                WATCHDOG_MINOR, ret);
                         break;
                 }
   
--              printk(KERN_INFO PFX
--                     "initialized (timeout=%ds, nowayout=%d)\n",
--                     timeout, nowayout);
++              pr_info("initialized (timeout=%ds, nowayout=%d)\n",
++                      timeout, nowayout);
                 break;
   
         case -ENOSYS:
--              printk(KERN_INFO PFX "not supported\n");
++              pr_info("not supported\n");
                 ret = -ENODEV;
                 break;
   
         default:
--              printk(KERN_INFO PFX "bogus return value %d\n", ret);
++              pr_info("bogus return value %d\n", ret);
                 break;
         }
   
@@@ -323,17 -323,17 +321,19 @@@ static int __init xen_wdt_init_module(v
   {
         int err;
   
++#ifdef CONFIG_PARAVIRT_XEN
         if (!xen_domain())
                 return -ENODEV;
++#endif
   
--      printk(KERN_INFO PFX "Xen WatchDog Timer Driver v%s\n", DRV_VERSION);
++      printk(KERN_INFO "Xen WatchDog Timer Driver v%s\n", DRV_VERSION);
   
         err = platform_driver_register(&xen_wdt_driver);
         if (err)
                 return err;
   
         platform_device = platform_device_register_simple(DRV_NAME,
--                                                                -1, NULL, 0);
++                                                        -1, NULL, 0);
         if (IS_ERR(platform_device)) {
                 err = PTR_ERR(platform_device);
                 platform_driver_unregister(&xen_wdt_driver);
@@@ -346,7 -346,7 +346,7 @@@ static void __exit xen_wdt_cleanup_modu
   {
         platform_device_unregister(platform_device);
         platform_driver_unregister(&xen_wdt_driver);
--      printk(KERN_INFO PFX "module unloaded\n");
++      pr_info("module unloaded\n");
   }
   
   module_init(xen_wdt_init_module);
diff --cc drivers/xen/Kconfig

index a59638b,a59638b..aa47afb
--- 1/drivers/xen/Kconfig
--- 2/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@@ -1,8 -1,8 +1,423 @@@
++#
++# This Kconfig describes Xen options
++#
++
++config XEN
++      bool
++
++if XEN
++config XEN_INTERFACE_VERSION
++      hex
++      default 0x00030207
++
++menu "XEN"
++
++config XEN_PRIVILEGED_GUEST
++      bool "Privileged Guest (domain 0)"
++      help
++        Support for privileged operation (domain 0)
++
++config XEN_UNPRIVILEGED_GUEST
++      def_bool y
++      depends on !XEN_PRIVILEGED_GUEST
++      select PM
++      select SUSPEND
++
++config XEN_PRIVCMD
++      def_bool y
++      depends on PROC_FS
++
++config XEN_DOMCTL
++      tristate
++
++config XEN_XENBUS_DEV
++      def_bool y
++      depends on PROC_FS
++
++config XEN_NETDEV_ACCEL_SFC_UTIL
++      depends on X86
++      tristate
++
++config XEN_BACKEND
++        tristate "Backend driver support"
++        default XEN_PRIVILEGED_GUEST
++        help
++          Support for backend device drivers that provide I/O services
++          to other virtual machines.
++
++config XEN_BLKDEV_BACKEND
++      tristate "Block-device backend driver"
++      depends on BLOCK && XEN_BACKEND
++      default XEN_BACKEND
++      select XEN_DOMCTL
++      help
++        The block-device backend driver allows the kernel to export its
++        block devices to other guests via a high-performance shared-memory
++        interface.
++
++config XEN_BLKDEV_TAP
++      tristate "Block-device tap backend driver"
++      depends on BLOCK && XEN_BACKEND
++      default XEN_BACKEND
++      select XEN_DOMCTL
++      help
++        The block tap driver is an alternative to the block back driver
++        and allows VM block requests to be redirected to userspace through
++        a device interface.  The tap allows user-space development of
++        high-performance block backends, where disk images may be implemented
++        as files, in memory, or on other hosts across the network.  This
++        driver can safely coexist with the existing blockback driver.
++
++config XEN_BLKDEV_TAP2
++      tristate "Block-device tap backend driver 2"
++      depends on BLOCK && XEN_BACKEND
++      default XEN_BACKEND
++      help
++        The block tap driver is an alternative to the block back driver
++        and allows VM block requests to be redirected to userspace through
++        a device interface.  The tap allows user-space development of
++        high-performance block backends, where disk images may be implemented
++        as files, in memory, or on other hosts across the network.  This
++        driver can safely coexist with the existing blockback driver.
++
++config XEN_NR_TAP2_DEVICES
++      int "Number of devices the version 2 tap backend driver can handle"
++      range 2 1048575
++      default 1024 if 64BIT
++      default 256
++      depends on XEN_BLKDEV_TAP2
++      help
++        This sets the number of backend devices the v2 tap backend driver
++        will be able to handle simultaneously. Note that device 0 is the
++        control device and hence not available to service guests.
++
++config XEN_BLKBACK_PAGEMAP
++      tristate
++      depends on XEN_BLKDEV_BACKEND != n && XEN_BLKDEV_TAP2 != n
++      default XEN_BLKDEV_BACKEND || XEN_BLKDEV_TAP2
++
++config XEN_NETDEV_BACKEND
++      tristate "Network-device backend driver"
++        depends on XEN_BACKEND && NET
++      default XEN_BACKEND
++      help
++        The network-device backend driver allows the kernel to export its
++        network devices to other guests via a high-performance shared-memory
++        interface.
++
++config XEN_NETDEV_TX_SHIFT
++      int "Maximum simultaneous transmit requests (as a power of 2)"
++      depends on XEN_NETDEV_BACKEND
++      range 5 16
++      default 8
++      help
++        The maximum number of transmits the driver can hold pending, expressed
++        as the exponent of a power of 2.
++
++config XEN_NETDEV_PIPELINED_TRANSMITTER
++      bool "Pipelined transmitter (DANGEROUS)"
++      depends on XEN_NETDEV_BACKEND
++      help
++        If the net backend is a dumb domain, such as a transparent Ethernet
++        bridge with no local IP interface, it is safe to say Y here to get
++        slightly lower network overhead.
++        If the backend has a local IP interface; or may be doing smart things
++        like reassembling packets to perform firewall filtering; or if you
++        are unsure; or if you experience network hangs when this option is
++        enabled; then you must say N here.
++
++config XEN_NETDEV_ACCEL_SFC_BACKEND
++      tristate "Network-device backend driver acceleration for Solarflare NICs"
++      depends on XEN_NETDEV_BACKEND && SFC && SFC_RESOURCE && X86
++      select XEN_NETDEV_ACCEL_SFC_UTIL
++      default m
++
++config XEN_NETDEV_LOOPBACK
++      tristate "Network-device loopback driver"
++      depends on XEN_NETDEV_BACKEND
++      help
++        A two-interface loopback device to emulate a local netfront-netback
++        connection. If unsure, it is probably safe to say N here.
++
++config XEN_PCIDEV_BACKEND
++      tristate "PCI-device backend driver"
++      depends on PCI && XEN_PRIVILEGED_GUEST && XEN_BACKEND
++      default XEN_BACKEND
++      help
++        The PCI device backend driver allows the kernel to export arbitrary
++        PCI devices to other guests. If you select this to be a module, you
++        will need to make sure no other driver has bound to the device(s)
++        you want to make visible to other guests.
++
++choice
++      prompt "PCI Backend Mode"
++      depends on XEN_PCIDEV_BACKEND
++      default XEN_PCIDEV_BACKEND_CONTROLLER if IA64
++      default XEN_PCIDEV_BACKEND_VPCI
++
++config XEN_PCIDEV_BACKEND_VPCI
++      bool "Virtual PCI"
++      ---help---
++        This PCI Backend hides the true PCI topology and makes the frontend
++        think there is a single PCI bus with only the exported devices on it.
++        For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
++        second device at 02:1a.1 will be re-assigned to 00:01.1.
++
++config XEN_PCIDEV_BACKEND_PASS
++      bool "Passthrough"
++      ---help---
++        This PCI Backend provides a real view of the PCI topology to the
++        frontend (for example, a device at 06:01.b will still appear at
++        06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
++        PCI devices to its driver domains. This may be required for drivers
++        which depend on finding their hardware in certain bus/slot
++        locations.
++
++config XEN_PCIDEV_BACKEND_SLOT
++      bool "Slot"
++      ---help---
++        This PCI Backend hides the true PCI topology and makes the frontend
++        think there is a single PCI bus with only the exported devices on it.
++        Contrary to the virtual PCI backend, a function becomes a new slot.
++        For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
++        second device at 02:1a.1 will be re-assigned to 00:01.0.
++
++config XEN_PCIDEV_BACKEND_CONTROLLER
++      bool "Controller"
++      depends on IA64
++      ---help---
++        This PCI backend virtualizes the PCI bus topology by providing a
++        virtual bus per PCI root device.  Devices which are physically under
++        the same root bus will appear on the same virtual bus.  For systems
++        with complex I/O addressing, this is the only backend which supports
++        extended I/O port spaces and MMIO translation offsets.  This backend
++        also supports slot virtualization.  For example, a device at
++        0000:01:02.1 will be re-assigned to 0000:00:00.0.  A second device
++        at 0000:02:05.0 (behind a P2P bridge on bus 0000:01) will be
++        re-assigned to 0000:00:01.0.  A third device at 0000:16:05.0 (under
++        a different PCI root bus) will be re-assigned to 0000:01:00.0.
++
++endchoice
++
++config XEN_PCIDEV_BE_DEBUG
++      bool "PCI Backend Debugging"
++      depends on XEN_PCIDEV_BACKEND
++
++config XEN_TPMDEV_BACKEND
++      tristate "TPM-device backend driver"
++        depends on XEN_BACKEND
++      help
++        The TPM-device backend driver
++
++config XEN_SCSI_BACKEND
++      tristate "SCSI backend driver"
++      depends on SCSI && XEN_BACKEND
++      default m
++      help
++        The SCSI backend driver allows the kernel to export its SCSI Devices
++        to other guests via a high-performance shared-memory interface.
++
++config XEN_USB_BACKEND
++      tristate "USB backend driver"
++      depends on USB && XEN_BACKEND
++      default m
++      help
++        The USB backend driver allows the kernel to export its USB Devices
++        to other guests.
++
++config XEN_BLKDEV_FRONTEND
++      tristate "Block-device frontend driver"
++      default y
++      help
++        The block-device frontend driver allows the kernel to access block
++        devices mounted within another guest OS. Unless you are building a
++        dedicated device-driver domain, or your master control domain
++        (domain 0), then you almost certainly want to say Y here.
++
++config XEN_NETDEV_FRONTEND
++      tristate "Network-device frontend driver"
++      depends on NET
++      default y
++      help
++        The network-device frontend driver allows the kernel to access
++        network interfaces within another guest OS. Unless you are building a
++        dedicated device-driver domain, or your master control domain
++        (domain 0), then you almost certainly want to say Y here.
++
++config XEN_NETDEV_ACCEL_SFC_FRONTEND
++      tristate "Network-device frontend driver acceleration for Solarflare NICs"
++      depends on XEN_NETDEV_FRONTEND && X86
++      select XEN_NETDEV_ACCEL_SFC_UTIL
++      default m
++
++config XEN_SCSI_FRONTEND
++      tristate "SCSI frontend driver"
++      depends on SCSI
++      default m
++      help
++        The SCSI frontend driver allows the kernel to access SCSI Devices
++        within another guest OS.
++
++config XEN_USB_FRONTEND
++      tristate "USB frontend driver"
++      depends on USB
++      default m
++      help
++        The USB frontend driver allows the kernel to access USB Devices
++        within another guest OS.
++
++config XEN_USB_FRONTEND_HCD_STATS
++      bool "Taking the HCD statistics (for debug)"
++      depends on XEN_USB_FRONTEND
++      default y
++      help
++        Count the transferred urb status and the RING_FULL occurrence.
++
++config XEN_USB_FRONTEND_HCD_PM
++      bool "HCD suspend/resume support (DO NOT USE)"
++      depends on XEN_USB_FRONTEND
++      default n
++      help
++        Experimental bus suspend/resume feature support.
++
++config XEN_GRANT_DEV
++      tristate "User-space granted page access driver"
++      depends on XEN_BACKEND != n
++      default XEN_PRIVILEGED_GUEST
++      help
++        Device for accessing (in user-space) pages that have been granted
++        by other domains.
++
++config XEN_FRAMEBUFFER
++      tristate "Framebuffer-device frontend driver"
++      depends on FB
++      select FB_CFB_FILLRECT
++      select FB_CFB_COPYAREA
++      select FB_CFB_IMAGEBLIT
++      default y
++      help
++        The framebuffer-device frontend driver allows the kernel to create a
++        virtual framebuffer.  This framebuffer can be viewed in another
++        domain.  Unless this domain has access to a real video card, you
++        probably want to say Y here.
++
++config XEN_KEYBOARD
++      tristate "Keyboard-device frontend driver"
++      depends on XEN_FRAMEBUFFER && INPUT
++      default y
++      help
++        The keyboard-device frontend driver allows the kernel to create a
++        virtual keyboard.  This keyboard can then be driven by another
++        domain.  If you've said Y to CONFIG_XEN_FRAMEBUFFER, you probably
++        want to say Y here.
++
++config XEN_DISABLE_SERIAL
++      bool "Disable serial port drivers"
++      default y
++      help
++        Disable serial port drivers, allowing the Xen console driver
++        to provide a serial console at ttyS0.
++
++config XEN_SYSFS
++      tristate "Export Xen attributes in sysfs"
++      depends on SYSFS
++      select SYS_HYPERVISOR
++      default y
++      help
++        Xen hypervisor attributes will show up under /sys/hypervisor/.
++
++config XEN_NR_GUEST_DEVICES
++      int "Number of guest devices"
++      range 0 4032 if 64BIT
++      range 0 960
++      default 256 if XEN_BACKEND
++      default 16
++      help
++        Specify the total number of virtual devices (i.e. both frontend
++        and backend) that you want the kernel to be able to service.
++
++choice
++      prompt "Xen version compatibility"
++      default XEN_COMPAT_030002_AND_LATER
++
++      config XEN_COMPAT_030002_AND_LATER
++              bool "3.0.2 and later"
++
++      config XEN_COMPAT_030004_AND_LATER
++              bool "3.0.4 and later"
++
++      config XEN_COMPAT_030100_AND_LATER
++              bool "3.1.0 and later"
++
++      config XEN_COMPAT_030200_AND_LATER
++              bool "3.2.0 and later"
++
++      config XEN_COMPAT_030300_AND_LATER
++              bool "3.3.0 and later"
++
++      config XEN_COMPAT_030400_AND_LATER
++              bool "3.4.0 and later"
++
++      config XEN_COMPAT_040000_AND_LATER
++              bool "4.0.0 and later"
++
++      config XEN_COMPAT_040100_AND_LATER
++              bool "4.1.0 and later"
++
++      config XEN_COMPAT_LATEST_ONLY
++              bool "no compatibility code"
++
++endchoice
++
++config XEN_COMPAT
++      hex
++      default 0xffffff if XEN_COMPAT_LATEST_ONLY
++      default 0x040100 if XEN_COMPAT_040100_AND_LATER
++      default 0x040000 if XEN_COMPAT_040000_AND_LATER
++      default 0x030400 if XEN_COMPAT_030400_AND_LATER
++      default 0x030300 if XEN_COMPAT_030300_AND_LATER
++      default 0x030200 if XEN_COMPAT_030200_AND_LATER
++      default 0x030100 if XEN_COMPAT_030100_AND_LATER
++      default 0x030004 if XEN_COMPAT_030004_AND_LATER
++      default 0x030002 if XEN_COMPAT_030002_AND_LATER
++      default 0
++
++config XEN_VCPU_INFO_PLACEMENT
++      bool "Place shared vCPU info in per-CPU storage"
++#     depends on X86 && (XEN_COMPAT >= 0x00030101)
++      depends on X86
++      depends on !XEN_COMPAT_030002_AND_LATER
++      depends on !XEN_COMPAT_030004_AND_LATER
++      depends on !XEN_COMPAT_030100_AND_LATER
++      default SMP
++      ---help---
++        This allows faster access to the per-vCPU shared info
++        structure.
++
++endmenu
++
++config HAVE_IRQ_IGNORE_UNHANDLED
++      def_bool y
++
++config ARCH_HAS_WALK_MEMORY
++      def_bool y
++      depends on X86
++
++config XEN_SMPBOOT
++      def_bool y
++      depends on SMP && !PPC_XEN
++
++config XEN_DEVMEM
++      def_bool y
++
++endif
++
   menu "Xen driver support"
--      depends on XEN
++      depends on XEN || PARAVIRT_XEN
   
   config XEN_BALLOON
--      bool "Xen memory balloon driver"
++      bool "Xen memory balloon driver" if PARAVIRT_XEN
++      depends on PARAVIRT_XEN || !PPC_XEN
         default y
         help
           The balloon driver allows the Xen domain to request more memory from
@@@ -10,26 -10,26 +425,28 @@@
           return unneeded memory to the system.
   
   config XEN_SCRUB_PAGES
--      bool "Scrub pages before returning them to system"
--      depends on XEN_BALLOON
++      bool "Scrub memory before freeing it to Xen"
++      depends on XEN || XEN_BALLOON
         default y
         help
--        Scrub pages before returning them to the system for reuse by
--        other domains.  This makes sure that any confidential data
--        is not accidentally visible to other domains.  Is it more
--        secure, but slightly less efficient.
++        Erase memory contents before freeing it back to Xen's global
++        pool. This ensures that any secrets contained within that
++        memory (e.g., private keys) cannot be found by other guests that
++        may be running on the machine. Most people will want to say Y here.
++        If security is not a concern then you may increase performance by
++        saying N.
           If in doubt, say yes.
   
   config XEN_DEV_EVTCHN
         tristate "Xen /dev/xen/evtchn device"
--      default y
++      default PARAVIRT_XEN || XEN_PRIVILEGED_GUEST || m
         help
           The evtchn driver allows a userspace process to triger event
           channels and to receive notification of an event channel
           firing.
           If in doubt, say yes.
   
--config XEN_BACKEND
++config PARAVIRT_XEN_BACKEND
         bool "Backend driver support"
         depends on XEN_DOM0
         default y
@@@ -39,6 -39,6 +456,7 @@@
   
   config XENFS
         tristate "Xen filesystem"
++      depends on PARAVIRT_XEN
         default y
         help
           The xen filesystem provides a way for domains to share
@@@ -61,7 -61,7 +479,7 @@@ config XEN_COMPAT_XENF
   
   config XEN_SYS_HYPERVISOR
          bool "Create xen entries under /sys/hypervisor"
--       depends on SYSFS
++       depends on PARAVIRT_XEN && SYSFS
          select SYS_HYPERVISOR
          default y
          help
@@@ -75,7 -75,7 +493,7 @@@ config XEN_XENBUS_FRONTEN
   
   config XEN_GNTDEV
         tristate "userspace grant access device driver"
--      depends on XEN
++      depends on PARAVIRT_XEN
         default m
         select MMU_NOTIFIER
         help
@@@ -83,7 -83,7 +501,7 @@@
   
   config XEN_GRANT_DEV_ALLOC
         tristate "User-space grant reference allocator driver"
--      depends on XEN
++      depends on PARAVIRT_XEN
         default m
         help
           Allows userspace processes to create pages with access granted
@@@ -102,7 -102,7 +520,10 @@@ config XEN_PLATFORM_PC
   
   config SWIOTLB_XEN
         def_bool y
--      depends on PCI
++      depends on PARAVIRT_XEN && PCI
         select SWIOTLB
   
++config XEN_XENCOMM
++      bool
++
   endmenu
diff --cc drivers/xen/Makefile

index bbc1825,f420f1f..a7390c1
--- 1/drivers/xen/Makefile
--- 2/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@@ -1,25 -1,24 +1,60 @@@
--obj-y += grant-table.o features.o events.o manage.o balloon.o
--obj-y += xenbus/
- obj-y += tmem.o
++obj-$(CONFIG_PARAVIRT_XEN)    += grant-table.o features.o events.o manage.o balloon.o
++xen-biomerge-$(CONFIG_PARAVIRT_XEN) := biomerge.o
++xen-hotplug-$(CONFIG_PARAVIRT_XEN) := cpu_hotplug.o
++xen-balloon_$(CONFIG_PARAVIRT_XEN) := xen-balloon.o
++xen-evtchn-name-$(CONFIG_PARAVIRT_XEN) := xen-evtchn
++
++xen-balloon_$(CONFIG_XEN)     := balloon/
++obj-$(CONFIG_XEN)             += core/
++obj-$(CONFIG_XEN)             += console/
++obj-y                         += xenbus/
++obj-$(CONFIG_XEN)             += char/
++
++xen-backend-$(CONFIG_XEN_BACKEND)     := util.o
++xen-evtchn-name-$(CONFIG_XEN)         := evtchn
   
   nostackp := $(call cc-option, -fno-stack-protector)
++ifeq ($(CONFIG_PARAVIRT_XEN),y)
   CFLAGS_features.o                     := $(nostackp)
++endif
   
- obj-$(CONFIG_BLOCK)                   += biomerge.o
- obj-$(CONFIG_HOTPLUG_CPU)             += cpu_hotplug.o
- -obj-$(CONFIG_BLOCK)           += biomerge.o
- -obj-$(CONFIG_HOTPLUG_CPU)     += cpu_hotplug.o
- -obj-$(CONFIG_XEN_XENCOMM)     += xencomm.o
- -obj-$(CONFIG_XEN_BALLOON)     += xen-balloon.o
- -obj-$(CONFIG_XEN_DEV_EVTCHN)  += xen-evtchn.o
- -obj-$(CONFIG_XEN_GNTDEV)      += xen-gntdev.o
++obj-$(CONFIG_XEN)                     += features.o $(xen-backend-y) $(xen-backend-m)
++obj-$(CONFIG_BLOCK)                   += $(xen-biomerge-y)
++obj-$(CONFIG_HOTPLUG_CPU)             += $(xen-hotplug-y)
+ +obj-$(CONFIG_XEN_XENCOMM)             += xencomm.o
- obj-$(CONFIG_XEN_BALLOON)             += xen-balloon.o
- obj-$(CONFIG_XEN_DEV_EVTCHN)          += xen-evtchn.o
++obj-$(CONFIG_XEN_BALLOON)             += $(xen-balloon_y)
++obj-$(CONFIG_XEN_DEV_EVTCHN)          += $(xen-evtchn-name-y).o
+ +obj-$(CONFIG_XEN_GNTDEV)              += xen-gntdev.o
- obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)     += xen-gntalloc.o
+ +obj-$(CONFIG_XENFS)                   += xenfs/
+ obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)     += xen-gntalloc.o
- -obj-$(CONFIG_XENFS)           += xenfs/
   obj-$(CONFIG_XEN_SYS_HYPERVISOR)      += sys-hypervisor.o
- -obj-$(CONFIG_XEN_PLATFORM_PCI)        += xen-platform-pci.o
- -obj-$(CONFIG_SWIOTLB_XEN)     += swiotlb-xen.o
- -obj-$(CONFIG_XEN_DOM0)                += pci.o
+ +obj-$(CONFIG_XEN_PLATFORM_PCI)                += xen-platform-pci.o
+ +obj-$(CONFIG_SWIOTLB_XEN)             += swiotlb-xen.o
+ +obj-$(CONFIG_XEN_DOM0)                        += pci.o
   
- -xen-evtchn-y                  := evtchn.o
+ +xen-evtchn-y                          := evtchn.o
   xen-gntdev-y                          := gntdev.o
   xen-gntalloc-y                                := gntalloc.o
   
- -xen-platform-pci-y            := platform-pci.o
+ +xen-platform-pci-y                    := platform-pci.o
++
++obj-$(CONFIG_XEN_BLKDEV_BACKEND)      += blkback/
++obj-$(CONFIG_XEN_BLKDEV_TAP)          += blktap/
++obj-$(CONFIG_XEN_BLKDEV_TAP2)         += blktap2/ blktap2-new/
++obj-$(CONFIG_XEN_NETDEV_BACKEND)      += netback/
++obj-$(CONFIG_XEN_TPMDEV_BACKEND)      += tpmback/
++obj-$(CONFIG_XEN_BLKDEV_FRONTEND)     += blkfront/
++obj-$(CONFIG_XEN_NETDEV_FRONTEND)     += netfront/
++obj-$(CONFIG_XEN_PCIDEV_BACKEND)      += pciback/
++obj-$(CONFIG_XEN_PCIDEV_FRONTEND)     += pcifront/
++obj-$(CONFIG_XEN_FRAMEBUFFER)         += fbfront/
++obj-$(CONFIG_XEN_KEYBOARD)            += fbfront/
++obj-$(CONFIG_XEN_SCSI_BACKEND)                += scsiback/
++obj-$(CONFIG_XEN_SCSI_FRONTEND)               += scsifront/
++obj-$(CONFIG_XEN_USB_BACKEND)         += usbback/
++obj-$(CONFIG_XEN_USB_FRONTEND)                += usbfront/
++obj-$(CONFIG_XEN_PRIVCMD)     += privcmd/
++obj-$(CONFIG_XEN_GRANT_DEV)   += gntdev/
++obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL)               += sfc_netutil/
++obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND)   += sfc_netfront/
++obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND)    += sfc_netback/
diff --cc drivers/xen/balloon/Makefile

index 0000000,0000000..3fc3d0b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/balloon/Makefile
@@@ -1,0 -1,0 +1,2 @@@
++
++obj-y := balloon.o sysfs.o
diff --cc drivers/xen/balloon/balloon.c

index 0000000,0000000..4a809fd

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/balloon/balloon.c
@@@ -1,0 -1,0 +1,829 @@@
++/******************************************************************************
++ * balloon.c
++ *
++ * Xen balloon driver - enables returning/claiming memory to/from Xen.
++ *
++ * Copyright (c) 2003, B Dragovic
++ * Copyright (c) 2003-2004, M Williamson, K Fraser
++ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/swap.h>
++#include <linux/mman.h>
++#include <linux/pagemap.h>
++#include <linux/bootmem.h>
++#include <linux/highmem.h>
++#include <linux/slab.h>
++#include <linux/mutex.h>
++#include <xen/xen_proc.h>
++#include <asm/hypervisor.h>
++#include <xen/balloon.h>
++#include <xen/interface/memory.h>
++#include <asm/maddr.h>
++#include <asm/page.h>
++#include <asm/pgalloc.h>
++#include <asm/pgtable.h>
++#include <asm/uaccess.h>
++#include <asm/tlb.h>
++#include <linux/highmem.h>
++#include <linux/list.h>
++#include <xen/xenbus.h>
++#include "common.h"
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++#ifdef CONFIG_PROC_FS
++static struct proc_dir_entry *balloon_pde;
++#endif
++
++static DEFINE_MUTEX(balloon_mutex);
++
++/*
++ * Protects atomic reservation decrease/increase against concurrent increases.
++ * Also protects non-atomic updates of current_pages and driver_pages, and
++ * balloon lists.
++ */
++DEFINE_SPINLOCK(balloon_lock);
++
++#ifndef MODULE
++#include <linux/pagevec.h>
++static struct pagevec free_pagevec;
++#endif
++
++struct balloon_stats balloon_stats;
++
++/* We increase/decrease in batches which fit in a page */
++static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
++
++#ifdef CONFIG_HIGHMEM
++#define inc_totalhigh_pages() (totalhigh_pages++)
++#define dec_totalhigh_pages() (totalhigh_pages--)
++#else
++#define inc_totalhigh_pages() ((void)0)
++#define dec_totalhigh_pages() ((void)0)
++#endif
++
++#ifndef CONFIG_XEN
++/*
++ * In HVM guests accounting here uses the Xen visible values, but the kernel
++ * determined totalram_pages value shouldn't get altered. Since totalram_pages
++ * includes neither the kernel static image nor any memory allocated prior to
++ * or from the bootmem allocator, we have to synchronize the two values.
++ */
++static unsigned long __read_mostly totalram_bias;
++#else
++#define totalram_bias 0
++#endif
++
++/* List of ballooned pages, threaded through the mem_map array. */
++static LIST_HEAD(ballooned_pages);
++
++/* Main work function, always executed in process context. */
++static void balloon_process(struct work_struct *unused);
++static DECLARE_WORK(balloon_worker, balloon_process);
++
++/* When ballooning out (allocating memory to return to Xen) we don't really 
++   want the kernel to try too hard since that can trigger the oom killer. */
++#define GFP_BALLOON (GFP_HIGHUSER|__GFP_NOWARN|__GFP_NORETRY|__GFP_NOMEMALLOC|\
++                   __GFP_NOTRACK|__GFP_COLD)
++
++#define PAGE_TO_LIST(p) (&(p)->lru)
++#define LIST_TO_PAGE(l) list_entry((l), struct page, lru)
++#define UNLIST_PAGE(p)                                \
++      do {                                    \
++              list_del(PAGE_TO_LIST(p));      \
++              PAGE_TO_LIST(p)->next = NULL;   \
++              PAGE_TO_LIST(p)->prev = NULL;   \
++      } while(0)
++
++#define IPRINTK(fmt, args...) pr_info("xen_mem: " fmt, ##args)
++#define WPRINTK(fmt, args...) pr_warning("xen_mem: " fmt, ##args)
++
++/* balloon_append: add the given page to the balloon. */
++static void balloon_append(struct page *page, int account)
++{
++      unsigned long pfn;
++
++      /* Lowmem is re-populated first, so highmem pages go at list tail. */
++      if (PageHighMem(page)) {
++              list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
++              bs.balloon_high++;
++              if (account)
++                      dec_totalhigh_pages();
++      } else {
++              list_add(PAGE_TO_LIST(page), &ballooned_pages);
++              bs.balloon_low++;
++      }
++
++      pfn = page_to_pfn(page);
++      if (account) { /* page is being taken from the kernel: do the bookkeeping here */
++              SetPageReserved(page);
++              set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++              page_zone(page)->present_pages--;
++      } else { /* balloon_init() reserved/invalidated it already: only sanity-check */
++              BUG_ON(!PageReserved(page));
++              WARN_ON_ONCE(phys_to_machine_mapping_valid(pfn));
++      }
++}
++
++/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
++static struct page *balloon_retrieve(int *was_empty)
++{
++      struct page *page;
++      struct zone *zone;
++
++      if (list_empty(&ballooned_pages))
++              return NULL;
++
++      page = LIST_TO_PAGE(ballooned_pages.next); /* list head: lowmem pages sit in front */
++      UNLIST_PAGE(page);
++      BUG_ON(!PageReserved(page));
++
++      if (PageHighMem(page)) {
++              bs.balloon_high--;
++              inc_totalhigh_pages();
++      }
++      else
++              bs.balloon_low--;
++      zone = page_zone(page);
++      *was_empty |= !populated_zone(zone); /* sticky: only ever set here; increase_reservation() tests it to rebuild zonelists */
++      zone->present_pages++;
++
++      return page;
++}
++
++static struct page *balloon_first_page(void)
++{
++      if (list_empty(&ballooned_pages))
++              return NULL;
++      return LIST_TO_PAGE(ballooned_pages.next);
++}
++
++static struct page *balloon_next_page(struct page *page)
++{
++      struct list_head *next = PAGE_TO_LIST(page)->next;
++      if (next == &ballooned_pages)
++              return NULL;
++      return LIST_TO_PAGE(next);
++}
++
++static inline void balloon_free_page(struct page *page)
++{
++#ifndef MODULE
++      if (put_page_testzero(page) && !pagevec_add(&free_pagevec, page)) {
++              __pagevec_free(&free_pagevec);
++              pagevec_reinit(&free_pagevec);
++      }
++#else
++      /* pagevec interface is not being exported. */
++      __free_page(page);
++#endif
++}
++
++static inline void balloon_free_and_unlock(unsigned long flags)
++{
++#ifndef MODULE
++      if (pagevec_count(&free_pagevec)) {
++              __pagevec_free(&free_pagevec);
++              pagevec_reinit(&free_pagevec);
++      }
++#endif
++      balloon_unlock(flags);
++}
++
++static void balloon_alarm(unsigned long unused)
++{
++      schedule_work(&balloon_worker);
++}
++static DEFINE_TIMER(balloon_timer, balloon_alarm, 0, 0);
++
++static unsigned long current_target(void)
++{
++      unsigned long target = bs.target_pages;
++      if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
++              target = bs.current_pages + bs.balloon_low + bs.balloon_high;
++      return target;
++}
++
++unsigned long balloon_minimum_target(void)
++{
++#ifndef CONFIG_XEN /* without CONFIG_XEN the table below is driven by num_physpages */
++#define max_pfn num_physpages
++#endif
++      unsigned long min_pages, curr_pages = current_target();
++
++#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) /* MiB -> number of pages */
++      /* Simple continuous piecewise linear function:
++       *  max MiB -> min MiB  gradient
++       *       0         0
++       *      16        16
++       *      32        24
++       *     128        72    (1/2)
++       *     512       168    (1/4)
++       *    2048       360    (1/8)
++       *    8192       552    (1/32)
++       *   32768      1320
++       *  131072      4392
++       */
++      if (max_pfn < MB2PAGES(128))
++              min_pages = MB2PAGES(8) + (max_pfn >> 1);
++      else if (max_pfn < MB2PAGES(512))
++              min_pages = MB2PAGES(40) + (max_pfn >> 2);
++      else if (max_pfn < MB2PAGES(2048))
++              min_pages = MB2PAGES(104) + (max_pfn >> 3);
++      else
++              min_pages = MB2PAGES(296) + (max_pfn >> 5);
++#undef MB2PAGES
++
++      /* Don't enforce growth */
++      return min(min_pages, curr_pages); /* never exceeds what current_target() reports */
++#ifndef CONFIG_XEN
++#undef max_pfn
++#endif
++}
++
++static int increase_reservation(unsigned long nr_pages)
++{
++      unsigned long  pfn, i, flags;
++      struct page   *page;
++      long           rc;
++      int            need_zonelists_rebuild = 0;
++      struct xen_memory_reservation reservation = {
++              .address_bits = 0,
++              .extent_order = 0,
++              .domid        = DOMID_SELF
++      };
++
++      if (nr_pages > ARRAY_SIZE(frame_list))
++              nr_pages = ARRAY_SIZE(frame_list);
++
++      balloon_lock(flags);
++
++      page = balloon_first_page();
++      for (i = 0; i < nr_pages; i++) {
++              BUG_ON(page == NULL);
++              frame_list[i] = page_to_pfn(page);;
++              page = balloon_next_page(page);
++      }
++
++      set_xen_guest_handle(reservation.extent_start, frame_list);
++      reservation.nr_extents = nr_pages;
++      rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
++      if (rc < 0)
++              goto out;
++
++      for (i = 0; i < rc; i++) {
++              page = balloon_retrieve(&need_zonelists_rebuild);
++              BUG_ON(page == NULL);
++
++              pfn = page_to_pfn(page);
++              BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
++                     phys_to_machine_mapping_valid(pfn));
++
++              set_phys_to_machine(pfn, frame_list[i]);
++
++#ifdef CONFIG_XEN
++              /* Link back into the page tables if not highmem. */
++              if (pfn < max_low_pfn) {
++                      int ret;
++                      ret = HYPERVISOR_update_va_mapping(
++                              (unsigned long)__va(pfn << PAGE_SHIFT),
++                              pfn_pte_ma(frame_list[i], PAGE_KERNEL),
++                              0);
++                      BUG_ON(ret);
++              }
++#endif
++
++              /* Relinquish the page back to the allocator. */
++              ClearPageReserved(page);
++              init_page_count(page);
++              balloon_free_page(page);
++      }
++
++      bs.current_pages += rc;
++      totalram_pages = bs.current_pages - totalram_bias;
++
++ out:
++      balloon_free_and_unlock(flags);
++
++#ifndef MODULE
++      setup_per_zone_wmarks();
++      if (rc > 0)
++              kswapd_run(0);
++      if (need_zonelists_rebuild)
++              build_all_zonelists(NULL);
++      else
++              vm_total_pages = nr_free_pagecache_pages();
++#endif
++
++      return rc < 0 ? rc : rc != nr_pages;
++}
++
++static int decrease_reservation(unsigned long nr_pages)
++{
++      unsigned long  pfn, i, flags;
++      struct page   *page;
++      void          *v;
++      int            need_sleep = 0; /* set when a page allocation fails; this is the return value */
++      int ret;
++      struct xen_memory_reservation reservation = {
++              .address_bits = 0,
++              .extent_order = 0,
++              .domid        = DOMID_SELF
++      };
++
++      if (nr_pages > ARRAY_SIZE(frame_list)) /* one call handles at most one frame_list batch */
++              nr_pages = ARRAY_SIZE(frame_list);
++
++      for (i = 0; i < nr_pages; i++) {
++              if ((page = alloc_page(GFP_BALLOON)) == NULL) {
++                      nr_pages = i; /* continue with only the pages obtained so far */
++                      need_sleep = 1;
++                      break;
++              }
++
++              pfn = page_to_pfn(page);
++              frame_list[i] = pfn_to_mfn(pfn);
++
++              if (!PageHighMem(page)) {
++                      v = phys_to_virt(pfn << PAGE_SHIFT);
++                      scrub_pages(v, 1);
++#ifdef CONFIG_XEN
++                      ret = HYPERVISOR_update_va_mapping(
++                              (unsigned long)v, __pte_ma(0), 0);
++                      BUG_ON(ret);
++#endif
++              }
++#ifdef CONFIG_XEN_SCRUB_PAGES
++              else { /* highmem: needs a temporary mapping to be scrubbed */
++                      v = kmap(page);
++                      scrub_pages(v, 1);
++                      kunmap(page);
++              }
++#endif
++      }
++
++#ifdef CONFIG_XEN
++      /* Ensure that ballooned highmem pages don't have kmaps. */
++      kmap_flush_unused();
++      flush_tlb_all();
++#endif
++
++      balloon_lock(flags);
++
++      /* No more mappings: invalidate P2M and add to balloon. */
++      for (i = 0; i < nr_pages; i++) {
++              pfn = mfn_to_pfn(frame_list[i]);
++              balloon_append(pfn_to_page(pfn), 1);
++      }
++
++      set_xen_guest_handle(reservation.extent_start, frame_list);
++      reservation.nr_extents   = nr_pages;
++      ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
++      BUG_ON(ret != nr_pages); /* anything but a full release is treated as fatal */
++
++      bs.current_pages -= nr_pages;
++      totalram_pages = bs.current_pages - totalram_bias;
++
++      balloon_unlock(flags);
++
++      return need_sleep; /* non-zero makes balloon_process() stop looping and retry from its timer */
++}
++
++/*
++ * We avoid multiple worker processes conflicting via the balloon mutex.
++ * We may of course race updates of the target counts (which are protected
++ * by the balloon lock), or with changes to the Xen hard limit, but we will
++ * recover from these in time.
++ */
++static void balloon_process(struct work_struct *unused)
++{
++      int need_sleep = 0;
++      long credit; /* pages still to gain (> 0) or to give up (< 0) */
++
++      mutex_lock(&balloon_mutex);
++
++      do {
++              credit = current_target() - bs.current_pages;
++              if (credit > 0)
++                      need_sleep = (increase_reservation(credit) != 0);
++              if (credit < 0)
++                      need_sleep = (decrease_reservation(-credit) != 0);
++
++#ifndef CONFIG_PREEMPT
++              if (need_resched())
++                      schedule();
++#endif
++      } while ((credit != 0) && !need_sleep); /* stop at target, or when a batch could not complete */
++
++      /* Schedule more work if there is some still to be done. */
++      if (current_target() != bs.current_pages)
++              mod_timer(&balloon_timer, jiffies + HZ); /* balloon_alarm() then re-queues balloon_worker */
++
++      mutex_unlock(&balloon_mutex);
++}
++
++/* Resets the Xen limit, sets new target, and kicks off processing. */
++void balloon_set_new_target(unsigned long target)
++{
++      /* No need for lock. Not read-modify-write updates. */
++      bs.target_pages = max(target, balloon_minimum_target());
++      schedule_work(&balloon_worker);
++}
++
++static struct xenbus_watch target_watch =
++{
++      .node = "memory/target"
++};
++
++/* React to a change in the target key */
++static void watch_target(struct xenbus_watch *watch,
++                       const char **vec, unsigned int len)
++{
++      unsigned long long new_target;
++      int err;
++
++      err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
++      if (err != 1) {
++              /* This is ok (for domain0 at least) - so just return */
++              return;
++      }
++
++      /* The given memory/target value is in KiB, so it needs converting to
++       * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
++       */
++      balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
++}
++
++static int balloon_init_watcher(struct notifier_block *notifier,
++                              unsigned long event,
++                              void *data)
++{
++      int err;
++
++      err = register_xenbus_watch(&target_watch);
++      if (err)
++              pr_err("Failed to set balloon watcher\n");
++
++      return NOTIFY_DONE;
++}
++
++#ifdef CONFIG_PROC_FS
++static int balloon_write(struct file *file, const char __user *buffer,
++                       unsigned long count, void *data)
++{     /* write_proc handler of the "balloon" proc entry: parse a size, set it as target */
++      char memstring[64], *endchar;
++      unsigned long long target_bytes;
++
++      if (!capable(CAP_SYS_ADMIN))
++              return -EPERM;
++
++      if (count <= 1)
++              return -EBADMSG; /* runt */
++      if (count > sizeof(memstring))
++              return -EFBIG;   /* too long */
++      /* Terminate right after the copied bytes: the rest of memstring is stale stack. */
++      if (copy_from_user(memstring, buffer, count))
++              return -EFAULT;
++      memstring[min_t(unsigned long, count, sizeof(memstring) - 1)] = '\0';
++
++      target_bytes = memparse(memstring, &endchar); /* value is in bytes, converted to pages below */
++      balloon_set_new_target(target_bytes >> PAGE_SHIFT);
++
++      return count; /* the whole write is reported as consumed */
++}
++
++static int balloon_read(char *page, char **start, off_t off,
++                      int count, int *eof, void *data)
++{
++      int len;
++
++      len = sprintf(
++              page,
++              "Current allocation: %8lu kB\n"
++              "Requested target:   %8lu kB\n"
++              "Minimum target:     %8lu kB\n"
++              "Maximum target:     %8lu kB\n"
++              "Low-mem balloon:    %8lu kB\n"
++              "High-mem balloon:   %8lu kB\n"
++              "Driver pages:       %8lu kB\n",
++              PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages), 
++              PAGES2KB(balloon_minimum_target()), PAGES2KB(num_physpages),
++              PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high),
++              PAGES2KB(bs.driver_pages));
++
++
++      *eof = 1;
++      return len;
++}
++#endif
++
++static struct notifier_block xenstore_notifier;
++
++static int __init balloon_init(void)
++{
++#if !defined(CONFIG_XEN)
++# ifndef XENMEM_get_pod_target /* local fallback definition when the headers lack it */
++#  define XENMEM_get_pod_target 17
++      typedef struct xen_pod_target {
++              uint64_t target_pages;
++              uint64_t tot_pages;
++              uint64_t pod_cache_pages;
++              uint64_t pod_entries;
++              domid_t domid;
++      } xen_pod_target_t;
++# endif
++      xen_pod_target_t pod_target = { .domid = DOMID_SELF };
++      int rc;
++#elif defined(CONFIG_X86)
++      unsigned long pfn;
++      struct page *page;
++#endif
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      IPRINTK("Initialising balloon driver.\n");
++
++#ifdef CONFIG_XEN
++      pagevec_init(&free_pagevec, true);
++      bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
++      totalram_pages   = bs.current_pages;
++#else
++      rc = HYPERVISOR_memory_op(XENMEM_get_pod_target, &pod_target);
++      /*
++       * Xen prior to 3.4.0 masks the memory_op command to 4 bits, thus
++       * converting XENMEM_get_pod_target to XENMEM_decrease_reservation.
++       * Fortunately this results in a request with all input fields zero,
++       * but (due to the way bit 4 and upwards get interpreted) a starting
++       * extent of 1. When start_extent > nr_extents (>= in newer Xen), we
++       * simply get start_extent returned.
++       */
++      totalram_bias = HYPERVISOR_memory_op(rc != -ENOSYS && rc != 1
++              ? XENMEM_maximum_reservation : XENMEM_current_reservation,
++              &pod_target.domid);
++      if ((long)totalram_bias != -ENOSYS) {
++              BUG_ON(totalram_bias < totalram_pages);
++              bs.current_pages = totalram_bias;
++              totalram_bias -= totalram_pages; /* keep only the Xen-vs-kernel difference */
++      } else {
++              totalram_bias = 0;
++              bs.current_pages = totalram_pages;
++      }
++#endif
++      bs.target_pages  = bs.current_pages; /* start at target: no ballooning until it changes */
++      bs.balloon_low   = 0;
++      bs.balloon_high  = 0;
++      bs.driver_pages  = 0UL;
++
++#ifdef CONFIG_PROC_FS
++      if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
++              WPRINTK("Unable to create /proc/xen/balloon.\n");
++              return -ENOMEM; /* a real errno; the former bare -1 reads as -EPERM */
++      }
++
++      balloon_pde->read_proc  = balloon_read;
++      balloon_pde->write_proc = balloon_write;
++#endif
++      balloon_sysfs_init();
++
++#if defined(CONFIG_X86) && defined(CONFIG_XEN)
++      /* Initialise the balloon with excess memory space. */
++      for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
++              page = pfn_to_page(pfn);
++              if (!PageReserved(page)) {
++                      SetPageReserved(page);
++                      set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++                      balloon_append(page, 0); /* 0: reserved and invalidated just above */
++              }
++      }
++#endif
++
++      target_watch.callback = watch_target;
++      xenstore_notifier.notifier_call = balloon_init_watcher; /* registers target_watch when called */
++
++      register_xenstore_notifier(&xenstore_notifier);
++
++      return 0;
++}
++
++subsys_initcall(balloon_init);
++
++/*
++ * balloon_exit - module exit hook.
++ *
++ * Only the sysfs interface is torn down; pages held by the balloon are not
++ * released (see the XXX marker below).
++ */
++static void __exit balloon_exit(void)
++{
++      balloon_sysfs_exit();
++      /* XXX - release balloon here */
++}
++
++module_exit(balloon_exit); 
++
++/*
++ * balloon_update_driver_allowance - adjust the driver page count.
++ * @delta: signed number of pages to add to bs.driver_pages.
++ *
++ * The update is performed under the balloon lock.
++ */
++void balloon_update_driver_allowance(long delta)
++{
++      unsigned long irqflags;
++
++      balloon_lock(irqflags);
++      bs.driver_pages = bs.driver_pages + delta;
++      balloon_unlock(irqflags);
++}
++EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
++
++#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
++
++#ifdef CONFIG_XEN
++/*
++ * dealloc_pte_fn - per-PTE callback, passed to apply_to_page_range() by
++ * alloc_empty_pages_and_pagevec().
++ *
++ * Clears the init_mm mapping at @addr, invalidates the phys-to-machine
++ * entry of the corresponding pfn, marks that page reserved and issues
++ * XENMEM_decrease_reservation for the machine frame the PTE referred to.
++ * @pmd_page and @data are not used.
++ *
++ * Always returns 0; a hypercall result other than 1 triggers BUG().
++ */
++static int dealloc_pte_fn(
++      pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
++{
++      /* Fetch the machine frame before the PTE gets wiped below. */
++      unsigned long pfn, mfn = pte_mfn(*pte);
++      int ret;
++      struct xen_memory_reservation reservation = {
++              .nr_extents   = 1,
++              .extent_order = 0,
++              .domid        = DOMID_SELF
++      };
++      set_xen_guest_handle(reservation.extent_start, &mfn);
++      set_pte_at(&init_mm, addr, pte, __pte_ma(0));
++      pfn = __pa(addr) >> PAGE_SHIFT;
++      set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++      SetPageReserved(pfn_to_page(pfn));
++      ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
++      /* Exactly one extent was submitted, so exactly one must be done. */
++      BUG_ON(ret != 1);
++      return 0;
++}
++#endif
++
++struct page **alloc_empty_pages_and_pagevec(int nr_pages)
++{
++      unsigned long flags;
++      void *v;
++      struct page *page, **pagevec;
++      int i, ret;
++
++      pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
++      if (pagevec == NULL)
++              return NULL;
++
++      for (i = 0; i < nr_pages; i++) {
++              balloon_lock(flags);
++              page = balloon_first_page();
++              if (page && !PageHighMem(page)) {
++                      UNLIST_PAGE(page);
++                      bs.balloon_low--;
++                      balloon_unlock(flags);
++                      pagevec[i] = page;
++                      continue;
++              }
++              balloon_unlock(flags);
++
++              page = pagevec[i] = alloc_page(GFP_KERNEL|__GFP_NOTRACK|__GFP_COLD);
++              if (page == NULL)
++                      goto err;
++
++              v = page_address(page);
++              scrub_pages(v, 1);
++
++              balloon_lock(flags);
++
++              if (xen_feature(XENFEAT_auto_translated_physmap)) {
++                      unsigned long gmfn = page_to_pfn(page);
++                      struct xen_memory_reservation reservation = {
++                              .nr_extents   = 1,
++                              .extent_order = 0,
++                              .domid        = DOMID_SELF
++                      };
++                      set_xen_guest_handle(reservation.extent_start, &gmfn);
++                      ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
++                                                 &reservation);
++                      if (ret == 1)
++                              ret = 0; /* success */
++              } else {
++#ifdef CONFIG_XEN
++                      ret = apply_to_page_range(&init_mm, (unsigned long)v,
++                                                PAGE_SIZE, dealloc_pte_fn,
++                                                NULL);
++#else
++                      /* Cannot handle non-auto translate mode. */
++                      ret = 1;
++#endif
++              }
++
++              if (ret != 0) {
++                      balloon_free_page(page);
++                      balloon_free_and_unlock(flags);
++                      goto err;
++              }
++
++              totalram_pages = --bs.current_pages - totalram_bias;
++              if (PageHighMem(page))
++                      dec_totalhigh_pages();
++              page_zone(page)->present_pages--;
++
++              balloon_unlock(flags);
++      }
++
++ out:
++      schedule_work(&balloon_worker);
++#ifdef CONFIG_XEN
++      flush_tlb_all();
++#endif
++      return pagevec;
++
++ err:
++      balloon_lock(flags);
++      while (--i >= 0)
++              balloon_append(pagevec[i], 0);
++      balloon_unlock(flags);
++      kfree(pagevec);
++      pagevec = NULL;
++      goto out;
++}
++EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
++
++#endif /* CONFIG_XEN_BACKEND */
++
++#ifdef CONFIG_XEN
++/*
++ * _free_empty_pages_and_pagevec - hand @nr_pages pages of @pagevec over to
++ * the balloon.
++ * @free_vec: when true the vector itself is freed as well; when false the
++ *            pages are additionally subtracted from bs.current_pages and
++ *            totalram_pages.
++ *
++ * Every page must have a reference count of exactly one. A NULL @pagevec
++ * is ignored. The balloon worker is scheduled afterwards.
++ */
++static void _free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages,
++                                        bool free_vec)
++{
++      unsigned long irqflags;
++      int idx;
++
++      if (!pagevec)
++              return;
++
++      balloon_lock(irqflags);
++
++      for (idx = 0; idx < nr_pages; idx++) {
++              struct page *pg = pagevec[idx];
++
++              BUG_ON(page_count(pg) != 1);
++              balloon_append(pg, !free_vec);
++      }
++
++      if (!free_vec) {
++              bs.current_pages -= nr_pages;
++              totalram_pages = bs.current_pages - totalram_bias;
++      }
++
++      balloon_unlock(irqflags);
++
++      if (free_vec)
++              kfree(pagevec);
++
++      schedule_work(&balloon_worker);
++}
++
++/*
++ * free_empty_pages - give @nr_pages pages from @pagevec to the balloon while
++ * leaving the vector itself allocated (free_vec == false variant).
++ */
++void free_empty_pages(struct page **pagevec, int nr_pages)
++{
++      _free_empty_pages_and_pagevec(pagevec, nr_pages, false);
++}
++#endif
++
++#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
++/*
++ * free_empty_pages_and_pagevec - give @nr_pages pages from @pagevec to the
++ * balloon and free the vector as well (free_vec == true variant).
++ */
++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
++{
++      _free_empty_pages_and_pagevec(pagevec, nr_pages, true);
++}
++EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
++#endif
++
++/*
++ * balloon_release_driver_page - return a single driver page to the balloon.
++ *
++ * Under the balloon lock the page is appended to the balloon, and both the
++ * current page count (with totalram_pages derived from it) and the driver
++ * page count drop by one. The balloon worker is scheduled afterwards.
++ */
++void balloon_release_driver_page(struct page *page)
++{
++      unsigned long irqflags;
++
++      balloon_lock(irqflags);
++
++      balloon_append(page, 1);
++
++      bs.current_pages--;
++      totalram_pages = bs.current_pages - totalram_bias;
++      --bs.driver_pages;
++
++      balloon_unlock(irqflags);
++
++      schedule_work(&balloon_worker);
++}
++EXPORT_SYMBOL_GPL(balloon_release_driver_page);
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/balloon/common.h

index 0000000,0000000..0a53f7a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/balloon/common.h
@@@ -1,0 -1,0 +1,57 @@@
++/******************************************************************************
++ * balloon/common.h
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_BALLOON_COMMON_H__
++#define __XEN_BALLOON_COMMON_H__
++
++/* Convert a page count into kB by shifting left by (PAGE_SHIFT - 10). */
++#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
++
++/*
++ * Page accounting of the balloon driver. A single instance, balloon_stats,
++ * is declared below and abbreviated as 'bs'. All members count pages.
++ */
++struct balloon_stats {
++      /* We aim for 'current allocation' == 'target allocation'. */
++      unsigned long current_pages;
++      unsigned long target_pages;
++      /*
++       * Drivers may alter the memory reservation independently, but they
++       * must inform the balloon driver so we avoid hitting the hard limit.
++       */
++      unsigned long driver_pages;
++      /* Number of pages in high- and low-memory balloons. */
++      unsigned long balloon_low;
++      unsigned long balloon_high;
++};
++
++extern struct balloon_stats balloon_stats;
++#define bs balloon_stats
++
++int balloon_sysfs_init(void);
++void balloon_sysfs_exit(void);
++
++void balloon_set_new_target(unsigned long target);
++unsigned long balloon_minimum_target(void);
++
++#endif /* __XEN_BALLOON_COMMON_H__ */
diff --cc drivers/xen/balloon/sysfs.c

index 0000000,0000000..24a9b2e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/balloon/sysfs.c
@@@ -1,0 -1,0 +1,204 @@@
++/******************************************************************************
++ * balloon/sysfs.c
++ *
++ * Xen balloon driver - sysfs interfaces.
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/capability.h>
++#include <linux/errno.h>
++#include <linux/init.h>
++#include <linux/mm.h>
++#include <linux/stat.h>
++#include <linux/string.h>
++#include <linux/sysdev.h>
++#include <linux/module.h>
++#include "common.h"
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++#define BALLOON_CLASS_NAME "xen_memory"
++
++/*
++ * BALLOON_SHOW(name, format, args...) defines show_<name>(), which prints
++ * @args into the sysfs buffer using @format, together with a read-only
++ * (S_IRUGO) sysdev attribute for it that has no store method.
++ */
++#define BALLOON_SHOW(name, format, args...)                   \
++      static ssize_t show_##name(struct sys_device *dev,      \
++                                 struct sysdev_attribute *attr, \
++                                 char *buf)                   \
++      {                                                       \
++              return sprintf(buf, format, ##args);            \
++      }                                                       \
++      static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL)
++
++/* Read-only values, all reported in kB. */
++BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages));
++BALLOON_SHOW(min_kb, "%lu\n", PAGES2KB(balloon_minimum_target()));
++BALLOON_SHOW(max_kb, "%lu\n", PAGES2KB(num_physpages));
++BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low));
++BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
++BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
++
++/* sysfs show method: prints the current target, converted to kB. */
++static ssize_t show_target_kb(struct sys_device *dev,
++                            struct sysdev_attribute *attr, char *buf)
++{
++      return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages));
++}
++
++/*
++ * sysfs store method: sets a new balloon target from a value given in kB.
++ *
++ * Returns @count when the value was handed on, -EPERM without
++ * CAP_SYS_ADMIN, or -EBADMSG when at most one character was written.
++ */
++static ssize_t store_target_kb(struct sys_device *dev,
++                             struct sysdev_attribute *attr,
++                             const char *buf, size_t count)
++{
++      unsigned long long kb;
++      char *rest;
++
++      if (!capable(CAP_SYS_ADMIN))
++              return -EPERM;
++
++      /* Nothing but (at most) a single character: too short. */
++      if (count < 2)
++              return -EBADMSG;
++
++      /* kB -> bytes -> pages; the remainder of the input is not checked. */
++      kb = simple_strtoull(buf, &rest, 0);
++      balloon_set_new_target((kb << 10) >> PAGE_SHIFT);
++
++      return count;
++}
++
++static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR,
++                 show_target_kb, store_target_kb);
++
++/*
++ * sysfs show method: prints the current target in bytes. The page count is
++ * widened to 64 bits before it is shifted by PAGE_SHIFT.
++ */
++static ssize_t show_target(struct sys_device *dev,
++                         struct sysdev_attribute *attr, char *buf)
++{
++      return sprintf(buf, "%llu\n",
++                     (unsigned long long)balloon_stats.target_pages
++                     << PAGE_SHIFT);
++}
++
++/*
++ * sysfs store method: sets a new balloon target from a byte value that is
++ * parsed by memparse().
++ *
++ * Returns @count when the value was handed on, -EPERM without
++ * CAP_SYS_ADMIN, or -EBADMSG when at most one character was written.
++ */
++static ssize_t store_target(struct sys_device *dev,
++                          struct sysdev_attribute *attr,
++                          const char *buf,
++                          size_t count)
++{
++      unsigned long long bytes;
++      char *rest;
++
++      if (!capable(CAP_SYS_ADMIN))
++              return -EPERM;
++
++      /* Nothing but (at most) a single character: too short. */
++      if (count < 2)
++              return -EBADMSG;
++
++      bytes = memparse(buf, &rest);
++      balloon_set_new_target(bytes >> PAGE_SHIFT);
++
++      return count;
++}
++
++static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR,
++                 show_target, store_target);
++
++/* Writable attributes, created one by one directly on the sysdev. */
++static struct sysdev_attribute *balloon_attrs[] = {
++      &attr_target_kb,
++      &attr_target,
++};
++
++/* Read-only attributes, exposed together through balloon_info_group. */
++static struct attribute *balloon_info_attrs[] = {
++      &attr_current_kb.attr,
++      &attr_min_kb.attr,
++      &attr_max_kb.attr,
++      &attr_low_kb.attr,
++      &attr_high_kb.attr,
++      &attr_driver_kb.attr,
++      NULL
++};
++
++/* Attribute group named "info". */
++static struct attribute_group balloon_info_group = {
++      .name = "info",
++      .attrs = balloon_info_attrs,
++};
++
++static struct sysdev_class balloon_sysdev_class = {
++      .name = BALLOON_CLASS_NAME,
++};
++
++/* The one device instance handed to register_balloon(). */
++static struct sys_device balloon_sysdev;
++
++/*
++ * register_balloon - register class, device and attributes in sysfs.
++ *
++ * Registers balloon_sysdev_class, then @sysdev (id 0) within that class,
++ * creates every file listed in balloon_attrs and finally the "info" group.
++ * Whatever had been set up is undone again if a later step fails.
++ *
++ * Returns 0 on success or the error code of the failing step.
++ */
++static int __init register_balloon(struct sys_device *sysdev)
++{
++      int created = 0;
++      int error;
++
++      error = sysdev_class_register(&balloon_sysdev_class);
++      if (error)
++              return error;
++
++      sysdev->id = 0;
++      sysdev->cls = &balloon_sysdev_class;
++
++      error = sysdev_register(sysdev);
++      if (error)
++              goto out_class;
++
++      while (created < ARRAY_SIZE(balloon_attrs)) {
++              error = sysdev_create_file(sysdev, balloon_attrs[created]);
++              if (error)
++                      goto out_files;
++              created++;
++      }
++
++      error = sysfs_create_group(&sysdev->kobj, &balloon_info_group);
++      if (!error)
++              return 0;
++
++ out_files:
++      /* Remove the files created so far, most recent one first. */
++      while (created-- > 0)
++              sysdev_remove_file(sysdev, balloon_attrs[created]);
++      sysdev_unregister(sysdev);
++ out_class:
++      sysdev_class_unregister(&balloon_sysdev_class);
++      return error;
++}
++
++/*
++ * unregister_balloon - remove everything register_balloon() set up: the
++ * "info" group, the files listed in balloon_attrs, the device and the class.
++ */
++static __exit void unregister_balloon(struct sys_device *sysdev)
++{
++      int n = 0;
++
++      sysfs_remove_group(&sysdev->kobj, &balloon_info_group);
++
++      while (n < ARRAY_SIZE(balloon_attrs))
++              sysdev_remove_file(sysdev, balloon_attrs[n++]);
++
++      sysdev_unregister(sysdev);
++      sysdev_class_unregister(&balloon_sysdev_class);
++}
++
++/*
++ * Set up the balloon sysfs interface for balloon_sysdev.
++ * Returns the result of register_balloon().
++ */
++int __init balloon_sysfs_init(void)
++{
++      return register_balloon(&balloon_sysdev);
++}
++
++/* Tear down the balloon sysfs interface of balloon_sysdev. */
++void __exit balloon_sysfs_exit(void)
++{
++      unregister_balloon(&balloon_sysdev);
++}
diff --cc drivers/xen/blkback/Makefile

index 0000000,0000000..599afe4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkback/Makefile
@@@ -1,0 -1,0 +1,4 @@@
++# The block backend (blkbk) and the page map helper are separate objects,
++# each controlled by its own configuration symbol.
++obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
++obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o
++
++# Objects combined into blkbk.o.
++blkbk-y       := blkback.o xenbus.o interface.o vbd.o cdrom.o
diff --cc drivers/xen/blkback/blkback-pagemap.c

index 0000000,0000000..3a52ead

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkback/blkback-pagemap.c
@@@ -1,0 -1,0 +1,97 @@@
++#include <linux/module.h>
++#include <linux/slab.h>
++#include "blkback-pagemap.h"
++
++static int blkback_pagemap_size;
++static struct blkback_pagemap *blkback_pagemap;
++
++/*
++ * Returns non-zero when @map is byte-wise identical to an all-zero entry,
++ * i.e. when the slot is unused.
++ */
++static inline int
++blkback_pagemap_entry_clear(struct blkback_pagemap *map)
++{
++      /* Static storage, hence zero-filled: the reference "empty" entry. */
++      static struct blkback_pagemap blank;
++
++      return memcmp(map, &blank, sizeof(blank)) == 0;
++}
++
++/*
++ * blkback_pagemap_init - allocate the zero-filled page map.
++ * @pages: number of entries the map shall hold.
++ *
++ * Returns 0 on success or -ENOMEM if the map cannot be allocated.
++ */
++int
++blkback_pagemap_init(int pages)
++{
++      /*
++       * kcalloc() checks the count * size multiplication for overflow
++       * instead of silently allocating a map that is too small.
++       */
++      blkback_pagemap = kcalloc(pages, sizeof(*blkback_pagemap),
++                                GFP_KERNEL);
++      if (!blkback_pagemap)
++              return -ENOMEM;
++
++      blkback_pagemap_size = pages;
++      return 0;
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_init);
++
++/*
++ * blkback_pagemap_set - record the grant a page is mapped from.
++ * @idx:   slot in the page map; must lie within [0, blkback_pagemap_size).
++ * @page:  page to tag; it is flagged via SetPageBlkback() and its private
++ *         field is set to @idx.
++ * @domid: value stored in the entry's domid field.
++ * @busid: value stored in the entry's busid field.
++ * @gref:  value stored in the entry's gref field.
++ *
++ * BUGs when the map is missing, @idx is out of range or the slot is
++ * already occupied.
++ */
++void
++blkback_pagemap_set(int idx, struct page *page,
++                  domid_t domid, busid_t busid, grant_ref_t gref)
++{
++      struct blkback_pagemap *entry;
++
++      BUG_ON(!blkback_pagemap);
++      /* idx is signed: a negative value must not slip past the check. */
++      BUG_ON(idx < 0 || idx >= blkback_pagemap_size);
++
++      SetPageBlkback(page);
++      set_page_private(page, idx);
++
++      entry = blkback_pagemap + idx;
++      if (!blkback_pagemap_entry_clear(entry)) {
++              pr_emerg("overwriting pagemap %d: d %u b %u g %u\n",
++                       idx, entry->domid, entry->busid, entry->gref);
++              BUG();
++      }
++
++      entry->domid = domid;
++      entry->busid = busid;
++      entry->gref  = gref;
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_set);
++
++/*
++ * blkback_pagemap_clear - zero the page map entry belonging to @page.
++ *
++ * The slot index is taken from the page's private field. BUGs when the map
++ * is missing, the page is not flagged PageBlkback, the index is out of
++ * range or the slot is already empty.
++ */
++void
++blkback_pagemap_clear(struct page *page)
++{
++      int idx;
++      struct blkback_pagemap *entry;
++
++      idx = (int)page_private(page);
++
++      BUG_ON(!blkback_pagemap);
++      BUG_ON(!PageBlkback(page));
++      /* idx is signed: a negative value must not slip past the check. */
++      BUG_ON(idx < 0 || idx >= blkback_pagemap_size);
++
++      entry = blkback_pagemap + idx;
++      if (blkback_pagemap_entry_clear(entry)) {
++              pr_emerg("clearing empty pagemap %d\n", idx);
++              BUG();
++      }
++
++      memset(entry, 0, sizeof(*entry));
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_clear);
++
++/*
++ * blkback_pagemap_read - return a copy of the page map entry of @page.
++ *
++ * The slot index is taken from the page's private field. BUGs when the map
++ * is missing, the page is not flagged PageBlkback, the index is out of
++ * range or the slot is empty.
++ */
++struct blkback_pagemap
++blkback_pagemap_read(struct page *page)
++{
++      int idx;
++      struct blkback_pagemap *entry;
++
++      idx = (int)page_private(page);
++
++      BUG_ON(!blkback_pagemap);
++      BUG_ON(!PageBlkback(page));
++      /* idx is signed: a negative value must not slip past the check. */
++      BUG_ON(idx < 0 || idx >= blkback_pagemap_size);
++
++      entry = blkback_pagemap + idx;
++      if (blkback_pagemap_entry_clear(entry)) {
++              pr_emerg("reading empty pagemap %d\n", idx);
++              BUG();
++      }
++
++      return *entry;
++}
++EXPORT_SYMBOL(blkback_pagemap_read);
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/blkback/blkback-pagemap.h

index 0000000,0000000..0becf22

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkback/blkback-pagemap.h
@@@ -1,0 -1,0 +1,37 @@@
++#ifndef _BLKBACK_PAGEMAP_H_
++#define _BLKBACK_PAGEMAP_H_
++
++#include <linux/mm.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/grant_table.h>
++
++typedef unsigned int busid_t;
++
++/*
++ * One page map entry. An entry whose bytes are all zero is treated as
++ * unused by the page map code.
++ */
++struct blkback_pagemap {
++      domid_t          domid;
++      busid_t          busid;
++      grant_ref_t      gref;
++};
++
++#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE)
++
++int blkback_pagemap_init(int);
++void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t);
++void blkback_pagemap_clear(struct page *);
++struct blkback_pagemap blkback_pagemap_read(struct page *);
++
++#else /* CONFIG_XEN_BLKBACK_PAGEMAP */
++
++/*
++ * Stubs used when the page map is not configured: init reports success,
++ * set and clear do nothing, and read must never be reached (it BUGs).
++ */
++static inline int blkback_pagemap_init(int pages) { return 0; }
++static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom,
++                                     busid_t bus, grant_ref_t gnt) {}
++static inline void blkback_pagemap_clear(struct page *page) {}
++static inline struct blkback_pagemap blkback_pagemap_read(struct page *page)
++{
++      BUG();
++      /* Only here to give the function a return value after BUG(). */
++      return (struct blkback_pagemap){-1, -1, -1};
++}
++
++#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */
++
++#endif
diff --cc drivers/xen/blkback/blkback.c

index 0000000,0000000..25b35be

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkback/blkback.c
@@@ -1,0 -1,0 +1,678 @@@
++/******************************************************************************
++ * arch/xen/drivers/blkif/backend/main.c
++ * 
++ * Back-end of the driver for virtual block devices. This portion of the
++ * driver exports a 'unified' block-device interface that can be accessed
++ * by any operating system that implements a compatible front end. A 
++ * reference front-end implementation can be found in:
++ *  arch/xen/drivers/blkif/frontend
++ * 
++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
++ * Copyright (c) 2005, Christopher Clark
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/kthread.h>
++#include <linux/freezer.h>
++#include <linux/list.h>
++#include <linux/delay.h>
++#include <xen/balloon.h>
++#include <xen/evtchn.h>
++#include <xen/gnttab.h>
++#include <asm/hypervisor.h>
++#include "common.h"
++
++/*
++ * These are rather arbitrary. They are fairly large because adjacent requests
++ * pulled from a communication ring are quite likely to end up being part of
++ * the same scatter/gather request at the disc.
++ * 
++ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
++ * 
++ * This will increase the chances of being able to write whole tracks.
++ * 64 should be enough to keep us competitive with Linux.
++ */
++static int blkif_reqs = 64;
++module_param_named(reqs, blkif_reqs, int, 0);
++MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
++
++/*
++ * Run-time switchable: /sys/module/blkbk/parameters/
++ * Both knobs are unsigned, hence registered with the matching 'uint'
++ * parameter type. Statics need no explicit zero initialiser.
++ */
++static unsigned int log_stats;
++static unsigned int debug_lvl;
++module_param(log_stats, uint, 0644);
++module_param(debug_lvl, uint, 0644);
++
++/*
++ * Each outstanding request that we've passed to the lower device layers has a 
++ * 'pending_req' allocated to it. Each buffer_head that completes decrements 
++ * the pendcnt towards zero. When it hits zero, the specified domain has a 
++ * response queued for it, with the saved 'id' passed back.
++ */
++typedef struct {
++      blkif_t       *blkif;           /* interface the request belongs to */
++      u64            id;              /* passed back through make_response() */
++      atomic_t       pendcnt;         /* the response goes out when this hits zero */
++      unsigned short nr_pages;        /* segments walked by fast_flush_area() */
++      unsigned short operation;       /* BLKIF_OP_* value of the request */
++      struct list_head free_list;     /* linkage on the pending_free list */
++} pending_req_t;
++
++static pending_req_t *pending_reqs;
++static struct list_head pending_free;
++static DEFINE_SPINLOCK(pending_free_lock);
++static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
++
++#define BLKBACK_INVALID_HANDLE (~0)
++
++static struct page **pending_pages;
++static grant_handle_t *pending_grant_handles;
++
++/*
++ * Index into pending_pages / pending_grant_handles for segment @seg of
++ * @req: each request owns BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive slots,
++ * ordered by the request's position within pending_reqs.
++ */
++static inline int vaddr_pagenr(pending_req_t *req, int seg)
++{
++      ptrdiff_t req_index = req - pending_reqs;
++
++      return req_index * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
++}
++
++#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
++
++/* Kernel virtual address of the page backing segment @seg of @req. */
++static inline unsigned long vaddr(pending_req_t *req, int seg)
++{
++      struct page *pg = pending_page(req, seg);
++      unsigned long frame = page_to_pfn(pg);
++
++      return (unsigned long)pfn_to_kaddr(frame);
++}
++
++#define pending_handle(_req, _seg) \
++      (pending_grant_handles[vaddr_pagenr(_req, _seg)])
++
++
++static int do_block_io_op(blkif_t *blkif);
++static void dispatch_rw_block_io(blkif_t *blkif,
++                               blkif_request_t *req,
++                               pending_req_t *pending_req);
++static void make_response(blkif_t *blkif, u64 id,
++                        unsigned short op, int st);
++
++/******************************************************************
++ * misc small helpers
++ */
++/*
++ * Take the first request off the pending_free list.
++ * Returns NULL when the list is empty.
++ */
++static pending_req_t* alloc_req(void)
++{
++      pending_req_t *found;
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&pending_free_lock, irqflags);
++      if (list_empty(&pending_free)) {
++              found = NULL;
++      } else {
++              found = list_entry(pending_free.next, pending_req_t, free_list);
++              list_del(&found->free_list);
++      }
++      spin_unlock_irqrestore(&pending_free_lock, irqflags);
++
++      return found;
++}
++
++/*
++ * Put @req back on the pending_free list. Waiters on pending_free_wq are
++ * woken only if the list was empty before, i.e. on the empty -> non-empty
++ * transition.
++ */
++static void free_req(pending_req_t *req)
++{
++      unsigned long irqflags;
++      int list_was_empty;
++
++      spin_lock_irqsave(&pending_free_lock, irqflags);
++      list_was_empty = list_empty(&pending_free);
++      list_add(&req->free_list, &pending_free);
++      spin_unlock_irqrestore(&pending_free_lock, irqflags);
++
++      if (!list_was_empty)
++              return;
++
++      wake_up(&pending_free_wq);
++}
++
++/*
++ * Drop the reference held on the currently remembered request queue, if
++ * any, and forget about that queue.
++ */
++static void unplug_queue(blkif_t *blkif)
++{
++      if (blkif->plug != NULL) {
++              kobject_put(&blkif->plug->kobj);
++              blkif->plug = NULL;
++      }
++}
++
++/*
++ * Remember the request queue of @bdev in blkif->plug, holding a kobject
++ * reference on it. A previously remembered, different queue is released
++ * first; nothing happens if the queue is already the remembered one.
++ */
++static void plug_queue(blkif_t *blkif, struct block_device *bdev)
++{
++      struct request_queue *queue = bdev_get_queue(bdev);
++
++      if (blkif->plug != queue) {
++              unplug_queue(blkif);
++              WARN_ON(test_bit(QUEUE_FLAG_DEAD, &queue->queue_flags));
++              kobject_get(&queue->kobj);
++              blkif->plug = queue;
++      }
++}
++
++/*
++ * fast_flush_area - unmap all grant mappings still held by @req.
++ *
++ * Every segment with a valid handle has its page map entry cleared and an
++ * unmap operation queued; the handle is then invalidated. All queued
++ * operations are submitted in a single hypercall, which is issued even
++ * when nothing was queued (invcount == 0). A failing hypercall BUGs.
++ */
++static void fast_flush_area(pending_req_t *req)
++{
++      struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++      unsigned int i, invcount = 0;
++      grant_handle_t handle;
++      int ret;
++
++      for (i = 0; i < req->nr_pages; i++) {
++              handle = pending_handle(req, i);
++              /* Segments without a valid handle have nothing to unmap. */
++              if (handle == BLKBACK_INVALID_HANDLE)
++                      continue;
++              blkback_pagemap_clear(pending_page(req, i));
++              gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
++                                  GNTMAP_host_map, handle);
++              pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
++              invcount++;
++      }
++
++      ret = HYPERVISOR_grant_table_op(
++              GNTTABOP_unmap_grant_ref, unmap, invcount);
++      BUG_ON(ret);
++}
++
++/******************************************************************
++ * SCHEDULER FUNCTIONS
++ */
++
++/*
++ * Log the request counters of @blkif, then restart the statistics
++ * interval: the oo/rd/wr/pk counters are zeroed and the next report is due
++ * ten seconds from now.
++ *
++ * NOTE(review): st_br_req is printed but never zeroed here, unlike the
++ * other counters - confirm whether that is intentional.
++ */
++static void print_stats(blkif_t *blkif)
++{
++      printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d  |  br %4d |  pk %4d\n",
++             current->comm,
++             blkif->st_oo_req,
++             blkif->st_rd_req,
++             blkif->st_wr_req,
++             blkif->st_br_req,
++             blkif->st_pk_req);
++
++      blkif->st_oo_req = 0;
++      blkif->st_rd_req = 0;
++      blkif->st_wr_req = 0;
++      blkif->st_pk_req = 0;
++
++      blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
++}
++
++/*
++ * blkif_schedule - thread function serving one block interface.
++ * @arg: the blkif_t to serve.
++ *
++ * Holds a reference on the interface while running. Until
++ * kthread_should_stop() becomes true the loop waits for work to be
++ * flagged and for a free pending request to exist, then processes the
++ * ring through do_block_io_op(). Always returns 0.
++ */
++int blkif_schedule(void *arg)
++{
++      blkif_t *blkif = arg;
++      struct vbd *vbd = &blkif->vbd;
++
++      blkif_get(blkif);
++
++      if (debug_lvl)
++              printk(KERN_DEBUG "%s: started\n", current->comm);
++
++      while (!kthread_should_stop()) {
++              if (try_to_freeze())
++                      continue;
++              /* Cached size differs from what vbd_size() reports now. */
++              if (unlikely(vbd->size != vbd_size(vbd)))
++                      vbd_resize(blkif);
++
++              /* Sleep until work has been flagged ... */
++              wait_event_interruptible(
++                      blkif->wq,
++                      blkif->waiting_reqs || kthread_should_stop());
++              /* ... and until the free list is not empty. */
++              wait_event_interruptible(
++                      pending_free_wq,
++                      !list_empty(&pending_free) || kthread_should_stop());
++
++              blkif->waiting_reqs = 0;
++              smp_mb(); /* clear flag *before* checking for work */
++
++              /* Non-zero means work is left over: go round again. */
++              if (do_block_io_op(blkif))
++                      blkif->waiting_reqs = 1;
++              unplug_queue(blkif);
++
++              if (log_stats && time_after(jiffies, blkif->st_print))
++                      print_stats(blkif);
++      }
++
++      if (log_stats)
++              print_stats(blkif);
++      if (debug_lvl)
++              printk(KERN_DEBUG "%s: exiting\n", current->comm);
++
++      blkif->xenblkd = NULL;
++      blkif_put(blkif);
++
++      return 0;
++}
++
++/******************************************************************
++ * COMPLETION CALLBACK -- Called as bio->bi_end_io()
++ */
++
++/*
++ * __end_block_io_op - account for one completed part of @pending_req.
++ * @error: completion status of that part; 0 means success.
++ *
++ * Once the last outstanding part has completed (pendcnt drops to zero) the
++ * grant mappings are torn down, the response is queued, the interface
++ * reference is dropped and the request returns to the free list.
++ *
++ * NOTE(review): 'status' is local to each invocation, so only the error
++ * seen by the invocation that drops pendcnt to zero reaches
++ * make_response(). An error reported by an earlier part of the same
++ * request appears to be forgotten, which looks at odds with "An error
++ * fails the entire request" - confirm.
++ */
++static void __end_block_io_op(pending_req_t *pending_req, int error)
++{
++      int status = BLKIF_RSP_OKAY;
++
++      /* An error fails the entire request. */
++      if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
++          (error == -EOPNOTSUPP)) {
++              DPRINTK("blkback: write barrier op failed, not supported\n");
++              blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
++              status = BLKIF_RSP_EOPNOTSUPP;
++      } else if (error) {
++              DPRINTK("Buffer not up-to-date at end of operation, "
++                      "error=%d\n", error);
++              status = BLKIF_RSP_ERROR;
++      }
++
++      if (atomic_dec_and_test(&pending_req->pendcnt)) {
++              fast_flush_area(pending_req);
++              make_response(pending_req->blkif, pending_req->id,
++                            pending_req->operation, status);
++              blkif_put(pending_req->blkif);
++              free_req(pending_req);
++      }
++}
++
++/*
++ * bio completion handler: forwards the result to the pending request stored
++ * in bio->bi_private, then drops the bio.
++ */
++static void end_block_io_op(struct bio *bio, int error)
++{
++      pending_req_t *pending_req = bio->bi_private;
++
++      __end_block_io_op(pending_req, error);
++      bio_put(bio);
++}
++
++
++/******************************************************************************
++ * NOTIFICATION FROM GUEST OS.
++ */
++
++/*
++ * Flag that @blkif has requests to look at and wake whoever sleeps on its
++ * wait queue.  The flag is set before the wake-up so the woken waiter sees it.
++ */
++static void blkif_notify_work(blkif_t *blkif)
++{
++      blkif->waiting_reqs = 1;
++      wake_up(&blkif->wq);
++}
++
++/*
++ * Interrupt handler for the interface's event channel (registered by
++ * blkif_map() with the blkif as @dev_id): just kicks the worker.
++ */
++irqreturn_t blkif_be_int(int irq, void *dev_id)
++{
++      blkif_t *blkif = dev_id;
++
++      blkif_notify_work(blkif);
++      return IRQ_HANDLED;
++}
++
++
++
++/******************************************************************
++ * DOWNWARD CALLS -- These interface with the block-device layer proper.
++ */
++
++/*
++ * Consume requests from the shared ring of @blkif and dispatch them.
++ *
++ * Each request is first copied into a private blkif_request_t (converted
++ * according to blkif->blk_protocol) and only that copy is inspected.
++ *
++ * Returns non-zero when the loop stopped early -- the thread was asked to
++ * stop, or no pending_req could be allocated -- so that the caller knows
++ * work is still outstanding; returns 0 otherwise.
++ */
++static int do_block_io_op(blkif_t *blkif)
++{
++      blkif_back_rings_t *blk_rings = &blkif->blk_rings;
++      blkif_request_t req;
++      pending_req_t *pending_req;
++      RING_IDX rc, rp;
++      int more_to_do = 0;
++
++      /* Snapshot consumer and producer indices once, up front. */
++      rc = blk_rings->common.req_cons;
++      rp = blk_rings->common.sring->req_prod;
++      rmb(); /* Ensure we see queued requests up to 'rp'. */
++
++      while ((rc != rp)) {
++
++              /* Stop if the ring macro reports a consumer overflow. */
++              if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
++                      break;
++
++              if (kthread_should_stop()) {
++                      more_to_do = 1;
++                      break;
++              }
++
++              pending_req = alloc_req();
++              if (NULL == pending_req) {
++                      /* Out of request slots: count it and retry later. */
++                      blkif->st_oo_req++;
++                      more_to_do = 1;
++                      break;
++              }
++
++              /* Take a private copy of the request in native layout. */
++              switch (blkif->blk_protocol) {
++              case BLKIF_PROTOCOL_NATIVE:
++                      memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
++                      break;
++              case BLKIF_PROTOCOL_X86_32:
++                      blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
++                      break;
++              case BLKIF_PROTOCOL_X86_64:
++                      blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
++                      break;
++              default:
++                      BUG();
++              }
++              blk_rings->common.req_cons = ++rc; /* before make_response() */
++
++              /* Apply all sanity checks to /private copy/ of request. */
++              barrier();
++
++              switch (req.operation) {
++              case BLKIF_OP_READ:
++                      blkif->st_rd_req++;
++                      dispatch_rw_block_io(blkif, &req, pending_req);
++                      break;
++              case BLKIF_OP_WRITE_BARRIER:
++                      /* Barriers are counted both as barriers and as writes. */
++                      blkif->st_br_req++;
++                      /* fall through */
++              case BLKIF_OP_WRITE:
++                      blkif->st_wr_req++;
++                      dispatch_rw_block_io(blkif, &req, pending_req);
++                      break;
++              case BLKIF_OP_PACKET:
++                      DPRINTK("error: block operation BLKIF_OP_PACKET not implemented\n");
++                      blkif->st_pk_req++;
++                      make_response(blkif, req.id, req.operation,
++                                    BLKIF_RSP_ERROR);
++                      free_req(pending_req);
++                      break;
++              default:
++                      /* A good sign something is wrong: sleep for a while to
++                       * avoid excessive CPU consumption by a bad guest. */
++                      msleep(1);
++                      DPRINTK("error: unknown block io operation [%d]\n",
++                              req.operation);
++                      make_response(blkif, req.id, req.operation,
++                                    BLKIF_RSP_ERROR);
++                      free_req(pending_req);
++                      break;
++              }
++
++              /* Yield point for this unbounded loop. */
++              cond_resched();
++      }
++
++      return more_to_do;
++}
++
++/*
++ * Turn one read/write/barrier request into bios against the backing device.
++ *
++ * @blkif:       interface the request arrived on
++ * @req:         private copy of the ring request
++ * @pending_req: slot tracking the request until all of its bios complete
++ *
++ * Steps: validate the segment list, grant-map the guest pages, translate the
++ * virtual extent with vbd_translate(), then build and submit bios.  On any
++ * failure an error response is produced and @pending_req is released, either
++ * directly (fail_flush/fail_response) or through __end_block_io_op()
++ * (fail_put_bio).
++ */
++static void dispatch_rw_block_io(blkif_t *blkif,
++                               blkif_request_t *req,
++                               pending_req_t *pending_req)
++{
++      struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++      struct phys_req preq;
++      struct { 
++              unsigned long buf; unsigned int nsec;
++      } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++      unsigned int nseg;
++      struct bio *bio = NULL;
++      uint32_t flags;
++      int ret, i;
++      int operation;
++
++      /* Map the blkif opcode to the block-layer operation. */
++      switch (req->operation) {
++      case BLKIF_OP_READ:
++              operation = READ;
++              break;
++      case BLKIF_OP_WRITE:
++              operation = WRITE;
++              break;
++      case BLKIF_OP_WRITE_BARRIER:
++              operation = WRITE_FLUSH_FUA;
++              break;
++      default:
++              operation = 0; /* make gcc happy */
++              BUG();
++      }
++
++      /* Check that number of segments is sane. */
++      /* Only a write barrier may carry zero segments. */
++      nseg = req->nr_segments;
++      if (unlikely(nseg == 0 && req->operation != BLKIF_OP_WRITE_BARRIER) ||
++          unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
++              DPRINTK("Bad number of segments in request (%d)\n", nseg);
++              goto fail_response;
++      }
++
++      preq.dev           = req->handle;
++      preq.sector_number = req->sector_number;
++      preq.nr_sects      = 0;
++
++      pending_req->blkif     = blkif;
++      pending_req->id        = req->id;
++      pending_req->operation = req->operation;
++      pending_req->nr_pages  = nseg;
++
++      /*
++       * Non-read operations map the guest pages read-only (presumably
++       * because the backend only reads from them when writing to disk).
++       */
++      flags = GNTMAP_host_map;
++      if (operation != READ)
++              flags |= GNTMAP_readonly;
++
++      /* Validate each segment and prepare its grant-map operation. */
++      for (i = 0; i < nseg; i++) {
++              seg[i].nsec = req->seg[i].last_sect -
++                      req->seg[i].first_sect + 1;
++
++              /* Sectors must lie within one page and be in order. */
++              if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
++                  (req->seg[i].last_sect < req->seg[i].first_sect))
++                      goto fail_response;
++              preq.nr_sects += seg[i].nsec;
++
++              gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
++                                req->seg[i].gref, blkif->domid);
++      }
++
++      ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
++      BUG_ON(ret);
++
++      /*
++       * Record every handle, valid or not, so that fast_flush_area() can
++       * clean up; 'ret' becomes sticky once any mapping has failed.
++       */
++      for (i = 0; i < nseg; i++) {
++              if (unlikely(map[i].status == GNTST_eagain))
++                      gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &map[i])
++              if (unlikely(map[i].status != GNTST_okay)) {
++                      DPRINTK("invalid buffer -- could not remap it\n");
++                      map[i].handle = BLKBACK_INVALID_HANDLE;
++                      ret = 1;
++              } else {
++                      blkback_pagemap_set(vaddr_pagenr(pending_req, i),
++                                          pending_page(pending_req, i),
++                                          blkif->domid, req->handle,
++                                          req->seg[i].gref);
++              }
++
++              pending_handle(pending_req, i) = map[i].handle;
++
++              if (ret)
++                      continue;
++
++              set_phys_to_machine(
++                      page_to_pfn(pending_page(pending_req, i)),
++                      FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
++              seg[i].buf  = map[i].dev_bus_addr | 
++                      (req->seg[i].first_sect << 9);
++      }
++
++      if (ret)
++              goto fail_flush;
++
++      /* Translate the request's extent; non-zero means access denied. */
++      if (vbd_translate(&preq, blkif, operation) != 0) {
++              DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
++                      operation == READ ? "read" : "write",
++                      preq.sector_number,
++                      preq.sector_number + preq.nr_sects, preq.dev);
++              goto fail_flush;
++      }
++
++      /*
++       * pendcnt starts at 1 for the bio submitted at the end of this
++       * function (or for the fail_put_bio path); every additional bio
++       * submitted inside the loop adds one more.
++       */
++      plug_queue(blkif, preq.bdev);
++      atomic_set(&pending_req->pendcnt, 1);
++      blkif_get(blkif);
++
++      for (i = 0; i < nseg; i++) {
++              /* Start sector and length must match the device block size. */
++              if (((int)preq.sector_number|(int)seg[i].nsec) &
++                  ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
++                      DPRINTK("Misaligned I/O request from domain %d",
++                              blkif->domid);
++                      goto fail_put_bio;
++              }
++
++              /*
++               * Add the page to the current bio; when there is no bio yet
++               * or the page does not fit, submit the current one (if any)
++               * and start a new bio at the current sector.
++               */
++              while ((bio == NULL) ||
++                     (bio_add_page(bio,
++                                   pending_page(pending_req, i),
++                                   seg[i].nsec << 9,
++                                   seg[i].buf & ~PAGE_MASK) == 0)) {
++                      if (bio) {
++                              atomic_inc(&pending_req->pendcnt);
++                              submit_bio(operation, bio);
++                      }
++
++                      bio = bio_alloc(GFP_KERNEL, nseg-i);
++                      if (unlikely(bio == NULL))
++                              goto fail_put_bio;
++
++                      bio->bi_bdev    = preq.bdev;
++                      bio->bi_private = pending_req;
++                      bio->bi_end_io  = end_block_io_op;
++                      bio->bi_sector  = preq.sector_number;
++              }
++
++              preq.sector_number += seg[i].nsec;
++      }
++
++      /* No segments at all: only legal for a flush/FUA, sent as an empty bio. */
++      if (!bio) {
++              BUG_ON(!(operation & (REQ_FLUSH|REQ_FUA)));
++              bio = bio_alloc(GFP_KERNEL, 0);
++              if (unlikely(bio == NULL))
++                      goto fail_put_bio;
++
++              bio->bi_bdev    = preq.bdev;
++              bio->bi_private = pending_req;
++              bio->bi_end_io  = end_block_io_op;
++              bio->bi_sector  = -1;
++      }
++
++      submit_bio(operation, bio);
++
++      if (operation == READ)
++              blkif->st_rd_sect += preq.nr_sects;
++      else
++              blkif->st_wr_sect += preq.nr_sects;
++
++      return;
++
++      /* Failure before any bio was set up: undo mappings, answer, release. */
++ fail_flush:
++      fast_flush_area(pending_req);
++ fail_response:
++      make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
++      free_req(pending_req);
++      msleep(1); /* back off a bit */
++      return;
++
++      /*
++       * Failure while building bios: drop the initial pendcnt reference via
++       * the completion path and free the bio that was never submitted.
++       */
++ fail_put_bio:
++      __end_block_io_op(pending_req, -EINVAL);
++      if (bio)
++              bio_put(bio);
++      unplug_queue(blkif);
++      msleep(1); /* back off a bit */
++      return;
++}
++
++
++
++/******************************************************************
++ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
++ */
++
++
++/*
++ * Queue a response on the shared ring of @blkif and notify as needed.
++ *
++ * @id: request identifier echoed back to the frontend
++ * @op: operation code of the request being answered
++ * @st: BLKIF_RSP_* status
++ *
++ * The ring is updated under blk_ring_lock.  If more requests are found on
++ * the ring the worker is kicked via blkif_notify_work(); if the ring macros
++ * ask for it, the frontend is notified through the interface's irq.
++ */
++static void make_response(blkif_t *blkif, u64 id,
++                        unsigned short op, int st)
++{
++      blkif_response_t  resp;
++      unsigned long     flags;
++      blkif_back_rings_t *blk_rings = &blkif->blk_rings;
++      int more_to_do = 0;
++      int notify;
++
++      /*
++       * The whole structure is copied into memory shared with the guest,
++       * so clear it first: any padding between or after the fields would
++       * otherwise expose stale backend stack contents.
++       */
++      memset(&resp, 0, sizeof(resp));
++      resp.id        = id;
++      resp.operation = op;
++      resp.status    = st;
++
++      spin_lock_irqsave(&blkif->blk_ring_lock, flags);
++      /* Place on the response ring for the relevant domain. */
++      switch (blkif->blk_protocol) {
++      case BLKIF_PROTOCOL_NATIVE:
++              memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
++                     &resp, sizeof(resp));
++              break;
++      case BLKIF_PROTOCOL_X86_32:
++              memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
++                     &resp, sizeof(resp));
++              break;
++      case BLKIF_PROTOCOL_X86_64:
++              memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
++                     &resp, sizeof(resp));
++              break;
++      default:
++              BUG();
++      }
++      blk_rings->common.rsp_prod_pvt++;
++      RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
++      if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
++              /*
++               * Tail check for pending requests. Allows frontend to avoid
++               * notifications if requests are already in flight (lower
++               * overheads and promotes batching).
++               */
++              RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
++
++      } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
++              more_to_do = 1;
++      }
++
++      spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
++
++      /* Both notifications are issued after the ring lock is dropped. */
++      if (more_to_do)
++              blkif_notify_work(blkif);
++      if (notify)
++              notify_remote_via_irq(blkif->irq);
++}
++
++/*
++ * Module initialisation: allocate the global request pool, the grant-handle
++ * table and the page vector, then bring up the interface cache and the
++ * xenbus backend.
++ *
++ * Returns 0 on success, -ENODEV when not running on Xen, and -ENOMEM when
++ * any allocation (or blkback_pagemap_init()) fails.
++ */
++static int __init blkif_init(void)
++{
++      int i, mmap_pages;
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
++
++      /*
++       * kcalloc() checks the count * size multiplication for overflow, so
++       * an absurd blkif_reqs value yields NULL instead of a short buffer.
++       */
++      pending_reqs          = kcalloc(blkif_reqs, sizeof(pending_reqs[0]),
++                                      GFP_KERNEL);
++      pending_grant_handles = kcalloc(mmap_pages,
++                                      sizeof(pending_grant_handles[0]),
++                                      GFP_KERNEL);
++      pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);
++
++      if (blkback_pagemap_init(mmap_pages))
++              goto out_of_memory;
++
++      /*
++       * NOTE(review): if we bail out here the state set up by
++       * blkback_pagemap_init() is not torn down -- confirm whether it
++       * needs an explicit cleanup call.
++       */
++      if (!pending_reqs || !pending_grant_handles || !pending_pages)
++              goto out_of_memory;
++
++      for (i = 0; i < mmap_pages; i++)
++              pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
++
++      blkif_interface_init();
++
++      /* Every request slot starts out on the free list. */
++      INIT_LIST_HEAD(&pending_free);
++
++      for (i = 0; i < blkif_reqs; i++)
++              list_add_tail(&pending_reqs[i].free_list, &pending_free);
++
++      blkif_xenbus_init();
++
++      return 0;
++
++ out_of_memory:
++      kfree(pending_reqs);
++      kfree(pending_grant_handles);
++      free_empty_pages_and_pagevec(pending_pages, mmap_pages);
++      pr_warning("%s: out of memory\n", __FUNCTION__);
++      return -ENOMEM;
++}
++
++module_init(blkif_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/blkback/cdrom.c

index 0000000,0000000..9679007

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkback/cdrom.c
@@@ -1,0 -1,0 +1,162 @@@
++/******************************************************************************
++ * blkback/cdrom.c
++ *
++ * Routines for managing cdrom watch and media-present attribute of a
++ * cdrom type virtual block device (VBD).
++ *
++ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
++ * Copyright (c) 2007       Pat Campbell
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++
++/*
++ * Replace the DPRINTK pulled in from common.h with a variant that always
++ * prints at KERN_DEBUG, adds the calling function's name and appends the
++ * trailing newline itself.
++ */
++#undef DPRINTK
++#define DPRINTK(_f, _a...) \
++      printk(KERN_DEBUG "(%s() file=%s, line=%d) " _f "\n", \
++             __func__, __FILE__ , __LINE__ , ##_a )
++
++
++#define MEDIA_PRESENT "media-present"
++
++static void cdrom_media_changed(struct xenbus_watch *, const char **, unsigned int);
++
++/**
++ * Writes media-present=1 attribute for the given vbd device if not
++ * already there.
++ *
++ * Returns 0 when the node already exists or was written and committed,
++ * -1 when the transaction could not be started, written or committed
++ * (the failure is also reported through xenbus_dev_fatal()).
++ */
++static int cdrom_xenstore_write_media_present(struct backend_info *be)
++{
++      struct xenbus_device *dev = be->dev;
++      struct xenbus_transaction xbt;
++      int err;
++      int media_present;
++
++      /* A positive result means the node could be read: nothing to do. */
++      err = xenbus_scanf(XBT_NIL, dev->nodename, MEDIA_PRESENT, "%d",
++                         &media_present);
++      if (0 < err) {
++              DPRINTK("already written err%d", err);
++              return 0;
++      }
++      media_present = 1;
++
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "starting transaction");
++              return -1;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, MEDIA_PRESENT, "%d", media_present );
++      if (err) {
++              xenbus_dev_fatal(dev, err, "writing %s/%s",
++                       dev->nodename, MEDIA_PRESENT);
++              goto abort;
++      }
++
++      /* -EAGAIN from the commit means: redo the whole transaction. */
++      err = xenbus_transaction_end(xbt, 0);
++      if (err == -EAGAIN)
++              goto again;
++      if (err) {
++              /* The commit failed, so do not report success. */
++              xenbus_dev_fatal(dev, err, "ending transaction");
++              return -1;
++      }
++      return 0;
++ abort:
++      xenbus_transaction_end(xbt, 1);
++      return -1;
++}
++
++/**
++ * Returns non-zero when the vbd type of @be has both the VDISK_CDROM bit
++ * and the GENHD_FL_REMOVABLE bit set.
++ *
++ * NOTE(review): vbd.type is otherwise documented as holding VDISK_xxx
++ * flags; testing it against a GENHD_FL_* constant looks like it may have
++ * been meant to be VDISK_REMOVABLE -- confirm before changing.
++ */
++static int cdrom_is_type(struct backend_info *be)
++{
++      unsigned char type = be->blkif->vbd.type;
++
++      DPRINTK("type:%x", type);
++      if (!(type & VDISK_CDROM))
++              return 0;
++      return (type & GENHD_FL_REMOVABLE) != 0;
++}
++
++/**
++ * For a cdrom-type vbd, make sure the media-present node exists and then
++ * register cdrom_media_changed() as a watch on it.  Does nothing for other
++ * device types or when the node could not be written; a failure to add the
++ * watch is only logged.
++ */
++void cdrom_add_media_watch(struct backend_info *be)
++{
++      struct xenbus_device *dev = be->dev;
++      int err;
++
++      DPRINTK("nodename:%s", dev->nodename);
++      if (!cdrom_is_type(be))
++              return;
++
++      DPRINTK("is a cdrom");
++      if (cdrom_xenstore_write_media_present(be) != 0)
++              return;
++
++      DPRINTK( "xenstore wrote OK");
++      err = xenbus_watch_path2(dev, dev->nodename, MEDIA_PRESENT,
++                               &be->cdrom_watch,
++                               cdrom_media_changed);
++      if (err)
++              DPRINTK( "media_present watch add failed" );
++}
++
++/**
++ * Callback received when the "media-present" xenstore node is changed.
++ *
++ * Reads the new value: 0 releases the vbd, any other value (re)creates it
++ * when no block device is currently attached.  The callback is ignored for
++ * non-cdrom devices and when the node cannot be read.
++ */
++static void cdrom_media_changed(struct xenbus_watch *watch,
++                              const char **vec, unsigned int len)
++{
++      int err;
++      int media_present;
++      struct backend_info *be
++              = container_of(watch, struct backend_info, cdrom_watch);
++      struct xenbus_device *dev = be->dev;
++
++      if (!cdrom_is_type(be)) {
++              DPRINTK("callback not for a cdrom" );
++              return;
++      }
++
++      /*
++       * Anything but a positive result means media_present was not
++       * filled in, so it must not be looked at.
++       */
++      err = xenbus_scanf(XBT_NIL, dev->nodename, MEDIA_PRESENT, "%d",
++                         &media_present);
++      if (err <= 0) {
++              DPRINTK("xenbus_read of cdrom media_present node error:%d",err);
++              return;
++      }
++
++      if (media_present == 0) {
++              vbd_free(&be->blkif->vbd);
++      } else if (!be->blkif->vbd.bdev) {
++              /* The handle is the last component of the frontend path. */
++              const char *slash = strrchr(dev->otherend, '/');
++              const char *p = slash ? slash + 1 : dev->otherend;
++              unsigned long handle = simple_strtoul(p, NULL, 0);
++
++              err = vbd_create(be->blkif, handle, be->major, be->minor,
++                               !strchr(be->mode, 'w'), 1);
++              if (err) {
++                      be->major = be->minor = 0;
++                      xenbus_dev_fatal(dev, err, "creating vbd structure");
++                      return;
++              }
++      }
++}
diff --cc drivers/xen/blkback/common.h

index 0000000,0000000..7ac10df

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkback/common.h
@@@ -1,0 -1,0 +1,149 @@@
++/* 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __BLKIF__BACKEND__COMMON_H__
++#define __BLKIF__BACKEND__COMMON_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/blkdev.h>
++#include <linux/wait.h>
++#include <asm/hypervisor.h>
++#include <xen/blkif.h>
++#include <xen/xenbus.h>
++#include <xen/interface/event_channel.h>
++#include "blkback-pagemap.h"
++
++
++/*
++ * Debug print helper: forwards to pr_debug() and prefixes the message with
++ * the source file and line.  No newline is appended; callers supply it.
++ */
++#define DPRINTK(_f, _a...)                    \
++      pr_debug("(file=%s, line=%d) " _f,      \
++               __FILE__ , __LINE__ , ## _a )
++
++/* Description of one virtual block device exported to a guest. */
++struct vbd {
++      blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
++      unsigned char  readonly;    /* Non-zero -> read-only */
++      unsigned char  type;        /* VDISK_xxx */
++      u32            pdevice;     /* phys device that this vbd maps to */
++      struct block_device *bdev;  /* NULL while no device is attached */
++      sector_t       size;        /* Cached size parameter */
++};
++
++struct backend_info;
++
++/*
++ * Per-interface state of the block backend: one instance per connected
++ * frontend, allocated by blkif_alloc() and released by blkif_free().
++ */
++typedef struct blkif_st {
++      /* Unique identifier for this interface. */
++      domid_t           domid;
++      unsigned int      handle;
++      /* Physical parameters of the comms window. */
++      /* irq bound to the event channel; 0 while not connected. */
++      unsigned int      irq;
++      /* Comms information. */
++      /* Ring ABI in use; selects which member of blk_rings is valid. */
++      enum blkif_protocol blk_protocol;
++      blkif_back_rings_t blk_rings;
++      /* Mapping of the shared ring page set up by blkif_map(). */
++      struct vm_struct *blk_ring_area;
++      /* The VBD attached to this interface. */
++      struct vbd        vbd;
++      /* Back pointer to the backend_info. */
++      struct backend_info *be;
++      /* Private fields. */
++      /* Taken around response-ring updates. */
++      spinlock_t       blk_ring_lock;
++      /* Adjusted with blkif_get()/blkif_put(). */
++      atomic_t         refcnt;
++
++      /* The worker thread sleeps here until waiting_reqs is set. */
++      wait_queue_head_t   wq;
++      /* Worker kthread; stopped by blkif_disconnect(). */
++      struct task_struct  *xenblkd;
++      /* Non-zero when the ring may hold unprocessed requests. */
++      unsigned int        waiting_reqs;
++      /* presumably the queue currently plugged by plug_queue() -- confirm */
++      struct request_queue *plug;
++
++      /* statistics */
++      /* jiffies value compared against when deciding to print stats. */
++      unsigned long       st_print;
++      int                 st_rd_req;   /* read requests */
++      int                 st_wr_req;   /* write requests (incl. barriers) */
++      int                 st_oo_req;   /* times no pending_req was free */
++      int                 st_br_req;   /* write-barrier requests */
++      int                 st_pk_req;   /* BLKIF_OP_PACKET requests */
++      int                 st_rd_sect;  /* sectors read */
++      int                 st_wr_sect;  /* sectors written */
++
++      /* Woken by blkif_put() when refcnt reaches zero. */
++      wait_queue_head_t waiting_to_free;
++} blkif_t;
++
++/* Xenbus-side bookkeeping for one backend device. */
++struct backend_info
++{
++      struct xenbus_device *dev;
++      blkif_t *blkif;
++      struct xenbus_watch backend_watch;
++      /* Watch on the media-present node, see cdrom_add_media_watch(). */
++      struct xenbus_watch cdrom_watch;
++      /* Device numbers handed to vbd_create(). */
++      unsigned major;
++      unsigned minor;
++      /* Access mode string; a 'w' in it means the vbd is writable. */
++      char *mode;
++};
++
++blkif_t *blkif_alloc(domid_t domid);
++void blkif_disconnect(blkif_t *blkif);
++void blkif_free(blkif_t *blkif);
++int blkif_map(blkif_t *blkif, grant_ref_t, evtchn_port_t);
++void vbd_resize(blkif_t *blkif);
++
++/* Take a reference on an interface. */
++#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
++/*
++ * Drop a reference; when the count reaches zero, wake anyone sleeping on
++ * waiting_to_free.  Note that _b is evaluated more than once.
++ */
++#define blkif_put(_b)                                 \
++      do {                                            \
++              if (atomic_dec_and_test(&(_b)->refcnt)) \
++                      wake_up(&(_b)->waiting_to_free);\
++      } while (0)
++
++/* Create a vbd. */
++int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
++             unsigned minor, int readonly, int cdrom);
++void vbd_free(struct vbd *vbd);
++
++unsigned long long vbd_size(struct vbd *vbd);
++unsigned int vbd_info(struct vbd *vbd);
++unsigned long vbd_secsize(struct vbd *vbd);
++
++/*
++ * Extent of one request, passed to vbd_translate(): the caller fills in
++ * dev, nr_sects and sector_number and afterwards uses bdev as the device
++ * to submit I/O to.
++ */
++struct phys_req {
++      unsigned short       dev;            /* virtual device handle */
++      unsigned short       nr_sects;       /* total sectors in the request */
++      struct block_device *bdev;           /* backing block device */
++      blkif_sector_t       sector_number;  /* first sector */
++};
++
++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
++
++void blkif_interface_init(void);
++
++void blkif_xenbus_init(void);
++
++irqreturn_t blkif_be_int(int irq, void *dev_id);
++int blkif_schedule(void *arg);
++
++int blkback_barrier(struct xenbus_transaction xbt,
++                  struct backend_info *be, int state);
++
++/* cdrom media change */
++void cdrom_add_media_watch(struct backend_info *be);
++
++#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --cc drivers/xen/blkback/interface.c

index 0000000,0000000..968899d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkback/interface.c
@@@ -1,0 -1,0 +1,144 @@@
++/******************************************************************************
++ * arch/xen/drivers/blkif/backend/interface.c
++ * 
++ * Block-device interface management.
++ * 
++ * Copyright (c) 2004, Keir Fraser
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++#include <xen/evtchn.h>
++#include <linux/kthread.h>
++#include <linux/vmalloc.h>
++
++static struct kmem_cache *blkif_cachep;
++
++/*
++ * Allocate a zeroed interface for domain @domid, holding one reference.
++ * Returns the new interface, or ERR_PTR(-ENOMEM) if the allocation fails.
++ */
++blkif_t *blkif_alloc(domid_t domid)
++{
++      blkif_t *blkif = kmem_cache_zalloc(blkif_cachep, GFP_KERNEL);
++
++      if (blkif == NULL)
++              return ERR_PTR(-ENOMEM);
++
++      /* Plain fields first ... */
++      blkif->domid    = domid;
++      blkif->st_print = jiffies;
++      atomic_set(&blkif->refcnt, 1);
++
++      /* ... then the lock and the two wait queues. */
++      spin_lock_init(&blkif->blk_ring_lock);
++      init_waitqueue_head(&blkif->wq);
++      init_waitqueue_head(&blkif->waiting_to_free);
++
++      return blkif;
++}
++
++/*
++ * Connect @blkif to its frontend: map the shared ring granted as @ring_ref,
++ * initialise the back ring for the negotiated protocol and bind @evtchn to
++ * blkif_be_int().
++ *
++ * Returns 0 on success or if the interface is already connected, otherwise
++ * the negative error from the mapping or the event-channel binding.
++ */
++int blkif_map(blkif_t *blkif, grant_ref_t ring_ref, evtchn_port_t evtchn)
++{
++      struct vm_struct *area;
++      int err;
++
++      /* Already connected through? */
++      if (blkif->irq)
++              return 0;
++
++      area = xenbus_map_ring_valloc(blkif->be->dev, ring_ref);
++      if (IS_ERR(area))
++              return PTR_ERR(area);
++      blkif->blk_ring_area = area;
++
++      /* The ring layout depends on the frontend's ABI. */
++      switch (blkif->blk_protocol) {
++      case BLKIF_PROTOCOL_NATIVE:
++      {
++              blkif_sring_t *sring;
++              sring = (blkif_sring_t *)area->addr;
++              BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
++              break;
++      }
++      case BLKIF_PROTOCOL_X86_32:
++      {
++              blkif_x86_32_sring_t *sring_x86_32;
++              sring_x86_32 = (blkif_x86_32_sring_t *)area->addr;
++              BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
++              break;
++      }
++      case BLKIF_PROTOCOL_X86_64:
++      {
++              blkif_x86_64_sring_t *sring_x86_64;
++              sring_x86_64 = (blkif_x86_64_sring_t *)area->addr;
++              BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
++              break;
++      }
++      default:
++              BUG();
++      }
++
++      /* A non-negative return value is the irq that was bound. */
++      err = bind_interdomain_evtchn_to_irqhandler(
++              blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
++      if (err < 0)
++      {
++              /*
++               * Undo the mapping; sring is reset so blkif_disconnect()
++               * does not unmap it again (blk_ring_area is left stale).
++               */
++              xenbus_unmap_ring_vfree(blkif->be->dev, area);
++              blkif->blk_rings.common.sring = NULL;
++              return err;
++      }
++      blkif->irq = err;
++
++      return 0;
++}
++
++/*
++ * Tear down the connection to the frontend: stop the worker thread, wait
++ * until all other references are gone, then release the irq and the ring
++ * mapping.  The interface itself stays allocated (see blkif_free()).
++ */
++void blkif_disconnect(blkif_t *blkif)
++{
++      if (blkif->xenblkd) {
++              kthread_stop(blkif->xenblkd);
++              blkif->xenblkd = NULL;
++      }
++
++      /*
++       * Temporarily give up our own reference so that the count can hit
++       * zero once every other holder has called blkif_put(), then take
++       * it back.
++       */
++      atomic_dec(&blkif->refcnt);
++      wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
++      atomic_inc(&blkif->refcnt);
++
++      if (blkif->irq) {
++              unbind_from_irqhandler(blkif->irq, blkif);
++              blkif->irq = 0;
++      }
++
++      /* A non-NULL sring marks the ring as still mapped. */
++      if (blkif->blk_rings.common.sring) {
++              xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring_area);
++              blkif->blk_rings.common.sring = NULL;
++      }
++}
++
++/*
++ * Release an interface.  The caller must hold the only remaining
++ * reference; anything else is treated as a bug.
++ */
++void blkif_free(blkif_t *blkif)
++{
++      int last_ref = atomic_dec_and_test(&blkif->refcnt);
++
++      if (!last_ref)
++              BUG();
++      kmem_cache_free(blkif_cachep, blkif);
++}
++
++/*
++ * Create the slab cache that blkif_alloc() allocates interfaces from.
++ * NOTE(review): the result of kmem_cache_create() is not checked here.
++ */
++void __init blkif_interface_init(void)
++{
++      blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
++                                       0, 0, NULL);
++}
diff --cc drivers/xen/blkback/vbd.c

index 0000000,0000000..195b260

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkback/vbd.c
@@@ -1,0 -1,0 +1,169 @@@
++/******************************************************************************
++ * blkback/vbd.c
++ * 
++ * Routines for managing virtual block devices (VBDs).
++ * 
++ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++
++/*
++ * Size of the device backing a VBD: the partition's nr_sects when the
++ * block device has a bd_part, otherwise get_capacity() of its bd_disk.
++ * The argument is expanded several times, so it must be free of side
++ * effects.
++ */
++#define vbd_sz(_v)   ((_v)->bdev->bd_part ?                           \
++      (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
++
++/* Function wrapper around vbd_sz(); vbd->bdev must be set by the caller. */
++unsigned long long vbd_size(struct vbd *vbd)
++{
++      return vbd_sz(vbd);
++}
++
++/*
++ * Build the info word for a VBD: its type bits, with VDISK_READONLY
++ * added when the device was opened read-only.
++ */
++unsigned int vbd_info(struct vbd *vbd)
++{
++      unsigned int info = vbd->type;
++
++      if (vbd->readonly)
++              info |= VDISK_READONLY;
++
++      return info;
++}
++
++/* Return bdev_logical_block_size() of the VBD's backing block device. */
++unsigned long vbd_secsize(struct vbd *vbd)
++{
++      return bdev_logical_block_size(vbd->bdev);
++}
++
++/*
++ * Open the physical device major:minor and attach it to blkif->vbd.
++ *
++ * @blkif:    interface that owns the VBD; also used as the holder for
++ *            the exclusive open
++ * @handle:   virtual device handle stored in the VBD
++ * @major:    major number of the backing device
++ * @minor:    minor number of the backing device
++ * @readonly: non-zero to open the device read-only (and non-exclusive)
++ * @cdrom:    non-zero to force the VDISK_CDROM type bit
++ *
++ * Returns 0 on success or -ENOENT when the device cannot be opened or
++ * has no gendisk attached.  On the latter failure the device is released
++ * again through vbd_free().
++ */
++int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
++             unsigned minor, int readonly, int cdrom)
++{
++      struct vbd *vbd;
++      struct block_device *bdev;
++
++      vbd = &blkif->vbd;
++      vbd->handle   = handle;
++      vbd->readonly = readonly;
++      vbd->type     = 0;
++
++      vbd->pdevice  = MKDEV(major, minor);
++
++      bdev = blkdev_get_by_dev(vbd->pdevice,
++                               FMODE_READ | (vbd->readonly ? 0
++                                             : FMODE_WRITE | FMODE_EXCL),
++                               blkif);
++
++      if (IS_ERR(bdev)) {
++              DPRINTK("vbd_creat: device %08x could not be opened.\n",
++                      vbd->pdevice);
++              return -ENOENT;
++      }
++
++      vbd->bdev = bdev;
++
++      if (vbd->bdev->bd_disk == NULL) {
++              DPRINTK("vbd_creat: device %08x doesn't exist.\n",
++                      vbd->pdevice);
++              vbd_free(vbd);
++              return -ENOENT;
++      }
++
++      /*
++       * Size the device only after bd_disk has been validated: vbd_sz()
++       * passes bd_disk to get_capacity() when there is no bd_part.
++       */
++      vbd->size = vbd_size(vbd);
++
++      if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
++              vbd->type |= VDISK_CDROM;
++      if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
++              vbd->type |= VDISK_REMOVABLE;
++
++      DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
++              handle, blkif->domid);
++      return 0;
++}
++
++/*
++ * Release the backing block device of a VBD, if one is attached, using
++ * the same open-mode expression as vbd_create(), and clear the pointer.
++ * Calling it with vbd->bdev already NULL only re-clears the pointer.
++ */
++void vbd_free(struct vbd *vbd)
++{
++      if (vbd->bdev)
++              blkdev_put(vbd->bdev,
++                         FMODE_READ | (vbd->readonly ? 0
++                                       : FMODE_WRITE | FMODE_EXCL));
++      vbd->bdev = NULL;
++}
++
++/*
++ * Validate a request against the VBD and fill in its physical target.
++ *
++ * @req:       request; sector_number and nr_sects are read, dev and bdev
++ *             are written on success
++ * @blkif:     interface whose VBD the request is aimed at
++ * @operation: READ or a non-READ operation
++ *
++ * Returns 0 on success, or -EACCES when the operation would write to a
++ * read-only VBD, no backing device is attached, or the sector range does
++ * not fit inside the device.
++ */
++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
++{
++      struct vbd *vbd = &blkif->vbd;
++      int rc = -EACCES;
++
++      if ((operation != READ) && vbd->readonly)
++              goto out;
++
++      if (vbd->bdev == NULL)
++              goto out;
++
++      /*
++       * The range comes from the guest: reject one whose end wraps
++       * around, since a wrapped sum would slip under the size check.
++       */
++      if (unlikely((req->sector_number + req->nr_sects) <
++                   req->sector_number))
++              goto out;
++
++      if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
++              goto out;
++
++      req->dev  = vbd->pdevice;
++      req->bdev = vbd->bdev;
++      rc = 0;
++
++ out:
++      return rc;
++}
++
++/*
++ * Record the current size of the backing device in vbd->size and publish
++ * it to the store: the "sectors" node is rewritten and the current device
++ * state is written again in the same transaction.  The transaction is
++ * retried for as long as ending it returns -EAGAIN; any other error is
++ * logged and the update is abandoned.
++ */
++void vbd_resize(blkif_t *blkif)
++{
++      struct vbd *vbd = &blkif->vbd;
++      struct xenbus_transaction xbt;
++      int err;
++      struct xenbus_device *dev = blkif->be->dev;
++      unsigned long long new_size = vbd_size(vbd);
++
++      pr_info("VBD Resize: new size %llu\n", new_size);
++      vbd->size = new_size;
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              pr_warning("Error %d starting transaction\n", err);
++              return;
++      }
++      err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
++                          vbd_size(vbd));
++      if (err) {
++              pr_warning("Error %d writing new size\n", err);
++              goto abort;
++      }
++      /*
++       * Write the current state; we will use this to synchronize
++       * the front-end. If the current state is "connected" the
++       * front-end will get the new size information online.
++       */
++      err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
++      if (err) {
++              pr_warning("Error %d writing the state\n", err);
++              goto abort;
++      }
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err == -EAGAIN)
++              goto again;
++      if (err)
++              pr_warning("Error %d ending transaction\n", err);
++      return;
++abort:
++      /* Second argument 1: abort the transaction instead of committing. */
++      xenbus_transaction_end(xbt, 1);
++}
diff --cc drivers/xen/blkback/xenbus.c

index 0000000,0000000..0225989

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkback/xenbus.c
@@@ -1,0 -1,0 +1,569 @@@
++/*  Xenbus code for blkif backend
++    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
++    Copyright (C) 2005 XenSource Ltd
++
++    This program is free software; you can redistribute it and/or modify
++    it under the terms of the GNU General Public License as published by
++    the Free Software Foundation; either version 2 of the License, or
++    (at your option) any later version.
++
++    This program is distributed in the hope that it will be useful,
++    but WITHOUT ANY WARRANTY; without even the implied warranty of
++    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++    GNU General Public License for more details.
++
++    You should have received a copy of the GNU General Public License
++    along with this program; if not, write to the Free Software
++    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++*/
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include "common.h"
++#include "../core/domctl.h"
++
++/*
++ * Replace any earlier DPRINTK with a pr_debug() wrapper that prefixes the
++ * message with the calling function and line, and appends ".\n" itself.
++ */
++#undef DPRINTK
++#define DPRINTK(fmt, args...)                         \
++      pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",   \
++               __FUNCTION__, __LINE__, ##args)
++
++static void connect(struct backend_info *);
++static int connect_ring(struct backend_info *);
++static void backend_changed(struct xenbus_watch *, const char **,
++                          unsigned int);
++
++/*
++ * Format the worker-thread name "blkback.<domid>.<device>" into buf.
++ * <device> is the backend's "dev" node, cut down to what follows the
++ * first "/dev/" when that prefix is present.  At most TASK_COMM_LEN
++ * bytes are written.  Returns 0, or the error from reading the node.
++ */
++static int blkback_name(blkif_t *blkif, char *buf)
++{
++      char *path, *leaf;
++
++      path = xenbus_read(XBT_NIL, blkif->be->dev->nodename, "dev", NULL);
++      if (IS_ERR(path))
++              return PTR_ERR(path);
++
++      leaf = strstr(path, "/dev/");
++      leaf = (leaf != NULL) ? leaf + strlen("/dev/") : path;
++
++      snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, leaf);
++      kfree(path);
++
++      return 0;
++}
++
++/*
++ * Bring the interface to the Connected state once both halves are ready
++ * (an IRQ is bound and a backing device is open), then start the worker
++ * thread.  Errors after the state switch are reported through
++ * xenbus_dev_error() and leave the interface without a worker thread.
++ */
++static void update_blkif_status(blkif_t *blkif)
++{ 
++      int err;
++      char name[TASK_COMM_LEN];
++
++      /* Not ready to connect? */
++      if (!blkif->irq || !blkif->vbd.bdev)
++              return;
++
++      /* Already connected? */
++      if (blkif->be->dev->state == XenbusStateConnected)
++              return;
++
++      /* Attempt to connect: exit if we fail to. */
++      connect(blkif->be);
++      if (blkif->be->dev->state != XenbusStateConnected)
++              return;
++
++      err = blkback_name(blkif, name);
++      if (err) {
++              xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
++              return;
++      }
++
++      /*
++       * Write back, then invalidate, the page cache of the backing
++       * device's inode mapping before the worker thread is started.
++       */
++      err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
++      if (err) {
++              xenbus_dev_error(blkif->be->dev, err, "block flush");
++              return;
++      }
++      invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
++
++      blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
++      if (IS_ERR(blkif->xenblkd)) {
++              err = PTR_ERR(blkif->xenblkd);
++              /* Keep xenblkd NULL so blkif_disconnect() skips kthread_stop(). */
++              blkif->xenblkd = NULL;
++              xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
++      }
++}
++
++
++/****************************************************************
++ *  sysfs interface for VBD I/O requests
++ */
++
++/*
++ * VBD_SHOW(name, format, args...) defines show_<name>(), a sysfs show
++ * routine, together with a read-only (S_IRUGO) DEVICE_ATTR called <name>.
++ * The routine holds a device reference for its duration, looks up the
++ * backend_info stored as driver data and, when one is set, sprintf()s
++ * args into buf using format; otherwise it returns -ENODEV.  The args
++ * expressions are evaluated inside the routine and may use its local
++ * variable "be".
++ */
++#define VBD_SHOW(name, format, args...)                                       \
++      static ssize_t show_##name(struct device *_dev,                 \
++                                 struct device_attribute *attr,       \
++                                 char *buf)                           \
++      {                                                               \
++              ssize_t ret = -ENODEV;                                  \
++              struct xenbus_device *dev;                              \
++              struct backend_info *be;                                \
++                                                                      \
++              if (!get_device(_dev))                                  \
++                      return ret;                                     \
++              dev = to_xenbus_device(_dev);                           \
++              if ((be = dev_get_drvdata(&dev->dev)) != NULL)          \
++                      ret = sprintf(buf, format, ##args);             \
++              put_device(_dev);                                       \
++              return ret;                                             \
++      }                                                               \
++      static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
++
++/* One read-only attribute per st_* counter kept in the blkif. */
++VBD_SHOW(oo_req,  "%d\n", be->blkif->st_oo_req);
++VBD_SHOW(rd_req,  "%d\n", be->blkif->st_rd_req);
++VBD_SHOW(wr_req,  "%d\n", be->blkif->st_wr_req);
++VBD_SHOW(br_req,  "%d\n", be->blkif->st_br_req);
++VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
++VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
++
++/* NULL-terminated list of the counter attributes above. */
++static struct attribute *vbdstat_attrs[] = {
++      &dev_attr_oo_req.attr,
++      &dev_attr_rd_req.attr,
++      &dev_attr_wr_req.attr,
++      &dev_attr_br_req.attr,
++      &dev_attr_rd_sect.attr,
++      &dev_attr_wr_sect.attr,
++      NULL
++};
++
++/* The counters are exposed together as a group named "statistics". */
++static struct attribute_group vbdstat_group = {
++      .name = "statistics",
++      .attrs = vbdstat_attrs,
++};
++
++/* Attributes created directly on the device by xenvbd_sysfs_addif(). */
++VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
++VBD_SHOW(mode, "%s\n", be->mode);
++
++/*
++ * Create the sysfs entries of a VBD backend device: the physical_device
++ * and mode attribute files plus the "statistics" attribute group.
++ *
++ * Returns 0 on success.  On failure the entries created so far -- and
++ * only those -- are removed again and the error code is returned.
++ */
++int xenvbd_sysfs_addif(struct xenbus_device *dev)
++{
++      int error;
++
++      error = device_create_file(&dev->dev, &dev_attr_physical_device);
++      if (error)
++              goto fail1;
++
++      error = device_create_file(&dev->dev, &dev_attr_mode);
++      if (error)
++              goto fail2;
++
++      error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
++      if (error)
++              goto fail3;
++
++      return 0;
++
++      /* Each label undoes the steps that succeeded before the failure. */
++fail3:
++      device_remove_file(&dev->dev, &dev_attr_mode);
++fail2:
++      device_remove_file(&dev->dev, &dev_attr_physical_device);
++fail1:
++      return error;
++}
++
++/*
++ * Remove the sysfs entries created by xenvbd_sysfs_addif(), in the
++ * reverse order of their creation.
++ */
++void xenvbd_sysfs_delif(struct xenbus_device *dev)
++{
++      sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
++      device_remove_file(&dev->dev, &dev_attr_mode);
++      device_remove_file(&dev->dev, &dev_attr_physical_device);
++}
++
++/*
++ * Undo everything blkback_probe() and backend_changed() set up for a
++ * device: sysfs entries, xenbus watches, the block interface and the
++ * backend_info itself, including the mode string it owns.  Also used by
++ * blkback_probe() to unwind a partially initialised device, so every
++ * step checks whether its resource exists.  Always returns 0.
++ */
++static int blkback_remove(struct xenbus_device *dev)
++{
++      struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++      DPRINTK("");
++
++      /* A non-zero major/minor marks that the sysfs entries were created. */
++      if (be->major || be->minor)
++              xenvbd_sysfs_delif(dev);
++
++      if (be->backend_watch.node) {
++              unregister_xenbus_watch(&be->backend_watch);
++              kfree(be->backend_watch.node);
++              be->backend_watch.node = NULL;
++      }
++
++      if (be->cdrom_watch.node) {
++              unregister_xenbus_watch(&be->cdrom_watch);
++              kfree(be->cdrom_watch.node);
++              be->cdrom_watch.node = NULL;
++      }
++
++      if (be->blkif) {
++              blkif_disconnect(be->blkif);
++              vbd_free(&be->blkif->vbd);
++              blkif_free(be->blkif);
++              be->blkif = NULL;
++      }
++
++      /* Unpublish the pointer before the memory behind it goes away. */
++      dev_set_drvdata(&dev->dev, NULL);
++      /* be->mode is NULL or a string returned by xenbus_read(). */
++      kfree(be->mode);
++      kfree(be);
++      return 0;
++}
++
++/*
++ * Write the "feature-barrier" node of the backend inside transaction
++ * xbt, with state as its decimal value.  A failed write is reported as
++ * fatal on the device and its error code is returned; 0 on success.
++ */
++int blkback_barrier(struct xenbus_transaction xbt,
++                  struct backend_info *be, int state)
++{
++      struct xenbus_device *xdev = be->dev;
++      int rc;
++
++      rc = xenbus_printf(xbt, xdev->nodename, "feature-barrier",
++                         "%d", state);
++      if (rc != 0)
++              xenbus_dev_fatal(xdev, rc, "writing feature-barrier");
++
++      return rc;
++}
++
++/**
++ * Entry point to this code when a new device is created.  Allocate the basic
++ * structures, and watch the store waiting for the hotplug scripts to tell us
++ * the device's physical major and minor numbers.  Switch to InitWait.
++ *
++ * Returns 0 on success or a negative error code; on failure everything
++ * set up so far is released again through blkback_remove().
++ */
++static int blkback_probe(struct xenbus_device *dev,
++                       const struct xenbus_device_id *id)
++{
++      int err;
++      struct backend_info *be = kzalloc(sizeof(struct backend_info),
++                                        GFP_KERNEL);
++      if (!be) {
++              xenbus_dev_fatal(dev, -ENOMEM,
++                               "allocating backend structure");
++              return -ENOMEM;
++      }
++      be->dev = dev;
++      /* From here on blkback_remove() can find and free be. */
++      dev_set_drvdata(&dev->dev, be);
++
++      be->blkif = blkif_alloc(dev->otherend_id);
++      if (IS_ERR(be->blkif)) {
++              err = PTR_ERR(be->blkif);
++              /* Reset to NULL so the unwind path skips the blkif teardown. */
++              be->blkif = NULL;
++              xenbus_dev_fatal(dev, err, "creating block interface");
++              goto fail;
++      }
++
++      /* setup back pointer */
++      be->blkif->be = be;
++
++      err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
++                               &be->backend_watch, backend_changed);
++      if (err)
++              goto fail;
++
++      err = xenbus_switch_state(dev, XenbusStateInitWait);
++      if (err)
++              goto fail;
++
++      return 0;
++
++fail:
++      DPRINTK("failed");
++      blkback_remove(dev);
++      return err;
++}
++
++
++/**
++ * Callback received when the hotplug scripts have placed the physical-device
++ * node.  Read it and the mode node, and create a vbd.  If the frontend is
++ * ready, connect.
++ *
++ * Once a device has been set up, later events only check that the
++ * physical device is unchanged; the mode string is not read again, so
++ * the copy owned by be->mode is neither replaced nor leaked.  When setup
++ * fails, be->mode is freed and reset so that a later event starts clean.
++ */
++static void backend_changed(struct xenbus_watch *watch,
++                          const char **vec, unsigned int len)
++{
++      int err;
++      unsigned major;
++      unsigned minor;
++      struct backend_info *be
++              = container_of(watch, struct backend_info, backend_watch);
++      struct xenbus_device *dev = be->dev;
++      int cdrom = 0;
++      char *device_type;
++      char *p;
++      long handle;
++
++      DPRINTK("");
++
++      err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
++                         &major, &minor);
++      if (XENBUS_EXIST_ERR(err)) {
++              /* Since this watch will fire once immediately after it is
++                 registered, we expect this.  Ignore it, and wait for the
++                 hotplug scripts. */
++              return;
++      }
++      if (err != 2) {
++              xenbus_dev_fatal(dev, err, "reading physical-device");
++              return;
++      }
++
++      if (be->major || be->minor) {
++              /* Already set up: the device cannot be swapped afterwards. */
++              if ((be->major != major) || (be->minor != minor))
++                      printk(KERN_WARNING
++                             "blkback: changing physical device (from %x:%x to "
++                             "%x:%x) not supported.\n", be->major, be->minor,
++                             major, minor);
++              return;
++      }
++
++      be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
++      if (IS_ERR(be->mode)) {
++              err = PTR_ERR(be->mode);
++              be->mode = NULL;
++              xenbus_dev_fatal(dev, err, "reading mode");
++              return;
++      }
++
++      device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
++      if (!IS_ERR(device_type)) {
++              cdrom = strcmp(device_type, "cdrom") == 0;
++              kfree(device_type);
++      }
++
++      /*
++       * Front end dir is a number, which is used as the handle.  Fall
++       * back to the whole path if it unexpectedly contains no '/'.
++       */
++      p = strrchr(dev->otherend, '/');
++      p = p ? p + 1 : dev->otherend;
++      handle = simple_strtoul(p, NULL, 0);
++
++      be->major = major;
++      be->minor = minor;
++
++      err = vbd_create(be->blkif, handle, major, minor,
++                       (NULL == strchr(be->mode, 'w')), cdrom);
++      if (err) {
++              be->major = be->minor = 0;
++              kfree(be->mode);
++              be->mode = NULL;
++              xenbus_dev_fatal(dev, err, "creating vbd structure");
++              return;
++      }
++
++      err = xenvbd_sysfs_addif(dev);
++      if (err) {
++              vbd_free(&be->blkif->vbd);
++              be->major = be->minor = 0;
++              kfree(be->mode);
++              be->mode = NULL;
++              xenbus_dev_fatal(dev, err, "creating sysfs entries");
++              return;
++      }
++
++      /* We're potentially connected now */
++      update_blkif_status(be->blkif);
++
++      /* Add watch for cdrom media status if necessary */
++      cdrom_add_media_watch(be);
++}
++
++
++/**
++ * Callback received when the frontend's state changes.
++ *
++ * Drives the backend's side of the handshake: reconnect preparation,
++ * ring connection, disconnect on Closing, and device removal once the
++ * frontend is Closed (and the device is no longer online) or Unknown.
++ */
++static void frontend_changed(struct xenbus_device *dev,
++                           enum xenbus_state frontend_state)
++{
++      struct backend_info *be = dev_get_drvdata(&dev->dev);
++      int err;
++
++      DPRINTK("%s", xenbus_strstate(frontend_state));
++
++      switch (frontend_state) {
++      case XenbusStateInitialising:
++              /* Only a backend that had reached Closed is moved back to InitWait. */
++              if (dev->state == XenbusStateClosed) {
++                      printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++                             __FUNCTION__, dev->nodename);
++                      xenbus_switch_state(dev, XenbusStateInitWait);
++              }
++              break;
++
++      case XenbusStateInitialised:
++      case XenbusStateConnected:
++              /* Ensure we connect even when two watches fire in
++                 close succession and we miss the intermediate value
++                 of frontend_state. */
++              if (dev->state == XenbusStateConnected)
++                      break;
++
++              /* Enforce precondition before potential leak point.
++               * blkif_disconnect() is idempotent.
++               */
++              blkif_disconnect(be->blkif);
++
++              err = connect_ring(be);
++              if (err)
++                      break;
++              update_blkif_status(be->blkif);
++              break;
++
++      case XenbusStateClosing:
++              blkif_disconnect(be->blkif);
++              xenbus_switch_state(dev, XenbusStateClosing);
++              break;
++
++      case XenbusStateClosed:
++              xenbus_switch_state(dev, XenbusStateClosed);
++              if (xenbus_dev_is_online(dev))
++                      break;
++              /* fall through if not online */
++      case XenbusStateUnknown:
++              /* implies blkif_disconnect() via blkback_remove() */
++              device_unregister(&dev->dev);
++              break;
++
++      default:
++              xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++                               frontend_state);
++              break;
++      }
++}
++
++
++/* ** Connection ** */
++
++
++/**
++ * Write the physical details regarding the block device to the store, and
++ * switch to Connected state.
++ *
++ * The nodes feature-barrier, sectors, info and sector-size are written in
++ * one transaction, which is retried while ending it returns -EAGAIN.  Any
++ * other failure is reported as fatal on the device and leaves the state
++ * untouched, so callers detect it by the device not being Connected.
++ */
++static void connect(struct backend_info *be)
++{
++      struct xenbus_transaction xbt;
++      int err;
++      struct xenbus_device *dev = be->dev;
++
++      DPRINTK("%s", dev->otherend);
++
++      /* Supply the information about the device the frontend needs */
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "starting transaction");
++              return;
++      }
++
++      /* blkback_barrier() reports its own failure as fatal. */
++      err = blkback_barrier(xbt, be, 1);
++      if (err)
++              goto abort;
++
++      err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
++                          vbd_size(&be->blkif->vbd));
++      if (err) {
++              xenbus_dev_fatal(dev, err, "writing %s/sectors",
++                               dev->nodename);
++              goto abort;
++      }
++
++      /* FIXME: use a typename instead */
++      err = xenbus_printf(xbt, dev->nodename, "info", "%u",
++                          vbd_info(&be->blkif->vbd));
++      if (err) {
++              xenbus_dev_fatal(dev, err, "writing %s/info",
++                               dev->nodename);
++              goto abort;
++      }
++      err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
++                          vbd_secsize(&be->blkif->vbd));
++      if (err) {
++              xenbus_dev_fatal(dev, err, "writing %s/sector-size",
++                               dev->nodename);
++              goto abort;
++      }
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err == -EAGAIN)
++              goto again;
++      if (err) {
++              /*
++               * The details were not committed, so the frontend must not
++               * be told that we are Connected.
++               */
++              xenbus_dev_fatal(dev, err, "ending transaction");
++              return;
++      }
++
++      err = xenbus_switch_state(dev, XenbusStateConnected);
++      if (err)
++              xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
++                               dev->nodename);
++
++      return;
++ abort:
++      xenbus_transaction_end(xbt, 1);
++}
++
++
++static int connect_ring(struct backend_info *be)
++{
++      struct xenbus_device *dev = be->dev;
++      unsigned long ring_ref;
++      unsigned int evtchn;
++      char protocol[64] = "";
++      int err;
++
++      DPRINTK("%s", dev->otherend);
++
++      err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
++                          "event-channel", "%u", &evtchn, NULL);
++      if (err) {
++              xenbus_dev_fatal(dev, err,
++                               "reading %s/ring-ref and event-channel",
++                               dev->otherend);
++              return err;
++      }
++
++      be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++      err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
++                          "%63s", protocol, NULL);
++      if (err) {
++              strcpy(protocol, "unspecified");
++              be->blkif->blk_protocol = xen_guest_blkif_protocol(be->blkif->domid);
++      }
++      else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
++              be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++      else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
++              be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
++      else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
++              be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
++#if 1 /* maintain compatibility with early sles10-sp1 and paravirt netware betas */
++      else if (0 == strcmp(protocol, "1"))
++              be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
++      else if (0 == strcmp(protocol, "2"))
++              be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
++#endif
++      else {
++              xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
++              return -1;
++      }
++      printk(KERN_INFO
++             "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
++             ring_ref, evtchn, be->blkif->blk_protocol, protocol);
++
++      /* Map the shared frame, irq etc. */
++      err = blkif_map(be->blkif, ring_ref, evtchn);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
++                               ring_ref, evtchn);
++              return err;
++      }
++
++      return 0;
++}
++
++
++/* ** Driver Registration ** */
++
++
++/* Device types handled by this driver; the empty entry ends the table. */
++static const struct xenbus_device_id blkback_ids[] = {
++      { "vbd" },
++      { "" }
++};
++
++
++/* Driver descriptor tying the "vbd" device type to the callbacks above. */
++static struct xenbus_driver blkback = {
++      .name = "vbd",
++      .ids = blkback_ids,
++      .probe = blkback_probe,
++      .remove = blkback_remove,
++      .otherend_changed = frontend_changed
++};
++
++
++/*
++ * Register the blkback driver as a xenbus backend.  A registration
++ * failure is treated as a fatal bug.
++ */
++void blkif_xenbus_init(void)
++{
++      if (xenbus_register_backend(&blkback))
++              BUG();
++}
diff --cc drivers/xen/blkfront/Makefile

index 0000000,0000000..1ca0bed

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkfront/Makefile
@@@ -1,0 -1,0 +1,5 @@@
++
++# Build xenblk.o according to the CONFIG_XEN_BLKDEV_FRONTEND setting.
++obj-$(CONFIG_XEN_BLKDEV_FRONTEND)     := xenblk.o
++
++# Objects that make up xenblk.
++xenblk-objs := blkfront.o vbd.o vcd.o
++
diff --cc drivers/xen/blkfront/blkfront.c

index 0000000,0000000..d2886ec

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkfront/blkfront.c
@@@ -1,0 -1,0 +1,1038 @@@
++/******************************************************************************
++ * blkfront.c
++ * 
++ * XenLinux virtual block-device driver.
++ * 
++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
++ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
++ * Copyright (c) 2004, Christian Limpach
++ * Copyright (c) 2004, Andrew Warfield
++ * Copyright (c) 2005, Christopher Clark
++ * Copyright (c) 2005, XenSource Ltd
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/version.h>
++#include "block.h"
++#include <linux/cdrom.h>
++#include <linux/sched.h>
++#include <linux/interrupt.h>
++#include <linux/scatterlist.h>
++#include <scsi/scsi.h>
++#include <xen/evtchn.h>
++#include <xen/xenbus.h>
++#include <xen/interface/grant_table.h>
++#include <xen/interface/io/protocols.h>
++#include <xen/gnttab.h>
++#include <asm/hypervisor.h>
++#include <asm/maddr.h>
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++/* Values stored in blkfront_info.connected. */
++#define BLKIF_STATE_DISCONNECTED 0
++#define BLKIF_STATE_CONNECTED    1
++#define BLKIF_STATE_SUSPENDED    2
++
++/* Segments per request multiplied by the number of ring slots. */
++#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
++    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
++/* Value of info->ring_ref while no ring grant is held. */
++#define GRANT_INVALID_REF     0
++
++/* Forward declarations for the file-local helpers defined further down. */
++static void connect(struct blkfront_info *);
++static void blkfront_closing(struct blkfront_info *);
++static int blkfront_remove(struct xenbus_device *);
++static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
++static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
++
++static void kick_pending_request_queues(struct blkfront_info *);
++
++/* Interrupt, deferred-work and recovery paths. */
++static irqreturn_t blkif_int(int irq, void *dev_id);
++static void blkif_restart_queue(struct work_struct *arg);
++static int blkif_recover(struct blkfront_info *);
++static void blkif_completion(struct blk_shadow *);
++static void blkif_free(struct blkfront_info *, int);
++
++
++/**
++ * Probe callback, run when a new "vbd" frontend device is created.  Reads
++ * the virtual device number from xenstore, allocates and seeds the
++ * per-device blkfront_info and then calls talk_to_backend() to set up the
++ * ring and publish its details, which moves the device to Initialised.
++ *
++ * Returns 0 on success, otherwise the error of the step that failed.
++ */
++static int blkfront_probe(struct xenbus_device *dev,
++                        const struct xenbus_device_id *id)
++{
++      struct blkfront_info *info;
++      int vdevice;
++      int slot;
++      int rc;
++
++      /* FIXME: Use dynamic device id if this is not set. */
++      rc = xenbus_scanf(XBT_NIL, dev->nodename,
++                        "virtual-device", "%i", &vdevice);
++      if (rc != 1)
++              /* go looking in the extended area instead */
++              rc = xenbus_scanf(XBT_NIL, dev->nodename,
++                                "virtual-device-ext", "%i", &vdevice);
++      if (rc != 1) {
++              xenbus_dev_fatal(dev, rc, "reading virtual-device");
++              return rc;
++      }
++
++      info = kzalloc(sizeof(*info), GFP_KERNEL);
++      if (info == NULL) {
++              xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
++              return -ENOMEM;
++      }
++
++      info->xbdev = dev;
++      info->vdevice = vdevice;
++      info->connected = BLKIF_STATE_DISCONNECTED;
++      INIT_WORK(&info->work, blkif_restart_queue);
++
++      /*
++       * Build the shadow free list: each entry's req.id names the next
++       * free entry, and the last one carries an out-of-range marker.
++       */
++      for (slot = 0; slot < BLK_RING_SIZE; slot++)
++              info->shadow[slot].req.id = slot + 1;
++      info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
++
++      /* Front end dir is a number, which is used as the id. */
++      info->handle = simple_strtoul(strrchr(dev->nodename, '/') + 1,
++                                    NULL, 0);
++      dev_set_drvdata(&dev->dev, info);
++
++      rc = talk_to_backend(dev, info);
++      if (rc == 0)
++              return 0;
++
++      /* Setup failed: drop the half-initialised state again. */
++      kfree(info);
++      dev_set_drvdata(&dev->dev, NULL);
++      return rc;
++}
++
++
++/**
++ * Resume callback: we are reconnecting to the backend after a
++ * suspend/resume or a backend driver restart.  The blkif channel is torn
++ * down and rebuilt while the device-layer structures stay in place, so the
++ * rest of the kernel does not notice.
++ *
++ * Returns 0 on success or the error from talk_to_backend()/blkif_recover().
++ */
++static int blkfront_resume(struct xenbus_device *dev)
++{
++      struct blkfront_info *info = dev_get_drvdata(&dev->dev);
++      int was_connected;
++      int rc;
++
++      DPRINTK("blkfront_resume: %s\n", dev->nodename);
++
++      /* A connected device is parked as SUSPENDED, anything else as DISCONNECTED. */
++      was_connected = (info->connected == BLKIF_STATE_CONNECTED);
++      blkif_free(info, was_connected);
++
++      rc = talk_to_backend(dev, info);
++      if (rc)
++              return rc;
++
++      /* Only a previously connected device has requests to replay. */
++      if (info->connected == BLKIF_STATE_SUSPENDED)
++              rc = blkif_recover(info);
++
++      return rc;
++}
++
++
++/*
++ * Common code used when first setting up, and when resuming.
++ *
++ * Sets up the shared ring and event channel, then writes "ring-ref",
++ * "event-channel" and "protocol" into the frontend's xenstore directory
++ * inside one transaction and switches the device to XenbusStateInitialised.
++ *
++ * Returns 0 on success.  On failure the ring is released through
++ * blkif_free() and the error code is returned.
++ */
++static int talk_to_backend(struct xenbus_device *dev,
++                         struct blkfront_info *info)
++{
++      const char *message = NULL;
++      struct xenbus_transaction xbt;
++      int err;
++
++      /* Create shared ring, alloc event channel. */
++      err = setup_blkring(dev, info);
++      if (err)
++              goto out;
++
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "starting transaction");
++              goto destroy_blkring;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename,
++                          "ring-ref","%u", info->ring_ref);
++      if (err) {
++              message = "writing ring-ref";
++              goto abort_transaction;
++      }
++      err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
++                          irq_to_evtchn_port(info->irq));
++      if (err) {
++              message = "writing event-channel";
++              goto abort_transaction;
++      }
++      err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
++                          XEN_IO_PROTO_ABI_NATIVE);
++      if (err) {
++              message = "writing protocol";
++              goto abort_transaction;
++      }
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err) {
++              /* -EAGAIN restarts the whole transaction from 'again'. */
++              if (err == -EAGAIN)
++                      goto again;
++              xenbus_dev_fatal(dev, err, "completing transaction");
++              goto destroy_blkring;
++      }
++
++      xenbus_switch_state(dev, XenbusStateInitialised);
++
++      return 0;
++
++ abort_transaction:
++      /* Second argument 1: end the transaction as an abort. */
++      xenbus_transaction_end(xbt, 1);
++      if (message)
++              xenbus_dev_fatal(dev, err, "%s", message);
++ destroy_blkring:
++      blkif_free(info, 0);
++ out:
++      return err;
++}
++
++
++/*
++ * Allocate and initialise the shared request ring, grant the backend
++ * access to it and bind an event channel whose handler is blkif_int().
++ *
++ * On success info->ring, info->ring_ref and info->irq are set and 0 is
++ * returned.  On failure what was acquired so far is released and a
++ * negative errno is returned (never 0).
++ */
++static int setup_blkring(struct xenbus_device *dev,
++                       struct blkfront_info *info)
++{
++      blkif_sring_t *sring;
++      int err;
++
++      info->ring_ref = GRANT_INVALID_REF;
++
++      sring = (blkif_sring_t *)__get_free_page(GFP_NOIO | __GFP_HIGH);
++      if (!sring) {
++              xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
++              return -ENOMEM;
++      }
++      SHARED_RING_INIT(sring);
++      FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
++
++      sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++      err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
++      if (err < 0) {
++              /*
++               * blkif_free() skips the ring while ring_ref is still
++               * GRANT_INVALID_REF, so the page is released here.
++               */
++              free_page((unsigned long)sring);
++              info->ring.sring = NULL;
++              goto fail;
++      }
++      info->ring_ref = err;
++
++      err = bind_listening_port_to_irqhandler(
++              dev->otherend_id, blkif_int, IRQF_SAMPLE_RANDOM, "blkif", info);
++      if (err <= 0) {
++              /*
++               * A zero result is handled as a failure, so it must not be
++               * passed on unchanged: callers read 0 as success.
++               */
++              if (err == 0)
++                      err = -EINVAL;
++              xenbus_dev_fatal(dev, err,
++                               "bind_listening_port_to_irqhandler");
++              goto fail;
++      }
++      info->irq = err;
++
++      return 0;
++fail:
++      blkif_free(info, 0);
++      return err;
++}
++
++
++/**
++ * Callback received when the backend's state changes.
++ *
++ * Only two states are acted upon: Connected triggers connect(), and
++ * Closing tears the device down unless it is still open.  All other
++ * states are ignored.
++ */
++static void backend_changed(struct xenbus_device *dev,
++                          enum xenbus_state backend_state)
++{
++      struct blkfront_info *info = dev_get_drvdata(&dev->dev);
++      struct block_device *bd;
++
++      DPRINTK("blkfront:backend_changed.\n");
++
++      switch (backend_state) {
++      case XenbusStateInitialising:
++      case XenbusStateInitWait:
++      case XenbusStateInitialised:
++      case XenbusStateReconfiguring:
++      case XenbusStateReconfigured:
++      case XenbusStateUnknown:
++      case XenbusStateClosed:
++              break;
++
++      case XenbusStateConnected:
++              connect(info);
++              break;
++
++      case XenbusStateClosing:
++              /* No gendisk was ever created: nothing to tear down. */
++              if (!info->gd) {
++                      xenbus_frontend_closed(dev);
++                      break;
++              }
++              bd = bdget_disk(info->gd, 0);
++              if (bd == NULL) {
++                      xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
++                      break;
++              }
++
++              /* The user count is checked under the block device lock. */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
++              down(&bd->bd_sem);
++#else
++              mutex_lock(&bd->bd_mutex);
++#endif
++              if (info->users > 0)
++                      xenbus_dev_error(dev, -EBUSY,
++                                       "Device in use; refusing to close");
++              else
++                      blkfront_closing(info);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
++              up(&bd->bd_sem);
++#else
++              mutex_unlock(&bd->bd_mutex);
++#endif
++              bdput(bd);
++              break;
++      }
++}
++
++
++/* ** Connection ** */
++
++
++/*
++ * Invoked when the backend is finally 'ready' (and has told produced
++ * the details about the physical device - #sectors, size, etc).
++ */
++static void connect(struct blkfront_info *info)
++{
++      unsigned long long sectors;
++      unsigned long sector_size;
++      unsigned int binfo;
++      int err, barrier;
++
++      switch (info->connected) {
++      case BLKIF_STATE_CONNECTED:
++              /*
++               * Potentially, the back-end may be signalling
++               * a capacity change; update the capacity.
++               */
++              err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
++                                 "sectors", "%Lu", &sectors);
++              if (XENBUS_EXIST_ERR(err))
++                      return;
++              pr_info("Setting capacity to %Lu\n", sectors);
++              set_capacity(info->gd, sectors);
++              revalidate_disk(info->gd);
++
++              /* fall through */
++      case BLKIF_STATE_SUSPENDED:
++              return;
++      }
++
++      DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
++
++      err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
++                          "sectors", "%Lu", &sectors,
++                          "info", "%u", &binfo,
++                          "sector-size", "%lu", &sector_size,
++                          NULL);
++      if (err) {
++              xenbus_dev_fatal(info->xbdev, err,
++                               "reading backend fields at %s",
++                               info->xbdev->otherend);
++              return;
++      }
++
++      err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
++                          "feature-barrier", "%d", &barrier,
++                          NULL);
++      /*
++       * If there's no "feature-barrier" defined, then it means
++       * we're dealing with a very old backend which writes
++       * synchronously; nothing to do.
++       *
++       * If there are barriers, then we use flush.
++       */
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
++      if (!err && barrier)
++              info->feature_flush = REQ_FLUSH | REQ_FUA;
++      else
++              info->feature_flush = 0;
++#else
++      if (err)
++              info->feature_flush = QUEUE_ORDERED_DRAIN;
++      else if (barrier)
++              info->feature_flush = QUEUE_ORDERED_TAG;
++      else
++              info->feature_flush = QUEUE_ORDERED_NONE;
++#endif
++
++      err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
++      if (err) {
++              xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
++                               info->xbdev->otherend);
++              return;
++      }
++
++      err = xlvbd_sysfs_addif(info);
++      if (err) {
++              xenbus_dev_fatal(info->xbdev, err, "xlvbd_sysfs_addif at %s",
++                               info->xbdev->otherend);
++              return;
++      }
++
++      (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
++
++      /* Kick pending requests. */
++      spin_lock_irq(&blkif_io_lock);
++      info->connected = BLKIF_STATE_CONNECTED;
++      kick_pending_request_queues(info);
++      spin_unlock_irq(&blkif_io_lock);
++
++      add_disk(info->gd);
++
++      info->is_ready = 1;
++
++      register_vcd(info);
++}
++
++/**
++ * Handle the change of state of the backend to Closing.  We must delete our
++ * device-layer structures now, to ensure that writes are flushed through to
++ * the backend.  Once this is done, we can switch to Closed in
++ * acknowledgement.
++ *
++ * If no request queue exists yet, only the state switch is performed.  The
++ * switch itself is skipped when info->xbdev is NULL, which
++ * blkfront_remove() sets while the disk is still open.
++ */
++static void blkfront_closing(struct blkfront_info *info)
++{
++      unsigned long flags;
++
++      DPRINTK("blkfront_closing: %d removed\n", info->vdevice);
++
++      if (info->rq == NULL)
++              goto out;
++
++      spin_lock_irqsave(&blkif_io_lock, flags);
++      /* No more blkif_request(). */
++      blk_stop_queue(info->rq);
++      /* No more gnttab callback work. */
++      gnttab_cancel_free_callback(&info->callback);
++      spin_unlock_irqrestore(&blkif_io_lock, flags);
++
++      /* Flush gnttab callback work. Must be done with no locks held. */
++      flush_work_sync(&info->work);
++
++      xlvbd_sysfs_delif(info);
++
++      unregister_vcd(info);
++
++      xlvbd_del(info);
++
++ out:
++      if (info->xbdev)
++              xenbus_frontend_closed(info->xbdev);
++}
++
++
++/*
++ * Remove callback: release the channel to the backend.  The info structure
++ * is freed right away when nobody has the disk open; otherwise it is only
++ * detached from the xenbus device and blkif_release() frees it on the
++ * last close.  Always returns 0.
++ */
++static int blkfront_remove(struct xenbus_device *dev)
++{
++      struct blkfront_info *info = dev_get_drvdata(&dev->dev);
++
++      DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
++
++      blkif_free(info, 0);
++
++      if (info->users != 0) {
++              /* Still open: mark the xenbus device as gone. */
++              info->xbdev = NULL;
++              return 0;
++      }
++
++      kfree(info);
++      return 0;
++}
++
++
++/*
++ * Pop the head of the shadow free list and return its index.  A free
++ * entry's req.id holds the index of the next free entry.
++ */
++static inline int GET_ID_FROM_FREELIST(
++      struct blkfront_info *info)
++{
++      unsigned long head = info->shadow_free;
++
++      BUG_ON(head >= BLK_RING_SIZE);
++      info->shadow_free = info->shadow[head].req.id;
++      info->shadow[head].req.id = 0x0fffffee; /* debug */
++      return head;
++}
++
++/*
++ * Push shadow entry 'id' back onto the free list: it drops its request
++ * pointer and becomes the new list head, linking to the previous head.
++ */
++static inline void ADD_ID_TO_FREELIST(
++      struct blkfront_info *info, unsigned long id)
++{
++      struct blk_shadow *entry = &info->shadow[id];
++
++      entry->request = NULL;
++      entry->req.id = info->shadow_free;
++      info->shadow_free = id;
++}
++
++/*
++ * Publish the privately queued requests on the shared ring and signal the
++ * backend through the event channel when the ring macro asks for it.
++ */
++static inline void flush_requests(struct blkfront_info *info)
++{
++      int need_notify;
++
++      RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, need_notify);
++      if (!need_notify)
++              return;
++
++      notify_remote_via_irq(info->irq);
++}
++
++/*
++ * Restart the request queue and process it right away, unless the ring is
++ * full.  The callers in this file hold blkif_io_lock.
++ */
++static void kick_pending_request_queues(struct blkfront_info *info)
++{
++      if (RING_FULL(&info->ring))
++              return;
++
++      /* Re-enable calldowns. */
++      blk_start_queue(info->rq);
++      /* Kick things off immediately. */
++      do_blkif_request(info->rq);
++}
++
++/*
++ * Work handler scheduled by blkif_restart_queue_callback(): restart the
++ * queue under blkif_io_lock, provided the device is still connected.
++ */
++static void blkif_restart_queue(struct work_struct *arg)
++{
++      struct blkfront_info *info;
++
++      info = container_of(arg, struct blkfront_info, work);
++
++      spin_lock_irq(&blkif_io_lock);
++      if (info->connected == BLKIF_STATE_CONNECTED)
++              kick_pending_request_queues(info);
++      spin_unlock_irq(&blkif_io_lock);
++}
++
++/*
++ * Grant-table free callback registered by blkif_queue_request(); 'arg' is
++ * the blkfront_info.  The queue restart is deferred to info->work.
++ */
++static void blkif_restart_queue_callback(void *arg)
++{
++      struct blkfront_info *info = arg;
++
++      schedule_work(&info->work);
++}
++
++/*
++ * Block device open hook (the signature depends on the kernel version).
++ * Fails with -ENODEV once the xenbus device is gone (info->xbdev == NULL),
++ * otherwise counts the new user and returns 0.
++ */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++int blkif_open(struct inode *inode, struct file *filep)
++{
++      struct block_device *bd = inode->i_bdev;
++#else
++int blkif_open(struct block_device *bd, fmode_t mode)
++{
++#endif
++      struct blkfront_info *info = bd->bd_disk->private_data;
++
++      if (!info->xbdev)
++              return -ENODEV;
++      info->users++;
++      return 0;
++}
++
++
++/*
++ * Block device release hook (the signature depends on the kernel version).
++ * Drops one user; on the last close it completes a teardown that had to be
++ * postponed while the disk was open.  Always returns 0.
++ */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++int blkif_release(struct inode *inode, struct file *filep)
++{
++      struct gendisk *disk = inode->i_bdev->bd_disk;
++#else
++int blkif_release(struct gendisk *disk, fmode_t mode)
++{
++#endif
++      struct blkfront_info *info = disk->private_data;
++
++      info->users--;
++      if (info->users == 0) {
++              /* Check whether we have been instructed to close.  We will
++                 have ignored this request initially, as the device was
++                 still mounted. */
++              struct xenbus_device * dev = info->xbdev;
++
++              if (!dev) {
++                      /*
++                       * blkfront_remove() already ran and left info for
++                       * us to free.
++                       */
++                      blkfront_closing(info);
++                      kfree(info);
++              } else if (xenbus_read_driver_state(dev->otherend)
++                         == XenbusStateClosing && info->is_ready)
++                      blkfront_closing(info);
++      }
++      return 0;
++}
++
++
++/*
++ * Block device ioctl hook (the signature depends on the kernel version).
++ *
++ * Handles HDIO_GETGEO on kernels before 2.6.16, reports an all-zero
++ * CDROMMULTISESSION result, answers CDROM_GET_CAPABILITY according to
++ * GENHD_FL_CD, and forwards everything else to scsi_cmd_ioctl() when the
++ * disk uses a SCSI disk or SCSI CD-ROM major.  Unhandled commands get
++ * -EINVAL.
++ */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++int blkif_ioctl(struct inode *inode, struct file *filep,
++              unsigned command, unsigned long argument)
++{
++      struct block_device *bd = inode->i_bdev;
++#else
++int blkif_ioctl(struct block_device *bd, fmode_t mode,
++              unsigned command, unsigned long argument)
++{
++#endif
++      struct blkfront_info *info = bd->bd_disk->private_data;
++      int i;
++
++      /*
++       * NOTE(review): 'inode' exists only in the pre-2.6.28 signature;
++       * presumably DPRINTK_IOCTL expands to nothing by default - confirm.
++       */
++      DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
++                    command, (long)argument, inode->i_rdev);
++
++      switch (command) {
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
++      case HDIO_GETGEO: {
++              struct hd_geometry geo;
++              int ret;
++
++                if (!argument)
++                        return -EINVAL;
++
++              geo.start = get_start_sect(bd);
++              ret = blkif_getgeo(bd, &geo);
++              if (ret)
++                      return ret;
++
++              if (copy_to_user((struct hd_geometry __user *)argument, &geo,
++                               sizeof(geo)))
++                        return -EFAULT;
++
++                return 0;
++      }
++#endif
++      case CDROMMULTISESSION:
++              DPRINTK("FIXME: support multisession CDs later\n");
++              /* Zero the caller's struct cdrom_multisession byte by byte. */
++              for (i = 0; i < sizeof(struct cdrom_multisession); i++)
++                      if (put_user(0, (char __user *)(argument + i)))
++                              return -EFAULT;
++              return 0;
++
++      case CDROM_GET_CAPABILITY: {
++              struct gendisk *gd = info->gd;
++              if (gd->flags & GENHD_FL_CD)
++                      return 0;
++              return -EINVAL;
++      }
++      default:
++              if (info->mi && info->gd && info->rq) {
++                      switch (info->mi->major) {
++                      case SCSI_DISK0_MAJOR:
++                      case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
++                      case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR:
++                      case SCSI_CDROM_MAJOR:
++                              /* scsi_cmd_ioctl() changed signature twice. */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
++                              return scsi_cmd_ioctl(filep, info->gd, command,
++                                                    (void __user *)argument);
++#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++                              return scsi_cmd_ioctl(filep, info->rq,
++                                                    info->gd, command,
++                                                    (void __user *)argument);
++#else
++                              return scsi_cmd_ioctl(info->rq, info->gd,
++                                                    mode, command,
++                                                    (void __user *)argument);
++#endif
++                      }
++              }
++
++              return -EINVAL; /* same return as native Linux */
++      }
++
++      return 0;
++}
++
++
++/*
++ * Fill in a made-up disk geometry for 'bd'.  No real geometry is known, so
++ * 255 heads and 63 sectors are reported and the cylinder count is derived
++ * from the capacity; if the resulting geometry cannot cover the whole
++ * disk, cylinders is pinned to 0xffff.  Always returns 0.
++ */
++int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
++{
++      const sector_t total = get_capacity(bd->bd_disk);
++      sector_t cyls = total;
++
++      hg->heads = 0xff;
++      hg->sectors = 0x3f;
++
++      /* sector_div() leaves the quotient in its first argument. */
++      sector_div(cyls, hg->heads * hg->sectors);
++      hg->cylinders = cyls;
++
++      if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < total)
++              hg->cylinders = 0xffff;
++
++      return 0;
++}
++
++
++/*
++ * Generate a Xen blkfront IO request from a blk layer request.  Reads
++ * and writes are handled as expected.  Since we lack a loose flush
++ * request, we map flushes into a full ordered barrier.
++ *
++ * @req: a request struct
++ *
++ * Returns 0 once the request has been placed on the private part of the
++ * ring, or 1 if it cannot be queued now (device not connected, or no
++ * grant references available).  The caller pushes the ring afterwards.
++ */
++static int blkif_queue_request(struct request *req)
++{
++      struct blkfront_info *info = req->rq_disk->private_data;
++      unsigned long buffer_mfn;
++      blkif_request_t *ring_req;
++      unsigned long id;
++      unsigned int fsect, lsect;
++      int i, ref;
++      grant_ref_t gref_head;
++      struct scatterlist *sg;
++
++      if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
++              return 1;
++
++      /*
++       * Reserve enough grant references for a maximally sized request.  If
++       * none are available, register a callback that restarts the queue
++       * once references have been freed.
++       */
++      if (gnttab_alloc_grant_references(
++              BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
++              gnttab_request_free_callback(
++                      &info->callback,
++                      blkif_restart_queue_callback,
++                      info,
++                      BLKIF_MAX_SEGMENTS_PER_REQUEST);
++              return 1;
++      }
++
++      /* Fill out a communications ring structure. */
++      ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
++      id = GET_ID_FROM_FREELIST(info);
++      info->shadow[id].request = req;
++
++      ring_req->id = id;
++      ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
++      ring_req->handle = info->handle;
++
++      /* Later assignments win: barrier overrides r/w, packet overrides both. */
++      ring_req->operation = rq_data_dir(req) ?
++              BLKIF_OP_WRITE : BLKIF_OP_READ;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
++      if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
++#else
++      if (req->cmd_flags & REQ_HARDBARRIER)
++#endif
++              ring_req->operation = BLKIF_OP_WRITE_BARRIER;
++      if (req->cmd_type == REQ_TYPE_BLOCK_PC)
++              ring_req->operation = BLKIF_OP_PACKET;
++
++      ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
++      BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
++      for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
++                      buffer_mfn = page_to_phys(sg_page(sg)) >> PAGE_SHIFT;
++                      /* First and last 512-byte sector used within the page. */
++                      fsect = sg->offset >> 9;
++                      lsect = fsect + (sg->length >> 9) - 1;
++                      /* install a grant reference. */
++                      ref = gnttab_claim_grant_reference(&gref_head);
++                      BUG_ON(ref == -ENOSPC);
++
++                      /* Writes only need the backend to read the page. */
++                      gnttab_grant_foreign_access_ref(
++                              ref,
++                              info->xbdev->otherend_id,
++                              buffer_mfn,
++                              rq_data_dir(req) ? GTF_readonly : 0 );
++
++                      /* Remembered so blkif_recover() can re-grant the page. */
++                      info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
++                      ring_req->seg[i] =
++                              (struct blkif_request_segment) {
++                                      .gref       = ref,
++                                      .first_sect = fsect,
++                                      .last_sect  = lsect };
++      }
++
++      info->ring.req_prod_pvt++;
++
++      /* Keep a private copy so we can reissue requests when recovering. */
++      info->shadow[id].req = *ring_req;
++
++      /* Hand back the references that were reserved but not claimed. */
++      gnttab_free_grant_references(gref_head);
++
++      return 0;
++}
++
++/*
++ * do_blkif_request
++ *  read a block; request is in a request queue
++ *
++ * Drains 'rq' into the shared ring until the queue is empty, the ring is
++ * full or a request cannot be queued.  Requests that are neither FS nor
++ * BLOCK_PC are failed with -EIO.  The ring is pushed to the backend once
++ * at the end if anything was queued.
++ */
++void do_blkif_request(struct request_queue *rq)
++{
++      struct blkfront_info *info = NULL;
++      struct request *req;
++      int queued;
++
++      DPRINTK("Entered do_blkif_request\n");
++
++      queued = 0;
++
++      while ((req = blk_peek_request(rq)) != NULL) {
++              info = req->rq_disk->private_data;
++
++              /* Ring full: leave the request on the queue and stop it. */
++              if (RING_FULL(&info->ring))
++                      goto wait;
++
++              blk_start_request(req);
++
++              if (req->cmd_type != REQ_TYPE_FS
++                  && req->cmd_type != REQ_TYPE_BLOCK_PC) {
++                      __blk_end_request_all(req, -EIO);
++                      continue;
++              }
++
++              DPRINTK("do_blk_req %p: cmd %p, sec %llx, "
++                      "(%u/%u) buffer:%p [%s]\n",
++                      req, req->cmd, (long long)blk_rq_pos(req),
++                      blk_rq_cur_sectors(req), blk_rq_sectors(req),
++                      req->buffer, rq_data_dir(req) ? "write" : "read");
++
++              if (blkif_queue_request(req)) {
++                      /* Already started, so it has to be put back first. */
++                      blk_requeue_request(rq, req);
++              wait:
++                      /* Avoid pointless unplugs. */
++                      blk_stop_queue(rq);
++                      break;
++              }
++
++              queued++;
++      }
++
++      /* queued != 0 implies the loop body ran, so info is set. */
++      if (queued != 0)
++              flush_requests(info);
++}
++
++
++/*
++ * Event-channel interrupt handler; 'dev_id' is the blkfront_info.
++ *
++ * Under blkif_io_lock it consumes every response on the shared ring,
++ * ends the grants and recycles the shadow slot of each request, completes
++ * the request towards the block layer and finally restarts the request
++ * queue.  Interrupts for a device that is not connected are ignored.
++ * Always returns IRQ_HANDLED.
++ */
++static irqreturn_t blkif_int(int irq, void *dev_id)
++{
++      struct request *req;
++      blkif_response_t *bret;
++      RING_IDX i, rp;
++      unsigned long flags;
++      struct blkfront_info *info = (struct blkfront_info *)dev_id;
++
++      spin_lock_irqsave(&blkif_io_lock, flags);
++
++      if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
++              spin_unlock_irqrestore(&blkif_io_lock, flags);
++              return IRQ_HANDLED;
++      }
++
++ again:
++      rp = info->ring.sring->rsp_prod;
++      rmb(); /* Ensure we see queued responses up to 'rp'. */
++
++      for (i = info->ring.rsp_cons; i != rp; i++) {
++              unsigned long id;
++              int ret;
++
++              bret = RING_GET_RESPONSE(&info->ring, i);
++              /* NOTE(review): id comes from the backend and is not range-checked. */
++              id   = bret->id;
++              req  = info->shadow[id].request;
++
++              blkif_completion(&info->shadow[id]);
++
++              ADD_ID_TO_FREELIST(info, id);
++
++              ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
++              switch (bret->operation) {
++              case BLKIF_OP_WRITE_BARRIER:
++                      if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
++                              pr_warning("blkfront: %s:"
++                                         " write barrier op failed\n",
++                                         info->gd->disk_name);
++                              ret = -EOPNOTSUPP;
++                      }
++                      /* A failed barrier without data is handled the same way. */
++                      if (unlikely(bret->status == BLKIF_RSP_ERROR &&
++                                   info->shadow[id].req.nr_segments == 0)) {
++                              pr_warning("blkfront: %s:"
++                                         " empty write barrier op failed\n",
++                                         info->gd->disk_name);
++                              ret = -EOPNOTSUPP;
++                      }
++                      /*
++                       * Any barrier failure clears feature_flush; an
++                       * unsupported barrier is then reported as success.
++                       */
++                      if (unlikely(ret)) {
++                              if (ret == -EOPNOTSUPP)
++                                      ret = 0;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
++                              info->feature_flush = 0;
++#else
++                              info->feature_flush = QUEUE_ORDERED_NONE;
++#endif
++                              xlvbd_flush(info);
++                      }
++                      /* fall through */
++              case BLKIF_OP_READ:
++              case BLKIF_OP_WRITE:
++              case BLKIF_OP_PACKET:
++                      if (unlikely(bret->status != BLKIF_RSP_OKAY))
++                              DPRINTK("Bad return from blkdev data "
++                                      "request: %x\n", bret->status);
++
++                      __blk_end_request_all(req, ret);
++                      break;
++              default:
++                      BUG();
++              }
++      }
++
++      info->ring.rsp_cons = i;
++
++      /*
++       * Requests still outstanding: re-check for responses that arrived
++       * meanwhile.  Otherwise ask to be notified for the next response.
++       */
++      if (i != info->ring.req_prod_pvt) {
++              int more_to_do;
++              RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
++              if (more_to_do)
++                      goto again;
++      } else
++              info->ring.sring->rsp_event = i + 1;
++
++      kick_pending_request_queues(info);
++
++      spin_unlock_irqrestore(&blkif_io_lock, flags);
++
++      return IRQ_HANDLED;
++}
++
++/*
++ * Tear down the channel to the backend: stop the request queue, cancel and
++ * flush the grant callback work, end the grant on the shared ring and
++ * unbind the interrupt.  'suspend' selects the resulting state:
++ * BLKIF_STATE_SUSPENDED if non-zero, BLKIF_STATE_DISCONNECTED otherwise.
++ */
++static void blkif_free(struct blkfront_info *info, int suspend)
++{
++      /* Prevent new requests being issued until we fix things up. */
++      spin_lock_irq(&blkif_io_lock);
++      info->connected = suspend ?
++              BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
++      /* No more blkif_request(). */
++      if (info->rq)
++              blk_stop_queue(info->rq);
++      /* No more gnttab callback work. */
++      gnttab_cancel_free_callback(&info->callback);
++      spin_unlock_irq(&blkif_io_lock);
++
++      /* Flush gnttab callback work. Must be done with no locks held. */
++      flush_work_sync(&info->work);
++
++      /* Free resources associated with old device channel. */
++      if (info->ring_ref != GRANT_INVALID_REF) {
++              /* The ring page is handed over along with the grant. */
++              gnttab_end_foreign_access(info->ring_ref, 
++                                        (unsigned long)info->ring.sring);
++              info->ring_ref = GRANT_INVALID_REF;
++              info->ring.sring = NULL;
++      }
++      /* irq == 0 means no handler is bound. */
++      if (info->irq)
++              unbind_from_irqhandler(info->irq, info);
++      info->irq = 0;
++}
++
++/*
++ * End the foreign-access grant of every segment recorded in the shadow
++ * copy of a finished request.  No page is handed over (second arg 0).
++ */
++static void blkif_completion(struct blk_shadow *s)
++{
++      int seg = 0;
++
++      while (seg < s->req.nr_segments) {
++              gnttab_end_foreign_access(s->req.seg[seg].gref, 0UL);
++              seg++;
++      }
++}
++
++/*
++ * Re-issue the requests that were in flight before the channel was torn
++ * down.  The shadow table is snapshotted and rebuilt, every pending
++ * request is written onto the new ring with a fresh id and its grants are
++ * re-established; the device then goes back to CONNECTED and the ring is
++ * pushed.
++ *
++ * Returns 0 on success, -ENOMEM if the snapshot cannot be allocated.
++ */
++static int blkif_recover(struct blkfront_info *info)
++{
++      int i;
++      blkif_request_t *req;
++      struct blk_shadow *copy;
++      int j;
++
++      /* Stage 1: Make a safe copy of the shadow state. */
++      copy = kmemdup(info->shadow, sizeof(info->shadow),
++                     GFP_NOIO | __GFP_NOFAIL | __GFP_HIGH);
++      if (!copy)
++              return -ENOMEM;
++
++      /* Stage 2: Set up free list. */
++      memset(&info->shadow, 0, sizeof(info->shadow));
++      for (i = 0; i < BLK_RING_SIZE; i++)
++              info->shadow[i].req.id = i+1;
++      /* NOTE(review): presumably req_prod_pvt is 0 on the fresh ring - confirm. */
++      info->shadow_free = info->ring.req_prod_pvt;
++      info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
++
++      /* Stage 3: Find pending requests and requeue them. */
++      for (i = 0; i < BLK_RING_SIZE; i++) {
++              /* Not in use? */
++              if (!copy[i].request)
++                      continue;
++
++              /* Grab a request slot and copy shadow state into it. */
++              req = RING_GET_REQUEST(
++                      &info->ring, info->ring.req_prod_pvt);
++              *req = copy[i].req;
++
++              /* We get a new request id, and must reset the shadow state. */
++              req->id = GET_ID_FROM_FREELIST(info);
++              memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
++
++              /* Rewrite any grant references invalidated by susp/resume. */
++              for (j = 0; j < req->nr_segments; j++)
++                      gnttab_grant_foreign_access_ref(
++                              req->seg[j].gref,
++                              info->xbdev->otherend_id,
++                              pfn_to_mfn(info->shadow[req->id].frame[j]),
++                              rq_data_dir(info->shadow[req->id].request) ?
++                              GTF_readonly : 0);
++              /* The memcpy above carried the old id; store the new one. */
++              info->shadow[req->id].req = *req;
++
++              info->ring.req_prod_pvt++;
++      }
++
++      kfree(copy);
++
++      (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
++
++      spin_lock_irq(&blkif_io_lock);
++
++      /* Now safe for us to use the shared ring */
++      info->connected = BLKIF_STATE_CONNECTED;
++
++      /* Send off requeued requests */
++      flush_requests(info);
++
++      /* Kick any other new requests queued since we resumed */
++      kick_pending_request_queues(info);
++
++      spin_unlock_irq(&blkif_io_lock);
++
++      return 0;
++}
++
++/*
++ * is_ready callback: returns 1 once connect() has finished setting up the
++ * disk and the xenbus device is still attached, otherwise 0.
++ */
++int blkfront_is_ready(struct xenbus_device *dev)
++{
++      struct blkfront_info *info = dev_get_drvdata(&dev->dev);
++
++      if (!info->is_ready)
++              return 0;
++
++      return info->xbdev != NULL;
++}
++
++
++/* ** Driver Registration ** */
++
++
++/*
++ * Device types handled by the block frontend driver.  The trailing empty
++ * entry presumably acts as the list terminator - confirm against xenbus.
++ */
++static const struct xenbus_device_id blkfront_ids[] = {
++      { "vbd" },
++      { "" }
++};
++MODULE_ALIAS("xen:vbd");
++
++/*
++ * Frontend driver descriptor: wires the "vbd" id table to the callbacks
++ * defined in this file.  Registered by xlblk_init().
++ */
++static struct xenbus_driver blkfront = {
++      .name = "vbd",
++      .ids = blkfront_ids,
++      .probe = blkfront_probe,
++      .remove = blkfront_remove,
++      .resume = blkfront_resume,
++      .otherend_changed = backend_changed,
++      .is_ready = blkfront_is_ready,
++};
++
++
++/*
++ * Module entry point: register the frontend driver with xenbus.  Returns
++ * -ENODEV when not running on Xen, otherwise the registration result.
++ */
++static int __init xlblk_init(void)
++{
++      if (is_running_on_xen())
++              return xenbus_register_frontend(&blkfront);
++
++      return -ENODEV;
++}
++module_init(xlblk_init);
++
++
++/*
++ * Module exit point: unregister the frontend driver from xenbus.  The
++ * function is void, so the call is a plain statement and not a
++ * 'return <expr>;'.
++ */
++static void __exit xlblk_exit(void)
++{
++      xenbus_unregister_driver(&blkfront);
++}
++module_exit(xlblk_exit);
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/blkfront/block.h

index 0000000,0000000..f32faae

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkfront/block.h
@@@ -1,0 -1,0 +1,170 @@@
++/******************************************************************************
++ * block.h
++ * 
++ * Shared definitions between all levels of XenLinux Virtual block devices.
++ * 
++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
++ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
++ * Copyright (c) 2004-2005, Christian Limpach
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_DRIVERS_BLOCK_H__
++#define __XEN_DRIVERS_BLOCK_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/string.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/hdreg.h>
++#include <linux/blkdev.h>
++#include <linux/major.h>
++#include <asm/hypervisor.h>
++#include <xen/xenbus.h>
++#include <xen/gnttab.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/io/blkif.h>
++#include <xen/interface/io/ring.h>
++#include <asm/io.h>
++#include <asm/atomic.h>
++#include <asm/uaccess.h>
++
++/* Debug trace; expands to pr_debug(). */
++#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
++
++/*
++ * ioctl tracing.  Compiled out by default; change "#if 0" to "#if 1" to
++ * route it to pr_alert().
++ */
++#if 0
++#define DPRINTK_IOCTL(_f, _a...) pr_alert(_f, ## _a)
++#else
++#define DPRINTK_IOCTL(_f, _a...) ((void)0)
++#endif
++
++/* Naming and minor-number layout shared by all majors of one device type. */
++struct xlbd_type_info
++{
++      int partn_shift;        /* low minor bits that select the partition */
++      int disks_per_major;    /* multiplier used when deriving the disk letter */
++      char *devname;          /* name passed to register_blkdev() */
++      char *diskname;         /* prefix of gendisk->disk_name */
++};
++
++/* Bookkeeping for one registered block major (one slot of major_info[]). */
++struct xlbd_major_info
++{
++      int major;              /* block major number */
++      int index;              /* position within the type's run of majors */
++      int usage;              /* bumped by xlbd_get_major_info(), dropped by put */
++      struct xlbd_type_info *type;
++      struct xlbd_minor_state *minors;        /* minor bitmap; the two xvd slots share one */
++};
++
++/*
++ * Per-ring-slot shadow state, indexed by request id (info->shadow[req->id]).
++ */
++struct blk_shadow {
++      blkif_request_t req;            /* copy of the request placed on the ring */
++      struct request *request;        /* originating block-layer request */
++      /* per-segment frame, fed through pfn_to_mfn() when re-granting */
++      unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++};
++
++#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
++
++/*
++ * We have one of these per vbd, whether ide, scsi or 'other'.  They
++ * hang in private_data off the gendisk structure. We may end up
++ * putting all kinds of interesting stuff here :-)
++ */
++struct blkfront_info
++{
++      struct xenbus_device *xbdev;    /* owning xenbus device; checked for NULL by users */
++      struct gendisk *gd;             /* set by xlvbd_add(), cleared by xlvbd_del() */
++      int vdevice;
++      blkif_vdev_t handle;
++      int connected;                  /* BLKIF_STATE_* value */
++      int ring_ref;
++      blkif_front_ring_t ring;
++      struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++      unsigned int irq;
++      struct xlbd_major_info *mi;     /* major bookkeeping held while the disk exists */
++      struct request_queue *rq;       /* queue created by xlvbd_add() */
++      struct work_struct work;
++      struct gnttab_free_callback callback;
++      struct blk_shadow shadow[BLK_RING_SIZE];        /* one entry per ring slot */
++      unsigned long shadow_free;      /* presumably the free-list head for shadow[] -- confirm */
++      int feature_flush;              /* flush/barrier mode applied by xlvbd_flush() */
++      int is_ready;                   /* reported through blkfront_is_ready() */
++
++      /**
++       * The number of people holding this device open.  We won't allow a
++       * hot-unplug unless this is 0.
++       */
++      int users;
++};
++
++/* Lock passed to blk_init_queue() for every vbd queue; defined in vbd.c. */
++extern spinlock_t blkif_io_lock;
++
++/*
++ * block_device_operations callbacks.  Two sets of prototypes are kept
++ * because the callback signatures differ before and after 2.6.28.
++ */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++extern int blkif_open(struct inode *inode, struct file *filep);
++extern int blkif_release(struct inode *inode, struct file *filep);
++extern int blkif_ioctl(struct inode *inode, struct file *filep,
++                     unsigned command, unsigned long argument);
++#else
++extern int blkif_open(struct block_device *bdev, fmode_t mode);
++extern int blkif_release(struct gendisk *disk, fmode_t mode);
++extern int blkif_ioctl(struct block_device *bdev, fmode_t mode,
++                     unsigned command, unsigned long argument);
++#endif
++extern int blkif_getgeo(struct block_device *, struct hd_geometry *);
++extern int blkif_check(dev_t dev);
++extern int blkif_revalidate(dev_t dev);
++/* Request function installed on each queue by blk_init_queue(). */
++extern void do_blkif_request (struct request_queue *rq);
++
++/* Virtual block-device subsystem (implemented in vbd.c). */
++/*
++ * Note that xlvbd_add() does not call add_disk() for you: the caller is
++ * expected to call add_disk() on info->gd once the disk is properly
++ * connected up.
++ */
++int xlvbd_add(blkif_sector_t capacity, int device,
++            u16 vdisk_info, u16 sector_size, struct blkfront_info *info);
++/* Tear down the gendisk and request queue created by xlvbd_add(). */
++void xlvbd_del(struct blkfront_info *info);
++/* Apply info->feature_flush to the request queue and log the setting. */
++void xlvbd_flush(struct blkfront_info *info);
++
++/* sysfs attribute helpers; they collapse to no-ops without CONFIG_SYSFS. */
++#ifdef CONFIG_SYSFS
++int xlvbd_sysfs_addif(struct blkfront_info *info);
++void xlvbd_sysfs_delif(struct blkfront_info *info);
++#else
++static inline int xlvbd_sysfs_addif(struct blkfront_info *info)
++{
++      return 0;
++}
++
++static inline void xlvbd_sysfs_delif(struct blkfront_info *info)
++{
++      ;
++}
++#endif
++
++/* Virtual cdrom block-device */
++extern void register_vcd(struct blkfront_info *info);
++extern void unregister_vcd(struct blkfront_info *info);
++
++#endif /* __XEN_DRIVERS_BLOCK_H__ */
diff --cc drivers/xen/blkfront/vbd.c

index 0000000,0000000..3ec0134

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkfront/vbd.c
@@@ -1,0 -1,0 +1,545 @@@
++/******************************************************************************
++ * vbd.c
++ * 
++ * XenLinux virtual block-device driver (xvd).
++ * 
++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
++ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
++ * Copyright (c) 2004-2005, Christian Limpach
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "block.h"
++#include <linux/blkdev.h>
++#include <linux/list.h>
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++/* Legacy vdevice encoding: minor in the low 8 bits, major above them. */
++#define BLKIF_MAJOR(dev) ((dev)>>8)
++#define BLKIF_MINOR(dev) ((dev) & 0xff)
++
++/* Extended encoding: bit 28 marks it, the remaining bits are the minor. */
++#define EXT_SHIFT 28
++#define EXTENDED (1<<EXT_SHIFT)
++#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
++#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
++
++/* Allocation map of the minors in use under one major. */
++struct xlbd_minor_state {
++      unsigned int nr;        /* number of bits currently held in bitmap */
++      unsigned long *bitmap;  /* one bit per minor; set means reserved */
++      spinlock_t lock;        /* guards bitmap and nr */
++};
++
++/*
++ * For convenience we distinguish between ide, scsi and 'other' (i.e.,
++ * potentially combinations of the two) in the naming scheme and in a few other
++ * places.
++ */
++
++/* Number of major_info[] slots set aside for each device type. */
++#define NUM_IDE_MAJORS 10
++#define NUM_SCSI_MAJORS 17
++#define NUM_VBD_MAJORS 2
++
++/* IDE naming: hdX, 6 partition bits per disk. */
++static struct xlbd_type_info xlbd_ide_type = {
++      .partn_shift = 6,
++      .disks_per_major = 2,
++      .devname = "ide",
++      .diskname = "hd",
++};
++
++/* SCSI naming: sdX, 4 partition bits per disk. */
++static struct xlbd_type_info xlbd_scsi_type = {
++      .partn_shift = 4,
++      .disks_per_major = 16,
++      .devname = "sd",
++      .diskname = "sd",
++};
++
++/* xvd naming for the first VBD slot: 4 partition bits per disk. */
++static struct xlbd_type_info xlbd_vbd_type = {
++      .partn_shift = 4,
++      .disks_per_major = 16,
++      .devname = "xvd",
++      .diskname = "xvd",
++};
++
++/* xvd naming for the second (extended) VBD slot: 8 partition bits per disk. */
++static struct xlbd_type_info xlbd_vbd_type_ext = {
++      .partn_shift = 8,
++      .disks_per_major = 256,
++      .devname = "xvd",
++      .diskname = "xvd",
++};
++
++/*
++ * One slot per supported major, laid out as IDE, then SCSI, then VBD.
++ * Slots are filled on first use by xlbd_alloc_major_info().
++ */
++static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
++                                       NUM_VBD_MAJORS];
++
++/* First slot of each type inside major_info[]. */
++#define XLBD_MAJOR_IDE_START  0
++#define XLBD_MAJOR_SCSI_START (NUM_IDE_MAJORS)
++#define XLBD_MAJOR_VBD_START  (NUM_IDE_MAJORS + NUM_SCSI_MAJORS)
++
++/* Slot ranges, written as GCC case ranges for use as switch labels. */
++#define XLBD_MAJOR_IDE_RANGE  XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1
++#define XLBD_MAJOR_SCSI_RANGE XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1
++#define XLBD_MAJOR_VBD_RANGE  XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1
++
++/* Maps either of the two VBD slots to the other one (XOR swap of indices). */
++#define XLBD_MAJOR_VBD_ALT(idx) ((idx) ^ XLBD_MAJOR_VBD_START ^ (XLBD_MAJOR_VBD_START + 1))
++
++/*
++ * Block device operations installed on every gendisk created by
++ * xlvbd_add().  .getgeo is only wired up on kernels >= 2.6.16.
++ */
++static const struct block_device_operations xlvbd_block_fops =
++{
++      .owner = THIS_MODULE,
++      .open = blkif_open,
++      .release = blkif_release,
++      .ioctl  = blkif_ioctl,
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
++      .getgeo = blkif_getgeo
++#endif
++};
++
++/* Single lock shared by all vbd request queues (see xlvbd_init_blk_queue()). */
++DEFINE_SPINLOCK(blkif_io_lock);
++
++/*
++ * Allocate the bookkeeping for slot @index of major_info[] and, unless the
++ * major is already registered through the sibling VBD slot, register the
++ * block major with the kernel.
++ *
++ * @major: block major number to register
++ * @minor: unused here
++ * @index: slot in major_info[]; selects the device type
++ *
++ * The two VBD slots share a single minor bitmap: when the sibling slot
++ * already exists, the freshly allocated bitmap is discarded in favour of
++ * the sibling's and no second register_blkdev() is attempted.
++ *
++ * Returns the new entry (also stored in major_info[index]) or NULL on
++ * allocation or registration failure.
++ */
++static struct xlbd_major_info *
++xlbd_alloc_major_info(int major, int minor, int index)
++{
++      struct xlbd_major_info *mi;
++      struct xlbd_minor_state *ms;
++      int need_register = 1;
++
++      mi = kzalloc(sizeof(*mi), GFP_KERNEL);
++      if (mi == NULL)
++              return NULL;
++      mi->major = major;
++
++      ms = kmalloc(sizeof(*ms), GFP_KERNEL);
++      if (ms == NULL)
++              goto free_mi;
++
++      ms->bitmap = kzalloc(BITS_TO_LONGS(256) * sizeof(*ms->bitmap),
++                           GFP_KERNEL);
++      if (ms->bitmap == NULL)
++              goto free_ms;
++
++      spin_lock_init(&ms->lock);
++      ms->nr = 256;
++
++      switch (index) {
++      case XLBD_MAJOR_IDE_RANGE:
++              mi->type = &xlbd_ide_type;
++              mi->index = index - XLBD_MAJOR_IDE_START;
++              break;
++      case XLBD_MAJOR_SCSI_RANGE:
++              mi->type = &xlbd_scsi_type;
++              mi->index = index - XLBD_MAJOR_SCSI_START;
++              break;
++      case XLBD_MAJOR_VBD_RANGE:
++              mi->index = 0;
++              mi->type = (index == XLBD_MAJOR_VBD_START) ?
++                         &xlbd_vbd_type : &xlbd_vbd_type_ext;
++
++              /*
++               * If the sibling slot already registered block major 202,
++               * reuse its minor map and do not register again.
++               */
++              if (major_info[XLBD_MAJOR_VBD_ALT(index)] != NULL) {
++                      kfree(ms->bitmap);
++                      kfree(ms);
++                      ms = major_info[XLBD_MAJOR_VBD_ALT(index)]->minors;
++                      need_register = 0;
++              }
++              break;
++      }
++
++      if (need_register) {
++              if (register_blkdev(mi->major, mi->type->devname))
++                      goto free_bitmap;
++
++              pr_info("xen-vbd: registered block device major %i\n",
++                      mi->major);
++      }
++
++      mi->minors = ms;
++      major_info[index] = mi;
++      return mi;
++
++ free_bitmap:
++      kfree(ms->bitmap);
++ free_ms:
++      kfree(ms);
++ free_mi:
++      kfree(mi);
++      return NULL;
++}
++
++/*
++ * Look up (allocating on first use) the major_info[] entry for @major and
++ * take a usage reference on it.
++ *
++ * Known IDE and SCSI majors map to fixed slots; anything else lands in one
++ * of the two VBD slots depending on whether @vdevice uses the extended
++ * encoding.
++ *
++ * Returns the entry, or NULL if it could not be allocated.
++ */
++static struct xlbd_major_info *
++xlbd_get_major_info(int major, int minor, int vdevice)
++{
++      struct xlbd_major_info *entry;
++      int slot;
++
++      switch (major) {
++      case IDE0_MAJOR: slot = 0; break;
++      case IDE1_MAJOR: slot = 1; break;
++      case IDE2_MAJOR: slot = 2; break;
++      case IDE3_MAJOR: slot = 3; break;
++      case IDE4_MAJOR: slot = 4; break;
++      case IDE5_MAJOR: slot = 5; break;
++      case IDE6_MAJOR: slot = 6; break;
++      case IDE7_MAJOR: slot = 7; break;
++      case IDE8_MAJOR: slot = 8; break;
++      case IDE9_MAJOR: slot = 9; break;
++      case SCSI_DISK0_MAJOR: slot = 10; break;
++      case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
++              slot = 11 + major - SCSI_DISK1_MAJOR;
++              break;
++      case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR:
++              slot = 18 + major - SCSI_DISK8_MAJOR;
++              break;
++      case SCSI_CDROM_MAJOR: slot = 26; break;
++      default:
++              slot = VDEV_IS_EXTENDED(vdevice) ? 28 : 27;
++              break;
++      }
++
++      entry = major_info[slot];
++      if (entry == NULL)
++              entry = xlbd_alloc_major_info(major, minor, slot);
++      if (entry != NULL)
++              entry->usage++;
++
++      return entry;
++}
++
++/* Drop one usage reference taken by xlbd_get_major_info(). */
++static void
++xlbd_put_major_info(struct xlbd_major_info *mi)
++{
++      /* XXX: release major if 0 */
++      --mi->usage;
++}
++
++/*
++ * Reserve the minors [minor, minor + nr_minors) under @mi.
++ *
++ * The bitmap is grown first if the range does not fit.  Returns 0 when the
++ * whole range was free and is now marked in use, -EBUSY if any minor in the
++ * range was already taken, or -ENOMEM if the bitmap could not be grown.
++ */
++static int
++xlbd_reserve_minors(struct xlbd_major_info *mi, unsigned int minor,
++                  unsigned int nr_minors)
++{
++      struct xlbd_minor_state *ms = mi->minors;
++      unsigned int end = minor + nr_minors;
++      int rc;
++
++      if (end > ms->nr) {
++              unsigned long *bitmap, *old;
++
++              /* Allocate before taking the spinlock. */
++              bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
++                               GFP_KERNEL);
++              if (bitmap == NULL)
++                      return -ENOMEM;
++
++              spin_lock(&ms->lock);
++              /* Re-check under the lock: the map may have grown meanwhile. */
++              if (end > ms->nr) {
++                      old = ms->bitmap;
++                      memcpy(bitmap, ms->bitmap,
++                             BITS_TO_LONGS(ms->nr) * sizeof(*bitmap));
++                      ms->bitmap = bitmap;
++                      ms->nr = BITS_TO_LONGS(end) * BITS_PER_LONG;
++              } else
++                      old = bitmap;   /* lost the race; discard our copy */
++              spin_unlock(&ms->lock);
++              kfree(old);
++      }
++
++      spin_lock(&ms->lock);
++      /* No set bit in [minor, end) means the whole range is free. */
++      if (find_next_bit(ms->bitmap, end, minor) >= end) {
++              for (; minor < end; ++minor)
++                      __set_bit(minor, ms->bitmap);
++              rc = 0;
++      } else
++              rc = -EBUSY;
++      spin_unlock(&ms->lock);
++
++      return rc;
++}
++
++/*
++ * Mark the minors [minor, minor + nr_minors) under @mi as free again.
++ * The range must lie inside the current bitmap (BUG otherwise).
++ */
++static void
++xlbd_release_minors(struct xlbd_major_info *mi, unsigned int minor,
++                  unsigned int nr_minors)
++{
++      struct xlbd_minor_state *state = mi->minors;
++      unsigned int limit = minor + nr_minors;
++      unsigned int bit;
++
++      BUG_ON(limit > state->nr);
++
++      spin_lock(&state->lock);
++      for (bit = minor; bit < limit; bit++)
++              __clear_bit(bit, state->bitmap);
++      spin_unlock(&state->lock);
++}
++
++/*
++ * Create the request queue for @gd and configure its limits.
++ *
++ * @gd:          disk that receives the queue (stored in gd->queue)
++ * @sector_size: logical block size to advertise
++ *
++ * Returns 0 on success, -1 if the queue could not be allocated.
++ */
++static int
++xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
++{
++      struct request_queue *q = blk_init_queue(do_blkif_request,
++                                               &blkif_io_lock);
++
++      if (q == NULL)
++              return -1;
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
++      queue_flag_set_unlocked(QUEUE_FLAG_VIRT, q);
++#endif
++
++      /* Block size and transfer size mimic the equivalent hardware. */
++      blk_queue_logical_block_size(q, sector_size);
++      blk_queue_max_hw_sectors(q, 512);
++
++      /* A segment never crosses a page and is at most one page long. */
++      blk_queue_segment_boundary(q, PAGE_SIZE - 1);
++      blk_queue_max_segment_size(q, PAGE_SIZE);
++
++      /* A merged request must still fit into one I/O ring slot. */
++      blk_queue_max_segments(q, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++      /* Buffer addresses have to be sector-aligned. */
++      blk_queue_dma_alignment(q, 511);
++
++      /* Never fall back to bounce buffers. */
++      blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
++
++      gd->queue = q;
++
++      return 0;
++}
++
++/*
++ * Create (but do not register) the gendisk and request queue for one vbd.
++ *
++ * @capacity:    size handed to set_capacity()
++ * @vdevice:     virtual device number, in either the legacy major/minor
++ *               encoding or the extended encoding (bit EXT_SHIFT set)
++ * @vdisk_info:  VDISK_* flags (CDROM, READONLY, REMOVABLE)
++ * @sector_size: logical block size for the request queue
++ * @info:        frontend state; info->mi, info->gd and info->rq must be
++ *               NULL on entry and are filled in on success
++ *
++ * add_disk() is not called here; the caller does that once connected.
++ *
++ * Returns 0 on success, the error from xlbd_reserve_minors(), or -ENODEV
++ * for every other failure.
++ */
++int
++xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
++        u16 sector_size, struct blkfront_info *info)
++{
++      int major, minor;
++      struct gendisk *gd;
++      struct xlbd_major_info *mi;
++      int nr_minors = 1;
++      int err = -ENODEV;
++      unsigned int offset;
++
++      if ((vdevice>>EXT_SHIFT) > 1) {
++              /* this is above the extended range; something is wrong */
++              pr_warning("blkfront: vdevice %#x is above the extended range;"
++                         " ignoring\n", vdevice);
++              return -ENODEV;
++      }
++
++      if (!VDEV_IS_EXTENDED(vdevice)) {
++              major = BLKIF_MAJOR(vdevice);
++              minor = BLKIF_MINOR(vdevice);
++      }
++      else {
++              major = 202;
++              minor = BLKIF_MINOR_EXT(vdevice);
++      }
++
++      BUG_ON(info->gd != NULL);
++      BUG_ON(info->mi != NULL);
++      BUG_ON(info->rq != NULL);
++
++      mi = xlbd_get_major_info(major, minor, vdevice);
++      if (mi == NULL)
++              goto out;
++      info->mi = mi;
++
++      /*
++       * A whole disk (partition bits all zero) that is not a CD-ROM owns
++       * every minor of its partition range; anything else gets one minor.
++       */
++      if (!(vdisk_info & VDISK_CDROM) &&
++          (minor & ((1 << mi->type->partn_shift) - 1)) == 0)
++              nr_minors = 1 << mi->type->partn_shift;
++
++      err = xlbd_reserve_minors(mi, minor, nr_minors);
++      if (err)
++              goto out;
++      err = -ENODEV;
++
++      gd = alloc_disk(nr_minors);
++      if (gd == NULL)
++              goto release;
++
++      /* Disk ordinal that is turned into the letter suffix of the name. */
++      offset =  mi->index * mi->type->disks_per_major +
++                      (minor >> mi->type->partn_shift);
++      if (nr_minors > 1 || (vdisk_info & VDISK_CDROM)) {
++              if (offset < 26) {
++                      sprintf(gd->disk_name, "%s%c",
++                               mi->type->diskname, 'a' + offset );
++              }
++              else {
++                      sprintf(gd->disk_name, "%s%c%c",
++                              mi->type->diskname,
++                              'a' + ((offset/26)-1), 'a' + (offset%26) );
++              }
++      }
++      else {
++              /* Single partition: append the partition number as well. */
++              if (offset < 26) {
++                      sprintf(gd->disk_name, "%s%c%d",
++                              mi->type->diskname,
++                              'a' + offset,
++                              minor & ((1 << mi->type->partn_shift) - 1));
++              }
++              else {
++                      sprintf(gd->disk_name, "%s%c%c%d",
++                              mi->type->diskname,
++                              'a' + ((offset/26)-1), 'a' + (offset%26),
++                              minor & ((1 << mi->type->partn_shift) - 1));
++              }
++      }
++
++      gd->major = mi->major;
++      gd->first_minor = minor;
++      gd->fops = &xlvbd_block_fops;
++      gd->private_data = info;
++      gd->driverfs_dev = &(info->xbdev->dev);
++      set_capacity(gd, capacity);
++
++      if (xlvbd_init_blk_queue(gd, sector_size)) {
++              /*
++               * The disk was only allocated, never added, so drop the
++               * alloc_disk() reference instead of calling del_gendisk().
++               */
++              put_disk(gd);
++              goto release;
++      }
++
++      info->rq = gd->queue;
++      info->gd = gd;
++
++      xlvbd_flush(info);
++
++      if (vdisk_info & VDISK_READONLY)
++              set_disk_ro(gd, 1);
++
++      if (vdisk_info & VDISK_REMOVABLE)
++              gd->flags |= GENHD_FL_REMOVABLE;
++
++      if (vdisk_info & VDISK_CDROM)
++              gd->flags |= GENHD_FL_CD;
++
++      return 0;
++
++ release:
++      xlbd_release_minors(mi, minor, nr_minors);
++ out:
++      if (mi)
++              xlbd_put_major_info(mi);
++      info->mi = NULL;
++      return err;
++}
++
++/*
++ * Undo xlvbd_add(): remove and release the gendisk, give back its minors
++ * and major reference, then clean up the request queue.  Does nothing if
++ * info->mi is NULL (no disk was set up).
++ */
++void
++xlvbd_del(struct blkfront_info *info)
++{
++      unsigned int minor, nr_minors;
++
++      if (info->mi == NULL)
++              return;
++
++      BUG_ON(info->gd == NULL);
++      /* Read the minor range before the gendisk is released. */
++      minor = info->gd->first_minor;
++      nr_minors = info->gd->minors;
++      del_gendisk(info->gd);
++      put_disk(info->gd);
++      info->gd = NULL;
++
++      xlbd_release_minors(info->mi, minor, nr_minors);
++      xlbd_put_major_info(info->mi);
++      info->mi = NULL;
++
++      BUG_ON(info->rq == NULL);
++      blk_cleanup_queue(info->rq);
++      info->rq = NULL;
++}
++
++/*
++ * Apply info->feature_flush to the request queue and log the resulting
++ * barrier mode.  The mechanism depends on the kernel version:
++ *   >= 2.6.37: blk_queue_flush()
++ *   >= 2.6.16: blk_queue_ordered()
++ *   older:     nothing to configure; only a message is logged
++ *
++ * The function returns void, so a failure in the middle branch cannot be
++ * reported and just skips the log message.
++ */
++void
++xlvbd_flush(struct blkfront_info *info)
++{
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
++      blk_queue_flush(info->rq, info->feature_flush);
++      pr_info("blkfront: %s: barriers %s\n",
++              info->gd->disk_name,
++              info->feature_flush ? "enabled" : "disabled");
++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
++      int err;
++      const char *barrier;
++
++      switch (info->feature_flush) {
++      case QUEUE_ORDERED_DRAIN:       barrier = "enabled (drain)"; break;
++      case QUEUE_ORDERED_TAG:         barrier = "enabled (tag)"; break;
++      case QUEUE_ORDERED_NONE:        barrier = "disabled"; break;
++      default:                        return; /* unknown mode: leave queue alone */
++      }
++
++      err = blk_queue_ordered(info->rq, info->feature_flush);
++      if (err)
++              return;
++      pr_info("blkfront: %s: barriers %s\n",
++              info->gd->disk_name, barrier);
++#else
++      if (info->feature_flush)
++              pr_info("blkfront: %s: barriers disabled\n", info->gd->disk_name);
++#endif
++}
++
++#ifdef CONFIG_SYSFS
++/*
++ * sysfs "media" attribute: prints "cdrom" for disks flagged GENHD_FL_CD
++ * and "disk" for everything else.
++ */
++static ssize_t show_media(struct device *dev,
++                                struct device_attribute *attr, char *buf)
++{
++      struct xenbus_device *xbdev = to_xenbus_device(dev);
++      struct blkfront_info *bfi = dev_get_drvdata(&xbdev->dev);
++
++      if (!(bfi->gd->flags & GENHD_FL_CD))
++              return sprintf(buf, "disk\n");
++
++      return sprintf(buf, "cdrom\n");
++}
++
++/* Attributes created by xlvbd_sysfs_addif() and removed by xlvbd_sysfs_delif(). */
++static struct device_attribute xlvbd_attrs[] = {
++      __ATTR(media, S_IRUGO, show_media, NULL),
++};
++
++/*
++ * Create every attribute in xlvbd_attrs[] on the disk's driverfs device.
++ *
++ * If one creation fails, the attributes already created are removed again
++ * and the error is returned; otherwise returns 0.
++ */
++int xlvbd_sysfs_addif(struct blkfront_info *info)
++{
++      int idx;
++      int rc;
++
++      for (idx = 0; idx < ARRAY_SIZE(xlvbd_attrs); idx++) {
++              rc = device_create_file(info->gd->driverfs_dev,
++                                      &xlvbd_attrs[idx]);
++              if (!rc)
++                      continue;
++
++              /* Roll back whatever was created before the failure. */
++              while (--idx >= 0)
++                      device_remove_file(info->gd->driverfs_dev,
++                                         &xlvbd_attrs[idx]);
++              return rc;
++      }
++
++      return 0;
++}
++
++/* Remove every attribute in xlvbd_attrs[] from the disk's driverfs device. */
++void xlvbd_sysfs_delif(struct blkfront_info *info)
++{
++      struct device_attribute *attr;
++
++      for (attr = xlvbd_attrs;
++           attr < xlvbd_attrs + ARRAY_SIZE(xlvbd_attrs); attr++)
++              device_remove_file(info->gd->driverfs_dev, attr);
++}
++
++#endif /* CONFIG_SYSFS */
diff --cc drivers/xen/blkfront/vcd.c

index 0000000,0000000..63f7b52

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blkfront/vcd.c
@@@ -1,0 -1,0 +1,507 @@@
++/*******************************************************************************
++ * vcd.c
++ *
++ * Implements CDROM cmd packet passing between frontend guest and backend driver.
++ *
++ * Copyright (c) 2008, Pat Campell  plc@novell.com
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#define REVISION "$Revision: 1.0 $"
++
++#include <linux/module.h>
++#include <linux/blkdev.h>
++#include <linux/list.h>
++#include <linux/cdrom.h>
++#include <xen/interface/io/cdromif.h>
++#include "block.h"
++
++/* List of cdrom_device_info, can have as many as blkfront supports */
++struct vcd_disk {
++      struct list_head vcd_entry;
++      struct cdrom_device_info vcd_cdrom_info;
++      spinlock_t vcd_cdrom_info_lock;
++};
++static LIST_HEAD(vcd_disks);
++static DEFINE_SPINLOCK(vcd_disks_lock);
++
++/*
++ * Find the vcd_disk whose cdrom info refers to @disk.
++ *
++ * Returns NULL if there is none.  On a match the entry is returned with
++ * its vcd_cdrom_info_lock HELD; the caller must unlock it when done.
++ */
++static struct vcd_disk *xencdrom_get_list_entry(struct gendisk *disk)
++{
++      struct vcd_disk *ret_vcd = NULL;
++      struct vcd_disk *vcd;
++
++      spin_lock(&vcd_disks_lock);
++      list_for_each_entry(vcd, &vcd_disks, vcd_entry) {
++              if (vcd->vcd_cdrom_info.disk == disk) {
++                      /* Taken here, released by the caller. */
++                      spin_lock(&vcd->vcd_cdrom_info_lock);
++                      ret_vcd = vcd;
++                      break;
++              }
++      }
++      spin_unlock(&vcd_disks_lock);
++      return ret_vcd;
++}
++
++/*
++ * Send the one-page message at @sp to the backend as a block-PC request
++ * on info->rq and wait for it to complete.
++ *
++ * If the buffer cannot be mapped into the request, nothing is sent.  No
++ * status is returned; callers read the reply out of @sp.
++ */
++static void submit_message(struct blkfront_info *info, void *sp)
++{
++      struct request *rq;
++
++      rq = blk_get_request(info->rq, READ, __GFP_WAIT);
++      if (blk_rq_map_kern(info->rq, rq, sp, PAGE_SIZE, __GFP_WAIT) == 0) {
++              rq->rq_disk = info->gd;
++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
++              rq->cmd_type = REQ_TYPE_BLOCK_PC;
++              rq->cmd_flags |= REQ_NOMERGE;
++#else
++              rq->flags |= REQ_BLOCK_PC;
++#endif
++              rq->__sector = 0;
++              rq->__data_len = PAGE_SIZE;
++              rq->timeout = 60*HZ;
++
++              blk_execute_rq(rq->q, info->gd, rq, 1);
++      }
++
++      blk_put_request(rq);
++}
++
++/*
++ * Marshal a CDROM packet command into a freshly allocated page, send it to
++ * the backend and copy the sense data and data buffer back into @cgc.
++ *
++ * @info: frontend the command is sent through
++ * @cgc:  packet command; cgc->sense and cgc->buffer are updated from the
++ *        reply when they are non-NULL
++ *
++ * Returns -EIO if the data buffer is larger than MAX_PACKET_DATA, -ENOMEM
++ * if no page could be allocated, the backend's error value if the reply
++ * reports a failure, and 0 otherwise.
++ */
++static int submit_cdrom_cmd(struct blkfront_info *info,
++                          struct packet_command *cgc)
++{
++      int ret = 0;
++      struct page *page;
++      union xen_block_packet *sp;
++      struct xen_cdrom_packet *xcp;
++      struct vcd_generic_command *vgc;
++
++      if (cgc->buffer && cgc->buflen > MAX_PACKET_DATA) {
++              pr_warn("%s() Packet buffer length is too large\n", __func__);
++              return -EIO;
++      }
++
++      page = alloc_page(GFP_NOIO|__GFP_ZERO);
++      if (!page) {
++              pr_crit("%s() Unable to allocate page\n", __func__);
++              return -ENOMEM;
++      }
++
++      sp = page_address(page);
++      xcp = &(sp->xcp);
++      xcp->type = XEN_TYPE_CDROM_PACKET;
++      xcp->payload_offset = PACKET_PAYLOAD_OFFSET;
++
++      /* The generic command sits at payload_offset inside the page. */
++      vgc = (struct vcd_generic_command *)((char *)sp + xcp->payload_offset);
++      memcpy(vgc->cmd, cgc->cmd, CDROM_PACKET_SIZE);
++      vgc->stat = cgc->stat;
++      vgc->data_direction = cgc->data_direction;
++      vgc->quiet = cgc->quiet;
++      vgc->timeout = cgc->timeout;
++      if (cgc->sense) {
++              vgc->sense_offset = PACKET_SENSE_OFFSET;
++              memcpy((char *)sp + vgc->sense_offset, cgc->sense, sizeof(struct request_sense));
++      }
++      if (cgc->buffer) {
++              vgc->buffer_offset = PACKET_BUFFER_OFFSET;
++              memcpy((char *)sp + vgc->buffer_offset, cgc->buffer, cgc->buflen);
++              vgc->buflen = cgc->buflen;
++      }
++
++      submit_message(info,sp);
++
++      if (xcp->ret)
++              ret = xcp->err;
++
++      /* Copy results back even when the backend reported an error. */
++      if (cgc->sense) {
++              memcpy(cgc->sense, (char *)sp + PACKET_SENSE_OFFSET, sizeof(struct request_sense));
++      }
++      if (cgc->buffer && cgc->buflen) {
++              memcpy(cgc->buffer, (char *)sp + PACKET_BUFFER_OFFSET, cgc->buflen);
++      }
++
++      __free_page(page);
++      return ret;
++}
++
++
++/*
++ * cdrom_device_ops.open: send an "open" message carrying the backend path
++ * and, when the reply says media is present, update the disk capacity.
++ *
++ * Returns -ENODEV without a xenbus device, -EIO if the backend path is
++ * longer than MAX_PACKET_DATA, -ENOMEM if no page could be allocated, the
++ * backend's error value if the reply reports a failure, and 0 otherwise.
++ * @purpose is not used.
++ */
++static int xencdrom_open(struct cdrom_device_info *cdi, int purpose)
++{
++      struct blkfront_info *info = cdi->disk->private_data;
++      union xen_block_packet *sp;
++      struct xen_cdrom_open *xco;
++      struct page *page;
++      int ret = 0;
++
++      if (!info->xbdev)
++              return -ENODEV;
++
++      if (strlen(info->xbdev->otherend) > MAX_PACKET_DATA)
++              return -EIO;
++
++      page = alloc_page(GFP_NOIO|__GFP_ZERO);
++      if (!page) {
++              pr_crit("%s() Unable to allocate page\n", __func__);
++              return -ENOMEM;
++      }
++
++      sp = page_address(page);
++      xco = &(sp->xco);
++      xco->type = XEN_TYPE_CDROM_OPEN;
++      xco->payload_offset = sizeof(struct xen_cdrom_open);
++      /* The backend path goes right after the fixed-size header. */
++      strcpy((char *)sp + xco->payload_offset, info->xbdev->otherend);
++
++      submit_message(info,sp);
++
++      if (xco->ret)
++              ret = xco->err;
++      else if (xco->media_present)
++              set_capacity(cdi->disk, xco->sectors);
++
++      __free_page(page);
++      return ret;
++}
++
++/* cdrom_device_ops.release: intentionally a no-op. */
++static void xencdrom_release(struct cdrom_device_info *cdi)
++{
++}
++
++/*
++ * cdrom_device_ops.media_changed: ask the backend whether the medium
++ * changed.
++ *
++ * Returns the media_changed field of the reply, or -ENOMEM if no page
++ * could be allocated.  @disc_nr is not used.
++ */
++static int xencdrom_media_changed(struct cdrom_device_info *cdi, int disc_nr)
++{
++      struct blkfront_info *info = cdi->disk->private_data;
++      union xen_block_packet *sp;
++      struct xen_cdrom_media_changed *xcmc;
++      struct page *page;
++      int changed;
++
++      page = alloc_page(GFP_NOIO|__GFP_ZERO);
++      if (!page) {
++              pr_crit("%s() Unable to allocate page\n", __func__);
++              return -ENOMEM;
++      }
++
++      sp = page_address(page);
++      xcmc = &(sp->xcmc);
++      xcmc->type = XEN_TYPE_CDROM_MEDIA_CHANGED;
++      submit_message(info,sp);
++
++      /* Pick up the answer before the page goes away. */
++      changed = xcmc->media_changed;
++      __free_page(page);
++
++      return changed;
++}
++
++/*
++ * cdrom_device_ops.tray_move: issue START STOP UNIT with byte 4 set to 2
++ * when @position is non-zero and to 3 otherwise.
++ *
++ * Returns the result of submit_cdrom_cmd().
++ */
++static int xencdrom_tray_move(struct cdrom_device_info *cdi, int position)
++{
++      struct blkfront_info *info = cdi->disk->private_data;
++      struct packet_command cgc;
++
++      init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
++      cgc.cmd[0] = GPCMD_START_STOP_UNIT;
++      cgc.cmd[4] = position ? 2 : 3;
++
++      return submit_cdrom_cmd(info, &cgc);
++}
++
++/*
++ * cdrom_device_ops.lock_door: issue PREVENT ALLOW MEDIUM REMOVAL with
++ * @lock placed in byte 4 of the command.
++ *
++ * Returns the result of submit_cdrom_cmd().
++ */
++static int xencdrom_lock_door(struct cdrom_device_info *cdi, int lock)
++{
++      struct blkfront_info *info = cdi->disk->private_data;
++      struct packet_command cgc;
++
++      init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
++      cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL;
++      cgc.cmd[4] = lock;
++
++      return submit_cdrom_cmd(info, &cgc);
++}
++
++/*
++ * cdrom_device_ops.generic_packet: forward @cgc to the backend and record
++ * the outcome in cgc->stat.
++ *
++ * Returns the result of submit_cdrom_cmd().
++ */
++static int xencdrom_packet(struct cdrom_device_info *cdi,
++              struct packet_command *cgc)
++{
++      struct blkfront_info *info = cdi->disk->private_data;
++      int status = submit_cdrom_cmd(info, cgc);
++
++      cgc->stat = status;
++      return status;
++}
++
++/* cdrom_device_ops.audio_ioctl: not implemented; always returns -EINVAL. */
++static int xencdrom_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
++              void *arg)
++{
++      return -EINVAL;
++}
++
++/*
++ * Query backend to see if CDROM packets are supported.
++ *
++ * Sends a XEN_TYPE_CDROM_SUPPORT message in a freshly zeroed page and
++ * returns the "supported" field found in that page after submit_message()
++ * has returned.  Returns -ENOMEM if the message page cannot be allocated,
++ * so callers must treat negative values as failure rather than as
++ * "supported".
++ */
++static int xencdrom_supported(struct blkfront_info *info)
++{
++      struct page *page;
++      union xen_block_packet *sp;
++      struct xen_cdrom_support *xcs;
++      int supported;
++
++      page = alloc_page(GFP_NOIO|__GFP_ZERO);
++      if (!page) {
++              pr_crit("%s() Unable to allocate page\n", __func__);
++              return -ENOMEM;
++      }
++
++      sp = page_address(page);
++      xcs = &(sp->xcs);
++      xcs->type = XEN_TYPE_CDROM_SUPPORT;
++      submit_message(info,sp);
++      supported = xcs->supported;
++
++      /*
++       * The answer has been copied out; release the message page.  It used
++       * to be leaked on every call (compare xencdrom_media_changed()).
++       */
++      __free_page(page);
++
++      return supported;
++}
++
++/*
++ * Operations table for the virtual CD-ROM; register_vcd() installs it as
++ * vcd_cdrom_info.ops before calling register_cdrom().
++ *
++ * .capability lists what the device claims to support.  register_vcd()
++ * additionally sets a per-device mask, and CDROM_GET_CAPABILITY in
++ * xencdrom_block_ioctl() reports capability & ~mask.
++ */
++static struct cdrom_device_ops xencdrom_dops = {
++    .open           = xencdrom_open,
++    .release        = xencdrom_release,
++    .media_changed  = xencdrom_media_changed,
++    .tray_move      = xencdrom_tray_move,
++    .lock_door      = xencdrom_lock_door,
++    .generic_packet = xencdrom_packet,
++    .audio_ioctl    = xencdrom_audio_ioctl,
++    .capability     = (CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK | \
++                       CDC_MEDIA_CHANGED | CDC_GENERIC_PACKET |  CDC_DVD | \
++                       CDC_CD_R),
++    .n_minors       = 1,
++};
++
++/*
++ * block_device_operations.open for virtual CD-ROM disks.
++ *
++ * Two function headers are provided because the callback signature differs
++ * before and after kernel 2.6.28; both share the body below.
++ *
++ * Returns -ENODEV when the blkfront device has no xenbus device, otherwise
++ * the result of cdrom_open(), or 0 when the disk has no vcd entry.
++ *
++ * NOTE(review): xencdrom_get_list_entry() is defined outside this part of
++ * the file; it appears to return with vcd_cdrom_info_lock held, since the
++ * lock is only released here -- confirm.  If so, cdrom_open() runs under a
++ * spinlock.
++ */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++static int xencdrom_block_open(struct inode *inode, struct file *file)
++{
++      struct block_device *bd = inode->i_bdev;
++#else
++static int xencdrom_block_open(struct block_device *bd, fmode_t mode)
++{
++#endif
++      struct blkfront_info *info = bd->bd_disk->private_data;
++      struct vcd_disk *vcd;
++      int ret = 0;
++
++      if (!info->xbdev)
++              return -ENODEV;
++
++      if ((vcd = xencdrom_get_list_entry(info->gd))) {
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++              ret = cdrom_open(&vcd->vcd_cdrom_info, inode, file);
++#else
++              ret = cdrom_open(&vcd->vcd_cdrom_info, bd, mode);
++#endif
++              /* Mirror the cdrom layer's open count into blkfront. */
++              info->users = vcd->vcd_cdrom_info.use_count;
++              spin_unlock(&vcd->vcd_cdrom_info_lock);
++      }
++      return ret;
++}
++
++/*
++ * block_device_operations.release for virtual CD-ROM disks.
++ *
++ * Two function headers are provided because the callback signature differs
++ * before and after kernel 2.6.28; both share the body below.
++ *
++ * Calls cdrom_release() for the disk's vcd entry and, once the cdrom
++ * layer's use_count has reached zero, hands over to blkif_release().
++ * ret is only assigned from cdrom_release() on pre-2.6.28 kernels; on
++ * newer kernels the function always returns 0.
++ */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++static int xencdrom_block_release(struct inode *inode, struct file *file)
++{
++      struct gendisk *gd = inode->i_bdev->bd_disk;
++#else
++static int xencdrom_block_release(struct gendisk *gd, fmode_t mode)
++{
++#endif
++      struct blkfront_info *info = gd->private_data;
++      struct vcd_disk *vcd;
++      int ret = 0;
++
++      if ((vcd = xencdrom_get_list_entry(info->gd))) {
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++              ret = cdrom_release(&vcd->vcd_cdrom_info, file);
++#else
++              cdrom_release(&vcd->vcd_cdrom_info, mode);
++#endif
++              spin_unlock(&vcd->vcd_cdrom_info_lock);
++              /* NOTE(review): use_count is read after the lock is dropped. */
++              if (vcd->vcd_cdrom_info.use_count == 0) {
++                      /*
++                       * Presumably forces blkif_release() to see this as
++                       * the last user -- TODO confirm against blkif_release().
++                       */
++                      info->users = 1;
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++                      blkif_release(inode, file);
++#else
++                      blkif_release(gd, mode);
++#endif
++              }
++      }
++      return ret;
++}
++
++/*
++ * block_device_operations.ioctl for virtual CD-ROM disks.
++ *
++ * Two function headers are provided because the callback signature differs
++ * before and after kernel 2.6.28; both share the body below.
++ *
++ * A handful of CD-ROM commands are handled here; everything else (and every
++ * command when the disk has no vcd entry) is passed on to blkif_ioctl().
++ *
++ * Control flow: the "out" label lives inside the switch's default case, so
++ * "goto out" from the lookup failure jumps straight to the blkif_ioctl()
++ * call without touching the lock, while the default case unlocks first and
++ * then falls into the same call.
++ */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++static int xencdrom_block_ioctl(struct inode *inode, struct file *file,
++                              unsigned cmd, unsigned long arg)
++{
++      struct block_device *bd = inode->i_bdev;
++#else
++static int xencdrom_block_ioctl(struct block_device *bd, fmode_t mode,
++                              unsigned cmd, unsigned long arg)
++{
++#endif
++      struct blkfront_info *info = bd->bd_disk->private_data;
++      struct vcd_disk *vcd;
++      int ret = 0;
++
++      if (!(vcd = xencdrom_get_list_entry(info->gd)))
++              goto out;
++
++      switch (cmd) {
++      case 2285: /* SG_IO -- NOTE(review): SG_IO is 0x2285 in <scsi/sg.h>;
++                  * this decimal literal would not match it, confirm intent */
++              ret = -ENOSYS;
++              break;
++      case CDROMEJECT:
++              ret = xencdrom_tray_move(&vcd->vcd_cdrom_info, 1);
++              break;
++      case CDROMCLOSETRAY:
++              ret = xencdrom_tray_move(&vcd->vcd_cdrom_info, 0);
++              break;
++      case CDROM_GET_CAPABILITY:
++              ret = vcd->vcd_cdrom_info.ops->capability & ~vcd->vcd_cdrom_info.mask;
++              break;
++      case CDROM_SET_OPTIONS:
++              /* Only reports the current options; arg is not applied. */
++              ret = vcd->vcd_cdrom_info.options;
++              break;
++      case CDROM_SEND_PACKET:
++              /*
++               * NOTE(review): arg is cast to a packet_command pointer and
++               * passed on as-is; no copy_from_user() is visible here.
++               * Verify that submit_cdrom_cmd() handles user pointers.
++               */
++              ret = submit_cdrom_cmd(info, (struct packet_command *)arg);
++              break;
++      default:
++              spin_unlock(&vcd->vcd_cdrom_info_lock);
++out:
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
++              return blkif_ioctl(inode, file, cmd, arg);
++#else
++              return blkif_ioctl(bd, mode, cmd, arg);
++#endif
++      }
++      spin_unlock(&vcd->vcd_cdrom_info_lock);
++      return ret;
++}
++
++/*
++ * block_device_operations.media_changed for virtual CD-ROM disks.
++ *
++ * Called as result of cdrom_open, vcd_cdrom_info_lock already held.
++ *
++ * Looks the disk up in vcd_disks under vcd_disks_lock and, if an entry
++ * exists, returns cdrom_media_changed() for it; otherwise returns 0.
++ */
++static int xencdrom_block_media_changed(struct gendisk *disk)
++{
++      struct vcd_disk *pos;
++      struct vcd_disk *match = NULL;
++
++      spin_lock(&vcd_disks_lock);
++      list_for_each_entry(pos, &vcd_disks, vcd_entry) {
++              if (pos->vcd_cdrom_info.disk != disk)
++                      continue;
++              match = pos;
++              break;
++      }
++      spin_unlock(&vcd_disks_lock);
++
++      if (match == NULL)
++              return 0;
++
++      return cdrom_media_changed(&match->vcd_cdrom_info);
++}
++
++/*
++ * Block-device operations used for virtual CD-ROM disks; register_vcd()
++ * installs this table as gd->fops once register_cdrom() has succeeded.
++ */
++static const struct block_device_operations xencdrom_bdops =
++{
++      .owner          = THIS_MODULE,
++      .open           = xencdrom_block_open,
++      .release        = xencdrom_block_release,
++      .ioctl          = xencdrom_block_ioctl,
++      .media_changed  = xencdrom_block_media_changed,
++};
++
++/*
++ * Register a blkfront disk with the kernel's uniform CD-ROM layer.
++ *
++ * Nothing happens unless the gendisk is flagged GENHD_FL_CD and the backend
++ * reports CD-ROM packet support.  On success a new vcd_disk is added to
++ * vcd_disks and the gendisk's fops are switched to xencdrom_bdops.  All
++ * failures are silent towards the caller (only logged).
++ */
++void register_vcd(struct blkfront_info *info)
++{
++      struct gendisk *gd = info->gd;
++      struct vcd_disk *vcd;
++
++      /* Make sure this is for a CD device */
++      if (!(gd->flags & GENHD_FL_CD))
++              goto out;
++
++      /*
++       * Make sure we have backend support.  xencdrom_supported() returns
++       * -ENOMEM when it could not even send the query; treat that like
++       * "not supported" instead of registering a device blindly.
++       */
++      if (xencdrom_supported(info) <= 0)
++              goto out;
++
++      /* Create new vcd_disk and fill in cdrom_info */
++      vcd = kzalloc(sizeof(*vcd), GFP_KERNEL);
++      if (!vcd) {
++              pr_info("%s(): Unable to allocate vcd struct!\n", __func__);
++              goto out;
++      }
++      spin_lock_init(&vcd->vcd_cdrom_info_lock);
++
++      vcd->vcd_cdrom_info.ops = &xencdrom_dops;
++      vcd->vcd_cdrom_info.speed = 4;
++      vcd->vcd_cdrom_info.capacity = 1;
++      vcd->vcd_cdrom_info.options     = 0;
++      /* Bounded copy: the disk name buffer may be larger than the cdrom one. */
++      strlcpy(vcd->vcd_cdrom_info.name, gd->disk_name,
++              sizeof(vcd->vcd_cdrom_info.name));
++      /* Capabilities in this mask are hidden from CDROM_GET_CAPABILITY. */
++      vcd->vcd_cdrom_info.mask = (CDC_CD_RW | CDC_DVD_R | CDC_DVD_RAM |
++                      CDC_SELECT_DISC | CDC_SELECT_SPEED |
++                      CDC_MRW | CDC_MRW_W | CDC_RAM);
++
++      if (register_cdrom(&(vcd->vcd_cdrom_info)) != 0) {
++              pr_warn("%s() Cannot register blkdev as a cdrom %d!\n",
++                      __func__, gd->major);
++              goto err_out;
++      }
++      gd->fops = &xencdrom_bdops;
++      vcd->vcd_cdrom_info.disk = gd;
++
++      spin_lock(&vcd_disks_lock);
++      list_add(&(vcd->vcd_entry), &vcd_disks);
++      spin_unlock(&vcd_disks_lock);
++out:
++      return;
++err_out:
++      kfree(vcd);
++}
++
++/*
++ * Undo register_vcd(): remove the disk's vcd_disk from vcd_disks,
++ * unregister it from the CD-ROM layer and free it.  Does nothing if the
++ * disk was never registered.
++ *
++ * unregister_cdrom() takes a mutex and may therefore sleep, so it must not
++ * be called with spinlocks held.  The entry is unlinked under
++ * vcd_disks_lock (so no new lookup can find it) and torn down after the
++ * lock has been dropped.
++ */
++void unregister_vcd(struct blkfront_info *info) {
++      struct gendisk *gd = info->gd;
++      struct vcd_disk *vcd;
++      struct vcd_disk *found = NULL;
++
++      spin_lock(&vcd_disks_lock);
++      list_for_each_entry(vcd, &vcd_disks, vcd_entry) {
++              if (vcd->vcd_cdrom_info.disk == gd) {
++                      list_del(&vcd->vcd_entry);
++                      found = vcd;
++                      break;
++              }
++      }
++      spin_unlock(&vcd_disks_lock);
++
++      if (!found)
++              return;
++
++      /*
++       * Wait for a holder that took the per-device lock before the entry
++       * was unlinked; nobody can newly acquire it through the list now.
++       */
++      spin_lock(&found->vcd_cdrom_info_lock);
++      spin_unlock(&found->vcd_cdrom_info_lock);
++
++      unregister_cdrom(&found->vcd_cdrom_info);
++      kfree(found);
++}
++
diff --cc drivers/xen/blktap/Makefile

index 0000000,0000000..b1e4a07

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap/Makefile
@@@ -1,0 -1,0 +1,5 @@@
++# Add the Xen public I/O interface headers to the include search path.
++LINUXINCLUDE += -I../xen/include/public/io
++
++# Build blktap.o when CONFIG_XEN_BLKDEV_TAP is enabled.
++obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
++
++# blktap.o is a composite object linked from the objects listed here.
++blktap-y := xenbus.o interface.o blocktap.o
diff --cc drivers/xen/blktap/blktap.c

index 0000000,0000000..04f12d9

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap/blktap.c
@@@ -1,0 -1,0 +1,1781 @@@
++/******************************************************************************
++ * drivers/xen/blktap/blktap.c
++ * 
++ * Back-end driver for user level virtual block devices. This portion of the
++ * driver exports a 'unified' block-device interface that can be accessed
++ * by any operating system that implements a compatible front end. Requests
++ * are remapped to a user-space memory region.
++ *
++ * Based on the blkback driver code.
++ * 
++ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
++ *
++ * Clean ups and fix ups:
++ *    Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/kthread.h>
++#include <linux/freezer.h>
++#include <linux/list.h>
++#include <asm/hypervisor.h>
++#include "common.h"
++#include <xen/balloon.h>
++#include <xen/driver_util.h>
++#include <xen/evtchn.h>
++#include <xen/gnttab.h>
++#include <linux/kernel.h>
++#include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/gfp.h>
++#include <linux/poll.h>
++#include <linux/delay.h>
++#include <linux/nsproxy.h>
++#include <asm/tlbflush.h>
++
++#define MAX_TAP_DEV 256     /* maximum number of tapdisk ring devices        */
++#define MAX_DEV_NAME 100    /* maximum length of a device name, e.g. blktap0 */
++
++/*
++ * The maximum number of requests that can be outstanding at any time
++ * is determined by 
++ *
++ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
++ *
++ * where mmap_alloc < MAX_DYNAMIC_MEM.
++ *
++ * TODO:
++ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
++ * sysfs.
++ */
++#define BLK_RING_SIZE         __CONST_RING_SIZE(blkif, PAGE_SIZE)
++#define MAX_DYNAMIC_MEM               BLK_RING_SIZE
++#define MAX_PENDING_REQS      BLK_RING_SIZE
++#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
++/*
++ * Virtual address of segment _seg of request _req in a mapping that begins
++ * at _start; each request owns BLKIF_MAX_SEGMENTS_PER_REQUEST pages.
++ * Every argument is parenthesised so that expressions can be passed safely.
++ */
++#define MMAP_VADDR(_start, _req,_seg)                                   \
++        ((_start) +                                                     \
++         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
++         ((_seg) * PAGE_SIZE))
++static int mmap_pages = MMAP_PAGES;
++
++#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
++                    * have a bunch of pages reserved for shared
++                    * memory rings.
++                    */
++
++/*
++ * Data struct handed back to userspace for tapdisk device to VBD mapping.
++ * Identifies a virtual block device by owning domain and bus id.
++ */
++typedef struct domid_translate {
++      unsigned short domid;
++      unsigned short busid;
++} domid_translate_t ;
++
++/* Same as domid_translate, but with a 32-bit bus id. */
++typedef struct domid_translate_ext {
++      unsigned short domid;
++      u32 busid;
++} domid_translate_ext_t ;
++
++/*
++ * Data struct associated with each of the tapdisk devices.
++ * One instance exists per allocated minor and is stored in tapfds[minor].
++ */
++typedef struct tap_blkif {
++      struct mm_struct *mm;         /*User address space                   */
++      unsigned long rings_vstart;   /*Start of the mmap()ed region
++                                      (vma->vm_start in blktap_mmap)       */
++      unsigned long user_vstart;    /*rings_vstart + RING_PAGES pages      */
++      unsigned long dev_inuse;      /*Bit 0 set while a process has the
++                                      device open (one at a time)          */
++      unsigned long dev_pending;    /*Handed out by get_next_free_dev(),
++                                      not yet opened                       */
++      unsigned long ring_ok;        /*make this ring->state                */
++      blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
++      wait_queue_head_t wait;       /*for poll                             */
++      unsigned long mode;           /*current switching mode               */
++      int minor;                    /*Minor number for tapdisk device      */
++      pid_t pid;                    /*tapdisk process id                   */
++      struct pid_namespace *pid_ns; /*... and its corresponding namespace  */
++      enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
++                                                shutdown                   */
++      struct idx_map {
++              u16 mem, req;
++      } *idx_map;                   /*Record the user ring id to kern
++                                      [req id, idx] tuple                  */
++      blkif_t *blkif;               /*Associate blkif with tapdev          */
++      struct domid_translate_ext trans; /*Translation from domid to bus.   */
++      struct vm_foreign_map foreign_map;    /*Mapping page */
++} tap_blkif_t;
++
++static struct tap_blkif *tapfds[MAX_TAP_DEV];
++static int blktap_next_minor;
++
++/*
++ * Run-time switchable: /sys/module/blktap/parameters/
++ *
++ * Both variables are unsigned int, so they are registered with the matching
++ * "uint" parameter type (module_param() type-checks the variable against
++ * the declared type).
++ */
++static unsigned int log_stats = 0;
++static unsigned int debug_lvl = 0;
++module_param(log_stats, uint, 0644);
++module_param(debug_lvl, uint, 0644);
++
++/*
++ * Each outstanding request that we've passed to the lower device layers has a 
++ * 'pending_req' allocated to it.
++ *
++ * blkif     - interface the request belongs to
++ * id        - 64-bit request identifier
++ * mem_idx   - which pending_reqs[]/foreign_pages[] pool the request is from
++ * nr_pages  - number of pages used by the request
++ * free_list - linkage for the pending_free list
++ */
++typedef struct {
++      blkif_t       *blkif;
++      u64            id;
++      unsigned short mem_idx;
++      unsigned short nr_pages;
++      struct list_head free_list;
++} pending_req_t;
++
++/* Request pools; RTN_PEND_IDX() turns a request pointer into a pool index. */
++static pending_req_t *pending_reqs[MAX_PENDING_REQS];
++static struct list_head pending_free;
++/* Also taken by get_next_free_dev() while it scans tapfds[]. */
++static DEFINE_SPINLOCK(pending_free_lock);
++static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
++static int alloc_pending_reqs;
++
++/*
++ * Index of req within pool idx, i.e. its distance in elements from the
++ * first entry of pending_reqs[idx].  req must point into that pool.
++ */
++static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) {
++      return (req - pending_reqs[idx]);
++}
++
++/*
++ * NOTE(review): pending_prod and pending_cons are not declared in the part
++ * of this file reviewed here; this macro may be an unused leftover --
++ * confirm before relying on it.
++ */
++#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
++
++#define BLKBACK_INVALID_HANDLE (~0)
++
++/* One page-pointer array per pool, indexed by the request's mem_idx. */
++static struct page **foreign_pages[MAX_DYNAMIC_MEM];
++
++/*
++ * Page backing segment sg_idx of request req_idx in pool mmap_idx.  Each
++ * request owns BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive slots.
++ */
++static inline struct page *idx_to_page(
++      unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
++{
++      struct page **pool = foreign_pages[mmap_idx];
++      unsigned int slot = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
++
++      return pool[slot];
++}
++
++/* Kernel virtual address of the page returned by idx_to_page(). */
++static inline unsigned long idx_to_kaddr(
++      unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
++{
++      struct page *pg = idx_to_page(mmap_idx, req_idx, sg_idx);
++      unsigned long pfn = page_to_pfn(pg);
++
++      return (unsigned long)pfn_to_kaddr(pfn);
++}
++
++/*
++ * Bookkeeping for the dynamically allocated request/page pools.
++ * NOTE(review): none of these is used in the part of the file reviewed
++ * here; presumably mmap_alloc counts the allocated pools (it is the factor
++ * in the outstanding-request formula near MAX_DYNAMIC_MEM) -- confirm.
++ */
++static unsigned short mmap_alloc = 0;
++static unsigned short mmap_lock = 0;
++static unsigned short mmap_inuse = 0;
++
++/******************************************************************
++ * GRANT HANDLES
++ */
++
++/* When using grant tables to map a frame for device access then the
++ * handle returned must be used to unmap the frame. This is needed to
++ * drop the ref count on the frame.
++ *
++ * Each mapped page has two handles: one for the kernel mapping and one
++ * for the user-space mapping.
++ */
++struct grant_handle_pair
++{
++        grant_handle_t kernel;
++        grant_handle_t user;
++};
++/* Marks a handle slot that currently holds no mapping. */
++#define INVALID_GRANT_HANDLE  0xFFFF
++
++/* Handle pairs for every page of every pool. */
++static struct grant_handle_pair 
++    pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
++/* Handle pair for segment _i of request _idx in pool _id. */
++#define pending_handle(_id, _idx, _i) \
++    (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
++    + (_i)])
++
++
++static int blktap_read_ufe_ring(tap_blkif_t *info); /* local prototype */
++
++/* Minor 0 is the control device; blktap_open() treats it specially. */
++#define BLKTAP_MINOR 0  /* /dev/xen/blktap has a dynamic major */
++#define BLKTAP_DEV_DIR  "/dev/xen"
++
++static int blktap_major;
++
++/* blktap IOCTLs: */
++#define BLKTAP_IOCTL_KICK_FE         1
++#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
++#define BLKTAP_IOCTL_SETMODE         3
++#define BLKTAP_IOCTL_SENDPID       4
++#define BLKTAP_IOCTL_NEWINTF       5
++#define BLKTAP_IOCTL_MINOR         6
++#define BLKTAP_IOCTL_MAJOR         7
++#define BLKTAP_QUERY_ALLOC_REQS      8
++#define BLKTAP_IOCTL_FREEINTF        9
++#define BLKTAP_IOCTL_NEWINTF_EXT     50
++#define BLKTAP_IOCTL_PRINT_IDXS      100  
++
++/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
++#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
++#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
++#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimplemented      */
++
++/* Both intercept bits set. */
++#define BLKTAP_MODE_INTERPOSE \
++           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
++
++
++/*
++ * Returns 1 if arg is one of the accepted switching modes (passthrough,
++ * intercept-FE or interpose) and 0 otherwise.  Intercept-BE on its own is
++ * not accepted.
++ */
++static inline int BLKTAP_MODE_VALID(unsigned long arg)
++{
++      switch (arg) {
++      case BLKTAP_MODE_PASSTHROUGH:
++      case BLKTAP_MODE_INTERCEPT_FE:
++      case BLKTAP_MODE_INTERPOSE:
++              return 1;
++      default:
++              return 0;
++      }
++}
++
++/* Requests passing through the tap to userspace are re-assigned an ID.
++ * We must record a mapping between the BE [IDX,ID] tuple and the userspace
++ * ring ID. 
++ */
++
++#define INVALID_MIDX 0xdead
++
++/*
++ * Linear search for the first unused entry of idx_map, i.e. the first one
++ * whose mem field is INVALID_MIDX.  Returns its index, or MAX_PENDING_REQS
++ * when every entry is in use.
++ *
++ * TODO: Convert to a free list
++ */
++static inline unsigned int GET_NEXT_REQ(const struct idx_map *idx_map)
++{
++      unsigned int slot = 0;
++
++      while (slot < MAX_PENDING_REQS && idx_map[slot].mem != INVALID_MIDX)
++              slot++;
++
++      return slot;
++}
++
++/*
++ * Convert a page offset within the data area of the mapping into the index
++ * of the user ring request that owns the page.
++ */
++static inline unsigned int OFFSET_TO_USR_IDX(unsigned long offset)
++{
++      return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
++}
++
++/*
++ * Convert a page offset within the data area of the mapping into the
++ * segment number inside its request.
++ */
++static inline unsigned int OFFSET_TO_SEG(unsigned long offset)
++{
++      return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
++}
++
++
++/*
++ * True when both the kernel and the user handle of the pair _g points to
++ * are invalid.  _g is parenthesised so that any pointer expression (for
++ * example "&array[i]" or "p + 1") can be passed.
++ */
++#define BLKTAP_INVALID_HANDLE(_g) \
++    ((((_g)->kernel) == INVALID_GRANT_HANDLE) &&  \
++     (((_g)->user) == INVALID_GRANT_HANDLE))
++
++/* Mark both handles of the pair _g points to as invalid. */
++#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
++    (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
++    } while(0)
++
++
++/*
++ * device_type.devnode callback: returns a newly allocated device node name
++ * of the form "xen/blktap<minor>", or NULL if the allocation fails.
++ * The mode argument is left untouched.
++ */
++static char *blktap_devnode(struct device *dev, mode_t *mode)
++{
++      return kasprintf(GFP_KERNEL, "xen/blktap%u", MINOR(dev->devt));
++}
++
++/* Device type passed to xen_class_device_create() for every tap device. */
++static struct device_type blktap_type = {
++      .devnode = blktap_devnode
++};
++
++/******************************************************************
++ * BLKTAP VM OPS
++ */
++
++/*
++ * vm_operations_struct.fault handler.  Pages of this mapping are never
++ * faulted in on demand, so every fault is answered with VM_FAULT_SIGBUS.
++ */
++static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++      /*
++       * if the page has not been mapped in by the driver then return
++       * VM_FAULT_SIGBUS to the domain.
++       */
++
++      return VM_FAULT_SIGBUS;
++}
++
++/*
++ * vm_operations_struct.zap_pte handler: tear down one user PTE of a blktap
++ * mapping and return the PTE value that was there.
++ *
++ * Addresses before the data area (the ring pages), and VMAs without a
++ * file, are handled by the generic xen_ptep_get_and_clear_full().  For a
++ * data page the function looks up which pending request/segment backs the
++ * address, drops the page's reserved flag and foreign_map entry, and
++ * unmaps the kernel and/or user grant mappings with a single hypercall.
++ *
++ * NOTE(review): usr_idx is derived from the faulting address and used to
++ * index idx_map without a range check, and the mem/req values read from it
++ * are not compared against INVALID_MIDX before use -- confirm that callers
++ * guarantee a valid entry.
++ */
++static pte_t blktap_clear_pte(struct vm_area_struct *vma,
++                            unsigned long uvaddr,
++                            pte_t *ptep, int is_fullmm)
++{
++      pte_t copy;
++      tap_blkif_t *info = NULL;
++      unsigned int seg, usr_idx, pending_idx, mmap_idx, count = 0;
++      unsigned long offset, uvstart = 0;
++      struct page *pg;
++      struct grant_handle_pair *khandle;
++      struct gnttab_unmap_grant_ref unmap[2];
++
++      /*
++       * If the address is before the start of the grant mapped region or
++       * if vm_file is NULL (meaning mmap failed and we have nothing to do),
++       * fall back to the generic PTE clear.
++       */
++      if (vma->vm_file != NULL) {
++              info = vma->vm_file->private_data;
++              uvstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
++      }
++      if (vma->vm_file == NULL || uvaddr < uvstart)
++              return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
++                                                 is_fullmm);
++
++      /* TODO Should these be changed to if statements? */
++      BUG_ON(!info);
++      BUG_ON(!info->idx_map);
++
++      /* Page index within the data area -> (user request, segment). */
++      offset = (uvaddr - uvstart) >> PAGE_SHIFT;
++      usr_idx = OFFSET_TO_USR_IDX(offset);
++      seg = OFFSET_TO_SEG(offset);
++
++      /* Translate the user ring index into the kernel-side tuple. */
++      pending_idx = info->idx_map[usr_idx].req;
++      mmap_idx = info->idx_map[usr_idx].mem;
++
++      pg = idx_to_page(mmap_idx, pending_idx, seg);
++      ClearPageReserved(pg);
++      info->foreign_map.map[offset + RING_PAGES] = NULL;
++
++      khandle = &pending_handle(mmap_idx, pending_idx, seg);
++
++      /* Queue the unmap of the kernel mapping, if there is one. */
++      if (khandle->kernel != INVALID_GRANT_HANDLE) {
++              unsigned long pfn = page_to_pfn(pg);
++
++              gnttab_set_unmap_op(&unmap[count],
++                                  (unsigned long)pfn_to_kaddr(pfn),
++                                  GNTMAP_host_map, khandle->kernel);
++              count++;
++
++              set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++      }
++
++      /*
++       * Queue the unmap of the user mapping.  A valid user handle is only
++       * expected without auto-translated physmap; otherwise the PTE is
++       * cleared the generic way.
++       */
++      if (khandle->user != INVALID_GRANT_HANDLE) {
++              BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++
++              copy = *ptep;
++              gnttab_set_unmap_op(&unmap[count], ptep_to_machine(ptep),
++                                  GNTMAP_host_map 
++                                  | GNTMAP_application_map 
++                                  | GNTMAP_contains_pte,
++                                  khandle->user);
++              count++;
++      } else {
++              BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
++
++              /* USING SHADOW PAGE TABLES. */
++              copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
++                                                 is_fullmm);
++      }
++
++      /* Invalidate the handles first, then issue the queued unmaps. */
++      if (count) {
++              BLKTAP_INVALIDATE_HANDLE(khandle);
++              if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++                                            unmap, count))
++                      BUG();
++      }
++
++      return copy;
++}
++
++/*
++ * vm_operations_struct.open handler: point vm_private_data at the
++ * foreign_map entry that corresponds to the first page of this VMA.
++ * VMAs without a backing file are left alone.
++ */
++static void blktap_vma_open(struct vm_area_struct *vma)
++{
++      tap_blkif_t *tap;
++      unsigned long first_page;
++
++      if (!vma->vm_file)
++              return;
++
++      tap = vma->vm_file->private_data;
++      first_page = (vma->vm_start - tap->rings_vstart) >> PAGE_SHIFT;
++      vma->vm_private_data = &tap->foreign_map.map[first_page];
++}
++
++/*
++ * vm_operations_struct.close handler.
++ *
++ * Tricky part: on a partial munmap, ->open() is only called for the split
++ * VMA that is about to be released (see split_vma() and do_munmap() in
++ * mm/mmap.c), so there is no other chance to fix up vm_private_data of the
++ * VMA that remains at the end.  That fix-up is done here instead: if the
++ * next VMA directly follows this one and belongs to the same blktap
++ * mapping (same vm_ops and file), its vm_private_data is recomputed from
++ * its own start address.
++ */
++static void blktap_vma_close(struct vm_area_struct *vma)
++{
++      tap_blkif_t *info;
++      struct vm_area_struct *next = vma->vm_next;
++
++      if (next == NULL ||
++          vma->vm_ops != next->vm_ops ||
++          vma->vm_end != next->vm_start ||
++          vma->vm_file == NULL ||
++          vma->vm_file != next->vm_file)
++              return;
++
++      info = vma->vm_file->private_data;
++      next->vm_private_data =
++              &info->foreign_map.map[(next->vm_start - info->rings_vstart) >> PAGE_SHIFT];
++}
++
++/*
++ * VM operations installed on every blktap mapping by blktap_mmap().
++ * Written with C99 designated initialisers, like blktap_fops, instead of
++ * the obsolete GNU "field:" syntax.
++ */
++static struct vm_operations_struct blktap_vm_ops = {
++      .fault   = blktap_fault,
++      .zap_pte = blktap_clear_pte,
++      .open    = blktap_vma_open,
++      .close   = blktap_vma_close,
++};
++
++/******************************************************************
++ * BLKTAP FILE OPS
++ */
++ 
++/*Function Declarations*/
++static tap_blkif_t *get_next_free_dev(void);
++static int blktap_open(struct inode *inode, struct file *filp);
++static int blktap_release(struct inode *inode, struct file *filp);
++static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
++static long blktap_ioctl(struct file *filp, unsigned int cmd,
++                       unsigned long arg);
++static unsigned int blktap_poll(struct file *file, poll_table *wait);
++
++/*
++ * File operations shared by the control device (minor BLKTAP_MINOR) and
++ * the per-tapdisk devices.  Seeking is not supported.
++ */
++static const struct file_operations blktap_fops = {
++      .owner   = THIS_MODULE,
++      .poll    = blktap_poll,
++      .unlocked_ioctl = blktap_ioctl,
++      .open    = blktap_open,
++      .release = blktap_release,
++      .llseek  = no_llseek,
++      .mmap    = blktap_mmap,
++};
++
++
++/*
++ * Find or create a tap device that can be handed to a new tapdisk.
++ *
++ * An existing device is reused when it is neither open (bit 0 of dev_inuse
++ * clear) nor already handed out (dev_pending == 0); it is marked pending
++ * before the lock is dropped.  Otherwise, if fewer than MAX_TAP_DEV minors
++ * are in use, a new device is allocated, published in tapfds[] and a class
++ * device is created for it.
++ *
++ * Returns the device, or NULL if none is free and none can be allocated.
++ */
++static tap_blkif_t *get_next_free_dev(void)
++{
++      tap_blkif_t *info;
++      int minor;
++
++      /*
++       * This is called only from the ioctl, which
++       * means we should always have interrupts enabled.
++       */
++      BUG_ON(irqs_disabled());
++
++      spin_lock_irq(&pending_free_lock);
++
++      /* tapfds[0] is always NULL */
++
++      for (minor = 1; minor < blktap_next_minor; minor++) {
++              info = tapfds[minor];
++              /*
++               * A NULL slot below blktap_next_minor is either a minor whose
++               * allocation failed earlier or one that another caller has
++               * reserved and is still setting up.  The two cases cannot be
++               * told apart here, so the slot is skipped.  It must not be
++               * dereferenced (the old code did, and crashed), and reusing
++               * it could hand the same minor to two callers.
++               */
++              if (!info)
++                      continue;
++              if (!test_bit(0, &info->dev_inuse) &&
++                  info->dev_pending == 0) {
++                      info->dev_pending = 1;
++                      goto found;
++              }
++      }
++      info = NULL;
++      minor = -1;
++
++      /*
++       * We didn't find free device. If we can still allocate
++       * more, then we grab the next device minor that is
++       * available.  This is done while we are still under
++       * the protection of the pending_free_lock.
++       */
++      if (blktap_next_minor < MAX_TAP_DEV)
++              minor = blktap_next_minor++;
++found:
++      spin_unlock_irq(&pending_free_lock);
++
++      if (!info && minor > 0) {
++              info = kzalloc(sizeof(*info), GFP_KERNEL);
++              if (unlikely(!info)) {
++                      /*
++                       * If we failed here, try to put back
++                       * the next minor number. But if one
++                       * was just taken, then we just lose this
++                       * minor.
++                       */
++                      spin_lock_irq(&pending_free_lock);
++                      if (blktap_next_minor == minor+1)
++                              blktap_next_minor--;
++                      spin_unlock_irq(&pending_free_lock);
++                      goto out;
++              }
++
++              info->minor = minor;
++              /*
++               * Make sure that we have a minor before others can
++               * see us.
++               */
++              wmb();
++              tapfds[minor] = info;
++
++              xen_class_device_create(&blktap_type, NULL,
++                                      MKDEV(blktap_major, minor),
++                                      NULL, "blktap%d", minor);
++      }
++
++out:
++      return info;
++}
++
++/*
++ * Find the tap device registered for (domid, xenbus_id), attach blkif to
++ * it and mark it RUNNING.  Returns the device's minor, or -1 when no
++ * device matches.
++ */
++int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) 
++{
++      int minor;
++
++      for (minor = 1; minor < blktap_next_minor; minor++) {
++              tap_blkif_t *tap = tapfds[minor];
++
++              if (!tap)
++                      continue;
++              if (tap->trans.domid != domid)
++                      continue;
++              if (tap->trans.busid != xenbus_id)
++                      continue;
++
++              tap->blkif = blkif;
++              tap->status = RUNNING;
++              return minor;
++      }
++
++      return -1;
++}
++
++/*
++ * Detach the back end from tap device idx.
++ *
++ * If the recorded tapdisk pid still resolves to a task, the device is
++ * marked CLEANSHUTDOWN.  In every case the blkif association is cleared
++ * and the reference on the user address space is dropped.  Out-of-range
++ * indices and unallocated devices are ignored.
++ */
++void signal_tapdisk(int idx) 
++{
++      tap_blkif_t *tap;
++      struct mm_struct *old_mm;
++
++      /*
++       * if the userland tools set things up wrong, this could be negative;
++       * just don't try to signal in this case
++       */
++      if (idx < 0 || idx >= MAX_TAP_DEV)
++              return;
++
++      tap = tapfds[idx];
++      if (tap == NULL)
++              return;
++
++      if (tap->pid > 0 &&
++          pid_task(find_pid_ns(tap->pid, tap->pid_ns), PIDTYPE_PID))
++              tap->status = CLEANSHUTDOWN;
++
++      tap->blkif = NULL;
++
++      /* Take the mm pointer atomically so it is only put once. */
++      old_mm = xchg(&tap->mm, NULL);
++      if (old_mm)
++              mmput(old_mm);
++}
++
++/*
++ * file_operations.open for the blktap character devices.
++ *
++ * Minor BLKTAP_MINOR is the control device and needs no per-open state.
++ * For a tap device the function claims exclusive use (bit 0 of dev_inuse),
++ * allocates the shared ring page and the idx_map table, initialises both
++ * and stores the device in filp->private_data.
++ *
++ * Returns 0 on success, -ENODEV for an unknown minor, -EBUSY if the device
++ * is already open and -ENOMEM if an allocation fails.  On -ENOMEM every
++ * resource acquired here is released again, including the dev_inuse claim,
++ * because ->release() is not called for a failed open.
++ */
++static int blktap_open(struct inode *inode, struct file *filp)
++{
++      blkif_sring_t *sring;
++      int idx = iminor(inode) - BLKTAP_MINOR;
++      tap_blkif_t *info;
++      int i;
++
++      nonseekable_open(inode, filp);
++
++      /* ctrl device, treat differently */
++      if (!idx)
++              return 0;
++      if (idx < 0 || idx >= MAX_TAP_DEV) {
++              WPRINTK("No device /dev/xen/blktap%d\n", idx);
++              return -ENODEV;
++      }
++
++      info = tapfds[idx];
++      if (!info) {
++              WPRINTK("Unable to open device /dev/xen/blktap%d\n",
++                      idx);
++              return -ENODEV;
++      }
++
++      DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
++
++      /*Only one process can access device at a time*/
++      if (test_and_set_bit(0, &info->dev_inuse))
++              return -EBUSY;
++
++      info->dev_pending = 0;
++
++      /* Allocate the fe ring. */
++      sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
++      if (sring == NULL)
++              goto fail_inuse;
++
++      /* Allocate the index map before anything is published in info. */
++      info->idx_map = kmalloc(sizeof(*info->idx_map) * MAX_PENDING_REQS,
++                              GFP_KERNEL);
++      if (info->idx_map == NULL)
++              goto fail_sring;
++
++      SetPageReserved(virt_to_page(sring));
++
++      SHARED_RING_INIT(sring);
++      FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
++
++      filp->private_data = info;
++      info->mm = NULL;
++
++      /* idx is known to be > 0 here: 0 and negatives returned above. */
++      init_waitqueue_head(&info->wait);
++      for (i = 0; i < MAX_PENDING_REQS; i++) {
++              info->idx_map[i].mem = INVALID_MIDX;
++              info->idx_map[i].req = ~0;
++      }
++
++      DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
++      return 0;
++
++ fail_sring:
++      free_page((unsigned long)sring);
++ fail_inuse:
++      /* Give up the claim, or the device would stay -EBUSY forever. */
++      clear_bit(0, &info->dev_inuse);
++      return -ENOMEM;
++}
++
++/*
++ * file_operations.release for the blktap character devices.
++ *
++ * The control device has no private data and nothing to undo.  For a tap
++ * device this marks the ring unusable, drops the user address space
++ * reference, frees the foreign page map, the ring page and the index map,
++ * stops the back-end thread unless a clean shutdown was already recorded,
++ * and finally gives up the exclusive-open claim.  Always returns 0.
++ */
++static int blktap_release(struct inode *inode, struct file *filp)
++{
++      tap_blkif_t *tap = filp->private_data;
++      struct mm_struct *old_mm;
++
++      /* check for control device */
++      if (tap == NULL)
++              return 0;
++
++      tap->ring_ok = 0;
++      smp_wmb();
++
++      old_mm = xchg(&tap->mm, NULL);
++      if (old_mm != NULL)
++              mmput(old_mm);
++
++      kfree(tap->foreign_map.map);
++      tap->foreign_map.map = NULL;
++
++      /* Free the ring page. */
++      ClearPageReserved(virt_to_page(tap->ufe_ring.sring));
++      free_page((unsigned long) tap->ufe_ring.sring);
++
++      /* kfree() accepts NULL, so no guard is needed. */
++      kfree(tap->idx_map);
++      tap->idx_map = NULL;
++
++      if (tap->status != CLEANSHUTDOWN && tap->blkif != NULL) {
++              if (tap->blkif->xenblkd != NULL) {
++                      kthread_stop(tap->blkif->xenblkd);
++                      tap->blkif->xenblkd = NULL;
++              }
++              tap->status = CLEANSHUTDOWN;
++      }
++
++      clear_bit(0, &tap->dev_inuse);
++      DPRINTK("Freeing device [/dev/xen/blktap%d]\n",tap->minor);
++
++      return 0;
++}
++
++
++/* Note on mmap:
++ * We need to map pages to user space in a way that will allow the block
++ * subsystem set up direct IO to them.  This couldn't be done before, because
++ * there isn't really a sane way to translate a user virtual address down to a 
++ * physical address when the page belongs to another domain.
++ *
++ * My first approach was to map the page in to kernel memory, add an entry
++ * for it in the physical frame list (using alloc_lomem_region as in blkback)
++ * and then attempt to map that page up to user space.  This is disallowed
++ * by xen though, which realizes that we don't really own the machine frame
++ * underlying the physical page.
++ *
++ * The new approach is to provide explicit support for this in xen linux.
++ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
++ * mapped from other vms.  vma->vm_private_data is set up as a mapping 
++ * from pages to actual page structs.  There is a new clause in get_user_pages
++ * that does the right thing for this sort of mapping.
++ */
++/*
++ * mmap handler for a tap device.
++ *
++ * Maps the shared ring page at the start of the VMA and prepares the VMA to
++ * carry foreign pages (VM_FOREIGN, with vm_private_data pointing at
++ * info->foreign_map).  The VMA must span exactly mmap_pages + RING_PAGES
++ * pages.
++ *
++ * Returns 0 on success, -EAGAIN when the VMA has the wrong size, and -ENOMEM
++ * when the file has no tap state attached or when mapping the ring or
++ * allocating the foreign map fails.
++ *
++ * NOTE(review): nothing here releases a previously stored
++ * info->foreign_map.map or info->mm, so a second mmap on the same file would
++ * overwrite (and leak) both -- confirm callers map only once per open.
++ */
++static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
++{
++      int size;
++      tap_blkif_t *info = filp->private_data;
++      int ret;
++
++      if (info == NULL) {
++              WPRINTK("blktap: mmap, retrieving idx failed\n");
++              return -ENOMEM;
++      }
++      
++      /* Note: flags and vm_ops are set even if the size check below fails. */
++      vma->vm_flags |= VM_RESERVED;
++      vma->vm_ops = &blktap_vm_ops;
++
++      size = vma->vm_end - vma->vm_start;
++      if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
++              WPRINTK("you _must_ map exactly %d pages!\n",
++                     mmap_pages + RING_PAGES);
++              return -EAGAIN;
++      }
++
++      /* size is not read again after this conversion to pages. */
++      size >>= PAGE_SHIFT;
++      /* Ring pages come first; the data area follows RING_PAGES later. */
++      info->rings_vstart = vma->vm_start;
++      info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
++    
++      /* Map the ring pages to the start of the region and reserve it. */
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              ret = vm_insert_page(vma, vma->vm_start,
++                                   virt_to_page(info->ufe_ring.sring));
++      else
++              ret = remap_pfn_range(vma, vma->vm_start,
++                                    __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
++                                    PAGE_SIZE, vma->vm_page_prot);
++      if (ret) {
++              WPRINTK("Mapping user ring failed!\n");
++              goto fail;
++      }
++
++      /* Mark this VM as containing foreign pages, and set up mappings. */
++      /* One zeroed slot per page of the whole VMA, ring pages included. */
++      info->foreign_map.map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) *
++                          sizeof(*info->foreign_map.map), GFP_KERNEL);
++      if (info->foreign_map.map == NULL) {
++              WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
++              goto fail;
++      }
++
++      vma->vm_private_data = &info->foreign_map;
++      vma->vm_flags |= VM_FOREIGN;
++      vma->vm_flags |= VM_DONTCOPY;
++
++#ifdef CONFIG_X86
++      vma->vm_mm->context.has_foreign_mappings = 1;
++#endif
++
++      /*
++       * Take a reference on the caller's mm, then publish ring_ok only
++       * after the write barrier so a reader that sees ring_ok set (and
++       * issues the matching read barrier) also sees info->mm.
++       */
++      info->mm = get_task_mm(current);
++      smp_wmb();
++      info->ring_ok = 1;
++      return 0;
++ fail:
++      /* Clear any active mappings. */
++      zap_page_range(vma, vma->vm_start, 
++                     vma->vm_end - vma->vm_start, NULL);
++
++      /* Every failure reaching this label is reported as -ENOMEM. */
++      return -ENOMEM;
++}
++
++
++/*
++ * ioctl handler shared by the control device and the per-tap devices.
++ *
++ * filp->private_data is NULL for the control device and points at the
++ * tap_blkif_t for a tap device.  Most commands silently return 0 when they
++ * need tap state and none is attached.
++ *
++ * Returns a command-specific non-negative value on success (a minor number
++ * for NEWINTF/NEWINTF_EXT/MINOR, the major number for MAJOR, a percentage
++ * for QUERY_ALLOC_REQS), a negative value on failure, and -ENOIOCTLCMD for
++ * unknown commands.
++ */
++static long blktap_ioctl(struct file *filp, unsigned int cmd,
++                       unsigned long arg)
++{
++      tap_blkif_t *info = filp->private_data;
++
++      switch(cmd) {
++      case BLKTAP_IOCTL_KICK_FE: 
++      {
++              /* There are fe messages to process. */
++              return blktap_read_ufe_ring(info);
++      }
++      case BLKTAP_IOCTL_SETMODE:
++      {
++              /* An invalid mode is ignored; 0 is returned either way. */
++              if (info) {
++                      if (BLKTAP_MODE_VALID(arg)) {
++                              info->mode = arg;
++                              /* XXX: may need to flush rings here. */
++                              DPRINTK("blktap: set mode to %lx\n", 
++                                     arg);
++                              return 0;
++                      }
++              }
++              return 0;
++      }
++      case BLKTAP_IOCTL_PRINT_IDXS:
++        {
++              if (info) {
++                      pr_info("User Rings: \n-----------\n");
++                      pr_info("UF: rsp_cons: %2d, req_prod_prv: %2d "
++                              "| req_prod: %2d, rsp_prod: %2d\n",
++                              info->ufe_ring.rsp_cons,
++                              info->ufe_ring.req_prod_pvt,
++                              info->ufe_ring.sring->req_prod,
++                              info->ufe_ring.sring->rsp_prod);
++              }
++              return 0;
++        }
++      case BLKTAP_IOCTL_SENDPID:
++      {
++              /* Record the pid together with the caller's pid namespace. */
++              if (info) {
++                      info->pid = (pid_t)arg;
++                      info->pid_ns = current->nsproxy->pid_ns;
++                      DPRINTK("blktap: pid received %p:%d\n",
++                              info->pid_ns, info->pid);
++              }
++              return 0;
++      }
++      case BLKTAP_IOCTL_NEWINTF:
++      {               
++              /* domid and busid are packed into the ioctl argument itself. */
++              uint64_t val = (uint64_t)arg;
++              domid_translate_t *tr = (domid_translate_t *)&val;
++
++              DPRINTK("NEWINTF Req for domid %d and bus id %d\n", 
++                     tr->domid, tr->busid);
++              info = get_next_free_dev();
++              if (!info) {
++                      WPRINTK("Error initialising /dev/xen/blktap - "
++                              "No more devices\n");
++                      return -1;
++              }
++              info->trans.domid = tr->domid;
++              info->trans.busid = tr->busid;
++              return info->minor;
++      }
++      case BLKTAP_IOCTL_NEWINTF_EXT:
++      {
++              /* Extended variant: the translation is read from user memory. */
++              void __user *udata = (void __user *) arg;
++              domid_translate_ext_t tr;
++
++              if (copy_from_user(&tr, udata, sizeof(domid_translate_ext_t)))
++                      return -EFAULT;
++
++              DPRINTK("NEWINTF_EXT Req for domid %d and bus id %d\n", 
++                     tr.domid, tr.busid);
++              info = get_next_free_dev();
++              if (!info) {
++                      WPRINTK("Error initialising /dev/xen/blktap - "
++                              "No more devices\n");
++                      return -1;
++              }
++              info->trans.domid = tr.domid;
++              info->trans.busid = tr.busid;
++              return info->minor;
++      }
++      case BLKTAP_IOCTL_FREEINTF:
++      {
++              unsigned long dev = arg;
++              unsigned long flags;
++
++              /* Only accepted on the control device (no tap state). */
++              if (info || dev >= MAX_TAP_DEV)
++                      return -EINVAL;
++
++              info = tapfds[dev];
++              if (!info)
++                      return 0; /* should this be an error? */
++
++              spin_lock_irqsave(&pending_free_lock, flags);
++              if (info->dev_pending)
++                      info->dev_pending = 0;
++              spin_unlock_irqrestore(&pending_free_lock, flags);
++
++              return 0;
++      }
++      case BLKTAP_IOCTL_MINOR:
++              /* On the control device, arg selects which tap to report. */
++              if (!info) {
++                      unsigned long dev = arg;
++
++                      if (dev >= MAX_TAP_DEV)
++                              return -EINVAL;
++
++                      info = tapfds[dev];
++                      if (!info)
++                              return -EINVAL;
++              }
++
++              return info->minor;
++
++      case BLKTAP_IOCTL_MAJOR:
++              return blktap_major;
++
++      case BLKTAP_QUERY_ALLOC_REQS:
++              WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%lu\n",
++                      alloc_pending_reqs, MAX_PENDING_REQS);
++              /*
++               * Multiply before dividing: dividing first truncates to 0
++               * for every value below MAX_PENDING_REQS, so the percentage
++               * was reported as 0 for any partial usage.
++               */
++              return (alloc_pending_reqs * 100) / MAX_PENDING_REQS;
++      }
++      return -ENOIOCTLCMD;
++}
++
++/*
++ * poll handler for a tap device.
++ *
++ * Registers the caller on info->wait and reports the device readable when
++ * the private request producer index differs from the one in the shared
++ * ring; in that case the pending requests are pushed to the shared ring
++ * first.  The control device (no tap state) never reports any event.
++ */
++static unsigned int blktap_poll(struct file *filp, poll_table *wait)
++{
++      tap_blkif_t *info = filp->private_data;
++      unsigned int mask = 0;
++
++      if (info != NULL) {
++              poll_wait(filp, &info->wait, wait);
++              if (info->ufe_ring.req_prod_pvt !=
++                  info->ufe_ring.sring->req_prod) {
++                      RING_PUSH_REQUESTS(&info->ufe_ring);
++                      mask = POLLIN | POLLRDNORM;
++              }
++      }
++
++      return mask;
++}
++
++/*
++ * Wake any task sleeping on the wait queue of tap device @idx.
++ * Out-of-range indices and unused slots are ignored.
++ */
++static void blktap_kick_user(int idx)
++{
++      tap_blkif_t *info;
++
++      if (idx >= 0 && idx < MAX_TAP_DEV) {
++              info = tapfds[idx];
++              if (info != NULL)
++                      wake_up_interruptible(&info->wait);
++      }
++}
++
++static int do_block_io_op(blkif_t *blkif);
++static void dispatch_rw_block_io(blkif_t *blkif,
++                               blkif_request_t *req,
++                               pending_req_t *pending_req);
++static void make_response(blkif_t *blkif, u64 id,
++                          unsigned short op, int st);
++
++/******************************************************************
++ * misc small helpers
++ */
++/*
++ * Grow the request pool by one slot: allocate MAX_PENDING_REQS pending
++ * requests plus mmap_pages empty pages at index mmap_alloc, put the new
++ * requests on the free list, invalidate their grant handles and bump
++ * mmap_alloc.
++ *
++ * Returns 0 on success, -EINVAL when no further slot may be added (limit
++ * reached or mmap_lock set), and -ENOMEM when either allocation fails.  On
++ * failure both table entries for the slot are left NULL.
++ */
++static int req_increase(void)
++{
++      int i, j;
++
++      if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) 
++              return -EINVAL;
++
++      pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t)
++                                          * MAX_PENDING_REQS, GFP_KERNEL);
++      foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
++
++      if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
++              goto out_of_memory;
++
++      DPRINTK("%s: reqs=%lu, pages=%d\n",
++              __FUNCTION__, MAX_PENDING_REQS, mmap_pages);
++
++      for (i = 0; i < MAX_PENDING_REQS; i++) {
++              list_add_tail(&pending_reqs[mmap_alloc][i].free_list, 
++                            &pending_free);
++              pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
++              for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
++                      BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, 
++                                                               i, j));
++      }
++
++      mmap_alloc++;
++      DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
++      return 0;
++
++ out_of_memory:
++      /*
++       * Release whichever half was allocated and clear both table entries
++       * so no stale pointer to freed memory is left behind.
++       */
++      free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
++      foreign_pages[mmap_alloc] = NULL;
++      kfree(pending_reqs[mmap_alloc]);
++      pending_reqs[mmap_alloc] = NULL;
++      WPRINTK("%s: out of memory\n", __FUNCTION__);
++      return -ENOMEM;
++}
++
++/*
++ * Tear down request-pool slot @mmap: free its pending requests and its
++ * empty pages, clear both table entries, release mmap_lock and shrink
++ * mmap_alloc by one.
++ *
++ * Must be called with pending_free_lock held.
++ */
++static void mmap_req_del(int mmap)
++{
++      assert_spin_locked(&pending_free_lock);
++
++      kfree(pending_reqs[mmap]);
++      pending_reqs[mmap] = NULL;
++
++      /*
++       * Free the pages of the slot being deleted.  Indexing with
++       * mmap_alloc here would release a different slot and leak this one,
++       * since the caller passes mmap_alloc - 1.
++       */
++      free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
++      foreign_pages[mmap] = NULL;
++
++      mmap_lock = 0;
++      DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
++      mmap_alloc--;
++}
++
++/*
++ * Take one request off the global free list.
++ *
++ * Returns the request, or NULL when the free list is empty.  The
++ * alloc_pending_reqs counter is bumped for every request handed out.
++ */
++static pending_req_t* alloc_req(void)
++{
++      pending_req_t *taken = NULL;
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&pending_free_lock, irqflags);
++      if (!list_empty(&pending_free)) {
++              taken = list_entry(pending_free.next, pending_req_t,
++                                 free_list);
++              list_del(&taken->free_list);
++              if (taken)
++                      alloc_pending_reqs++;
++      }
++      spin_unlock_irqrestore(&pending_free_lock, irqflags);
++
++      return taken;
++}
++
++/*
++ * Return @req to the pool.
++ *
++ * While mmap_lock is set, requests belonging to the last pool slot are not
++ * put back on the free list; instead mmap_inuse is decremented and the slot
++ * is deleted once it reaches zero.  Otherwise the request goes back on the
++ * free list, and waiters on pending_free_wq are woken if the list had been
++ * empty.
++ */
++static void free_req(pending_req_t *req)
++{
++      unsigned long irqflags;
++      int list_was_empty = 0;
++
++      spin_lock_irqsave(&pending_free_lock, irqflags);
++
++      alloc_pending_reqs--;
++      if (mmap_lock && (req->mem_idx == mmap_alloc-1)) {
++              /* Slot is being retired: drop the request for good. */
++              mmap_inuse--;
++              if (mmap_inuse == 0)
++                      mmap_req_del(mmap_alloc-1);
++      } else {
++              list_was_empty = list_empty(&pending_free);
++              list_add(&req->free_list, &pending_free);
++      }
++
++      spin_unlock_irqrestore(&pending_free_lock, irqflags);
++
++      /* Wake up outside the lock, and only on the empty -> non-empty edge. */
++      if (list_was_empty)
++              wake_up(&pending_free_wq);
++}
++
++/*
++ * Zap the user mappings of @nr_pages pages starting at @uvaddr in @mm,
++ * walking every VMA that overlaps the range.  Parts of the range that are
++ * not covered by any VMA are skipped.
++ */
++static void blktap_zap_page_range(struct mm_struct *mm,
++                                unsigned long uvaddr, int nr_pages)
++{
++      unsigned long end = uvaddr + (nr_pages << PAGE_SHIFT);
++      struct vm_area_struct *vma;
++
++      vma = find_vma(mm, uvaddr);
++      while (vma && uvaddr < end) {
++              unsigned long s = max(uvaddr, vma->vm_start);
++              unsigned long e = min(end, vma->vm_end);
++
++              /*
++               * find_vma() may return a VMA that starts at or beyond the
++               * end of the range (e.g. the range was already unmapped).
++               * Then e - s would underflow; nothing is left to zap.
++               */
++              if (s >= e)
++                      break;
++
++              zap_page_range(vma, s, e - s, NULL);
++
++              uvaddr = e;
++              vma = vma->vm_next;
++      }
++}
++
++/*
++ * Undo the grant mappings of request @req.
++ *
++ * @k_idx is the request's index inside its pool slot (used for the kernel
++ * side handles/addresses), @u_idx its index in the user mapping area and
++ * @tapidx the tap device the request belongs to.
++ *
++ * With auto-translated physmap and a live mm, only the user range is zapped
++ * and the function returns without issuing any grant unmap operation.
++ * Otherwise one unmap op is queued per valid kernel handle and one per valid
++ * user handle (hence the 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST array), all
++ * handles are invalidated, the batch is submitted in a single hypercall and
++ * the user range is zapped.
++ */
++static void fast_flush_area(pending_req_t *req, unsigned int k_idx,
++                           unsigned int u_idx, int tapidx)
++{
++      struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
++      unsigned int i, mmap_idx, invcount = 0, locked = 0;
++      struct grant_handle_pair *khandle;
++      uint64_t ptep;
++      int ret;
++      unsigned long uvaddr;
++      tap_blkif_t *info;
++      struct mm_struct *mm;
++      
++
++      if ((tapidx < 0) || (tapidx >= MAX_TAP_DEV)
++          || !(info = tapfds[tapidx])) {
++              WPRINTK("fast_flush: Couldn't get info!\n");
++              return;
++      }
++
++      /* May be NULL: release and the block thread both clear info->mm. */
++      mm = info->mm;
++
++      if (mm != NULL && xen_feature(XENFEAT_auto_translated_physmap)) {
++              down_write(&mm->mmap_sem);
++              blktap_zap_page_range(mm,
++                                    MMAP_VADDR(info->user_vstart, u_idx, 0),
++                                    req->nr_pages);
++              up_write(&mm->mmap_sem);
++              return;
++      }
++
++      mmap_idx = req->mem_idx;
++
++      for (i = 0; i < req->nr_pages; i++) {
++              /* uvaddr is computed but not read again in this loop. */
++              uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
++
++              khandle = &pending_handle(mmap_idx, k_idx, i);
++
++              if (khandle->kernel != INVALID_GRANT_HANDLE) {
++                      gnttab_set_unmap_op(&unmap[invcount],
++                                          idx_to_kaddr(mmap_idx, k_idx, i),
++                                          GNTMAP_host_map, khandle->kernel);
++                      invcount++;
++
++                      set_phys_to_machine(
++                              page_to_pfn(idx_to_page(mmap_idx, k_idx, i)),
++                              INVALID_P2M_ENTRY);
++              }
++
++              if (mm != NULL && khandle->user != INVALID_GRANT_HANDLE) {
++                      BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++                      /* Take mmap_sem once, on the first user handle seen. */
++                      if (!locked++)
++                              down_write(&mm->mmap_sem);
++                      /*
++                       * NOTE(review): this early return drops the lock but
++                       * skips the hypercall for ops already queued in
++                       * unmap[] -- confirm that is intended.
++                       */
++                      if (create_lookup_pte_addr(
++                              mm,
++                              MMAP_VADDR(info->user_vstart, u_idx, i),
++                              &ptep) !=0) {
++                              up_write(&mm->mmap_sem);
++                              WPRINTK("Couldn't get a pte addr!\n");
++                              return;
++                      }
++
++                      gnttab_set_unmap_op(&unmap[invcount], ptep,
++                                          GNTMAP_host_map
++                                          | GNTMAP_application_map
++                                          | GNTMAP_contains_pte,
++                                          khandle->user);
++                      invcount++;
++              }
++
++              BLKTAP_INVALIDATE_HANDLE(khandle);
++      }
++      /* Submit all queued unmap operations in one batch. */
++      ret = HYPERVISOR_grant_table_op(
++              GNTTABOP_unmap_grant_ref, unmap, invcount);
++      BUG_ON(ret);
++      
++      if (mm != NULL && !xen_feature(XENFEAT_auto_translated_physmap)) {
++              if (!locked++)
++                      down_write(&mm->mmap_sem);
++              blktap_zap_page_range(mm, 
++                                    MMAP_VADDR(info->user_vstart, u_idx, 0), 
++                                    req->nr_pages);
++      }
++
++      /* locked is non-zero only if mmap_sem was taken above. */
++      if (locked)
++              up_write(&mm->mmap_sem);
++}
++
++/******************************************************************
++ * SCHEDULER FUNCTIONS
++ */
++
++/*
++ * Log the per-interface request counters of @blkif, schedule the next
++ * report ten seconds from now and reset the counters.
++ */
++static void print_stats(blkif_t *blkif)
++{
++      printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d |  pk %4d\n",
++             current->comm, blkif->st_oo_req,
++             blkif->st_rd_req, blkif->st_wr_req, blkif->st_pk_req);
++
++      /* Next report is due in 10 s. */
++      blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
++
++      /* Start a fresh accounting interval. */
++      blkif->st_oo_req = 0;
++      blkif->st_rd_req = 0;
++      blkif->st_wr_req = 0;
++      blkif->st_pk_req = 0;
++}
++
++/*
++ * Main loop of the per-interface block thread.
++ *
++ * @arg is the blkif_t being served.  The thread sleeps until the interface
++ * has waiting requests and the free request list is non-empty (or it is
++ * asked to stop), then processes the guest ring via do_block_io_op().  On
++ * exit it clears blkif->xenblkd and drops the mm reference held by the
++ * matching tap device, if any.  Always returns 0.
++ */
++int tap_blkif_schedule(void *arg)
++{
++      blkif_t *blkif = arg;
++      tap_blkif_t *info = NULL;
++
++      blkif_get(blkif);
++
++      if (debug_lvl)
++              printk(KERN_DEBUG "%s: started\n", current->comm);
++
++      while (!kthread_should_stop()) {
++              if (try_to_freeze())
++                      continue;
++
++              wait_event_interruptible(
++                      blkif->wq,
++                      blkif->waiting_reqs || kthread_should_stop());
++              wait_event_interruptible(
++                      pending_free_wq,
++                      !list_empty(&pending_free) || kthread_should_stop());
++
++              blkif->waiting_reqs = 0;
++              smp_mb(); /* clear flag *before* checking for work */
++
++              /* Non-zero means work is left over: retry without sleeping. */
++              if (do_block_io_op(blkif))
++                      blkif->waiting_reqs = 1;
++
++              if (log_stats && time_after(jiffies, blkif->st_print))
++                      print_stats(blkif);
++      }
++
++      if (log_stats)
++              print_stats(blkif);
++      if (debug_lvl)
++              printk(KERN_DEBUG "%s: exiting\n", current->comm);
++
++      blkif->xenblkd = NULL;
++      /*
++       * dev_num is range-checked everywhere else it indexes tapfds[];
++       * do the same here instead of reading outside the table.
++       */
++      if (blkif->dev_num >= 0 && blkif->dev_num < MAX_TAP_DEV)
++              info = tapfds[blkif->dev_num];
++      blkif_put(blkif);
++
++      if (info) {
++              struct mm_struct *mm = xchg(&info->mm, NULL);
++
++              if (mm)
++                      mmput(mm);
++      }
++
++      return 0;
++}
++
++/******************************************************************
++ * COMPLETION CALLBACK -- Called by user level ioctl()
++ */
++
++/*
++ * Consume the responses that user space has placed on the UFE ring of
++ * @info.
++ *
++ * For every response the matching pending request is looked up through
++ * info->idx_map, its pages are un-reserved and removed from the foreign
++ * map, its grant mappings are flushed, a response is sent back through
++ * make_response(), the blkif reference is dropped and the request is freed.
++ * Responses whose id or internal mapping is out of range are logged and
++ * skipped.
++ *
++ * Does nothing when @info is NULL or the device is not in
++ * BLKTAP_MODE_INTERCEPT_FE mode.  Always returns 0.
++ */
++static int blktap_read_ufe_ring(tap_blkif_t *info)
++{
++      /* This is called to read responses from the UFE ring. */
++      RING_IDX i, j, rp;
++      blkif_response_t *resp;
++      blkif_t *blkif=NULL;
++      unsigned int pending_idx, usr_idx, mmap_idx;
++      pending_req_t *pending_req;
++      
++      if (!info)
++              return 0;
++
++      /* We currently only forward packets in INTERCEPT_FE mode. */
++      if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
++              return 0;
++
++      /* for each outstanding message on the UFEring  */
++      /* Snapshot the producer index once; the barrier orders it before
++       * the reads of the response slots below. */
++      rp = info->ufe_ring.sring->rsp_prod;
++      rmb();
++        
++      for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
++              blkif_response_t res;
++              resp = RING_GET_RESPONSE(&info->ufe_ring, i);
++              /* Work on a private copy of the shared-ring entry. */
++              memcpy(&res, resp, sizeof(res));
++              mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
++              ++info->ufe_ring.rsp_cons;
++
++              /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
++              if (res.id >= MAX_PENDING_REQS) {
++                      WPRINTK("incorrect req map [%llx]\n",
++                              (unsigned long long)res.id);
++                      continue;
++              }
++
++              usr_idx = (unsigned int)res.id;
++              pending_idx = info->idx_map[usr_idx].req;
++              mmap_idx = info->idx_map[usr_idx].mem;
++
++              if (mmap_idx >= mmap_alloc ||
++                  pending_idx >= MAX_PENDING_REQS) {
++                      WPRINTK("incorrect req map [%d],"
++                              " internal map [%d,%d]\n",
++                              usr_idx, mmap_idx, pending_idx);
++                      continue;
++              }
++
++              pending_req = &pending_reqs[mmap_idx][pending_idx];
++              blkif = pending_req->blkif;
++
++              /* Drop each page of the request from the foreign map. */
++              for (j = 0; j < pending_req->nr_pages; j++) {
++
++                      unsigned long uvaddr;
++                      struct page *pg;
++                      int offset;
++
++                      uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
++
++                      pg = idx_to_page(mmap_idx, pending_idx, j);
++                      ClearPageReserved(pg);
++                      /* Map index is relative to the start of the ring area. */
++                      offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
++                      info->foreign_map.map[offset] = NULL;
++              }
++              fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
++              /* Presumably marks the idx_map slot as free again -- the
++               * slot allocator is not visible here; TODO confirm. */
++              info->idx_map[usr_idx].mem = INVALID_MIDX;
++              make_response(blkif, pending_req->id, res.operation,
++                            res.status);
++              blkif_put(pending_req->blkif);
++              free_req(pending_req);
++      }
++              
++      return 0;
++}
++
++
++/******************************************************************************
++ * NOTIFICATION FROM GUEST OS.
++ */
++
++/*
++ * Flag @blkif as having requests waiting and wake whoever sleeps on its
++ * wait queue.  The flag is set before the wake-up so the woken waiter's
++ * condition (blkif->waiting_reqs) is already true.
++ */
++static void blkif_notify_work(blkif_t *blkif)
++{
++      blkif->waiting_reqs = 1;
++      wake_up(&blkif->wq);
++}
++
++/*
++ * Interrupt handler for the backend: @dev_id is the blkif_t that was
++ * notified.  Only flags the interface as having work and wakes its
++ * waiters; always reports the interrupt as handled.
++ */
++irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
++{
++      blkif_t *blkif = dev_id;
++
++      blkif_notify_work(blkif);
++
++      return IRQ_HANDLED;
++}
++
++
++
++/******************************************************************
++ * DOWNWARD CALLS -- These interface with the block-device layer proper.
++ */
++/* Non-zero until the first "no user ring / no tap info" warning below has
++ * been printed, so those warnings are emitted only once. */
++static int print_dbug = 1;
++/*
++ * Pull requests off the guest-facing ring of @blkif and dispatch them.
++ *
++ * Each request is copied into a private blkif_request_t (converting from
++ * the 32/64-bit x86 layouts when needed) before it is inspected.  Reads,
++ * writes, write barriers and packet requests go to dispatch_rw_block_io();
++ * anything else is answered with BLKIF_RSP_ERROR.
++ *
++ * Returns non-zero when processing stopped early and should be retried
++ * (user ring full, request consumer overflow, thread asked to stop, or no
++ * free pending request), 0 otherwise -- including when no usable tap device
++ * is attached to @blkif.
++ */
++static int do_block_io_op(blkif_t *blkif)
++{
++      blkif_back_rings_t *blk_rings = &blkif->blk_rings;
++      blkif_request_t req;
++      pending_req_t *pending_req;
++      RING_IDX rc, rp;
++      int more_to_do = 0;
++      tap_blkif_t *info;
++
++      rc = blk_rings->common.req_cons;
++      rp = blk_rings->common.sring->req_prod;
++      rmb(); /* Ensure we see queued requests up to 'rp'. */
++
++      /*Check blkif has corresponding UE ring*/
++      if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV) {
++              /*oops*/
++              if (print_dbug) {
++                      WPRINTK("Corresponding UE " 
++                             "ring does not exist!\n");
++                      print_dbug = 0; /*We only print this message once*/
++              }
++              return 0;
++      }
++
++      info = tapfds[blkif->dev_num];
++
++      /* The tap device must exist and be marked in use (bit 0). */
++      if (!info || !test_bit(0, &info->dev_inuse)) {
++              if (print_dbug) {
++                      WPRINTK("Can't get UE info!\n");
++                      print_dbug = 0;
++              }
++              return 0;
++      }
++
++      while (rc != rp) {
++              
++              /* No room to forward the request to user space right now. */
++              if (RING_FULL(&info->ufe_ring)) {
++                      WPRINTK("RING_FULL! More to do\n");
++                      more_to_do = 1;
++                      break;
++              }
++
++              if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) {
++                      WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
++                             " More to do\n");
++                      more_to_do = 1;
++                      break;          
++              }
++
++              if (kthread_should_stop()) {
++                      more_to_do = 1;
++                      break;
++              }
++
++              pending_req = alloc_req();
++              if (NULL == pending_req) {
++                      /* Count the out-of-requests event and retry later. */
++                      blkif->st_oo_req++;
++                      more_to_do = 1;
++                      break;
++              }
++
++              /* Copy the request out of the shared ring in its wire format. */
++              switch (blkif->blk_protocol) {
++              case BLKIF_PROTOCOL_NATIVE:
++                      memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc),
++                             sizeof(req));
++                      break;
++              case BLKIF_PROTOCOL_X86_32:
++                      blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
++                      break;
++              case BLKIF_PROTOCOL_X86_64:
++                      blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
++                      break;
++              default:
++                      BUG();
++              }
++              blk_rings->common.req_cons = ++rc; /* before make_response() */
++
++              /* Apply all sanity checks to /private copy/ of request. */
++              barrier();
++
++              switch (req.operation) {
++              case BLKIF_OP_READ:
++                      blkif->st_rd_req++;
++                      dispatch_rw_block_io(blkif, &req, pending_req);
++                      break;
++
++              case BLKIF_OP_WRITE_BARRIER:
++                      /* TODO Some counter? */
++                      /* Fall through */
++              case BLKIF_OP_WRITE:
++                      blkif->st_wr_req++;
++                      dispatch_rw_block_io(blkif, &req, pending_req);
++                      break;
++
++              case BLKIF_OP_PACKET:
++                      blkif->st_pk_req++;
++                      dispatch_rw_block_io(blkif, &req, pending_req);
++                      break;
++
++              default:
++                      /* A good sign something is wrong: sleep for a while to
++                       * avoid excessive CPU consumption by a bad guest. */
++                      msleep(1);
++                      WPRINTK("unknown operation [%d]\n",
++                              req.operation);
++                      make_response(blkif, req.id, req.operation,
++                                    BLKIF_RSP_ERROR);
++                      /* Not dispatched, so hand the request back here. */
++                      free_req(pending_req);
++                      break;
++              }
++
++              /* Yield point for this unbounded loop. */
++              cond_resched();
++      }
++              
++      /* Wake user space so it can pick up whatever was queued. */
++      blktap_kick_user(blkif->dev_num);
++
++      return more_to_do;
++}
++
++static void dispatch_rw_block_io(blkif_t *blkif,
++                               blkif_request_t *req,
++                               pending_req_t *pending_req)
++{
++      struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
++      unsigned int nseg;
++      int ret, i, op, nr_sects = 0;
++      tap_blkif_t *info;
++      blkif_request_t *target;
++      unsigned int mmap_idx = pending_req->mem_idx;
++      unsigned int pending_idx = RTN_PEND_IDX(pending_req, mmap_idx);
++      unsigned int usr_idx;
++      uint32_t flags;
++      struct mm_struct *mm;
++      struct vm_area_struct *vma = NULL;
++
++      if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV)
++              goto fail_response;
++
++      info = tapfds[blkif->dev_num];
++      if (info == NULL)
++              goto fail_response;
++
++      /* Check we have space on user ring - should never fail. */
++      usr_idx = GET_NEXT_REQ(info->idx_map);
++      if (usr_idx >= MAX_PENDING_REQS) {
++              WARN_ON(1);
++              goto fail_response;
++      }
++
++      /* Check that number of segments is sane. */
++      nseg = req->nr_segments;
++      if (unlikely(nseg == 0 && req->operation != BLKIF_OP_WRITE_BARRIER) ||
++          unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
++              WPRINTK("Bad number of segments in request (%d)\n", nseg);
++              goto fail_response;
++      }
++      
++      /* Make sure userspace is ready. */
++      if (!info->ring_ok) {
++              WPRINTK("blktap: ring not ready for requests!\n");
++              goto fail_response;
++      }
++      smp_rmb();
++
++      if (RING_FULL(&info->ufe_ring)) {
++              WPRINTK("blktap: fe_ring is full, can't add "
++                      "IO Request will be dropped. %d %d\n",
++                      RING_SIZE(&info->ufe_ring),
++                      RING_SIZE(&blkif->blk_rings.common));
++              goto fail_response;
++      }
++
++      pending_req->blkif     = blkif;
++      pending_req->id        = req->id;
++      pending_req->nr_pages  = nseg;
++
++      flags = GNTMAP_host_map;
++      switch (req->operation) {
++      case BLKIF_OP_WRITE:
++      case BLKIF_OP_WRITE_BARRIER:
++              flags |= GNTMAP_readonly;
++              break;
++      }
++
++      op = 0;
++      mm = info->mm;
++      if (!xen_feature(XENFEAT_auto_translated_physmap))
++              down_write(&mm->mmap_sem);
++      for (i = 0; i < nseg; i++) {
++              unsigned long uvaddr;
++              unsigned long kvaddr;
++              uint64_t ptep;
++
++              uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
++              kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
++
++              gnttab_set_map_op(&map[op], kvaddr, flags,
++                                req->seg[i].gref, blkif->domid);
++              op++;
++
++              if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                      /* Now map it to user. */
++                      ret = create_lookup_pte_addr(mm, uvaddr, &ptep);
++                      if (ret) {
++                              up_write(&mm->mmap_sem);
++                              WPRINTK("Couldn't get a pte addr!\n");
++                              goto fail_flush;
++                      }
++
++                      gnttab_set_map_op(&map[op], ptep,
++                                        flags | GNTMAP_application_map
++                                              | GNTMAP_contains_pte,
++                                        req->seg[i].gref, blkif->domid);
++                      op++;
++              }
++
++              nr_sects += (req->seg[i].last_sect - 
++                           req->seg[i].first_sect + 1);
++      }
++
++      ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
++      BUG_ON(ret);
++
++      if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++              up_write(&mm->mmap_sem);
++
++              for (i = 0; i < (nseg*2); i+=2) {
++                      unsigned long uvaddr;
++                      unsigned long offset;
++                      struct page *pg;
++
++                      uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
++
++                      gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
++
++                      if (unlikely(map[i].status != GNTST_okay)) {
++                              WPRINTK("invalid kernel buffer -- could not remap it\n");
++                              ret = 1;
++                              map[i].handle = INVALID_GRANT_HANDLE;
++                      }
++
++                      if (unlikely(map[i+1].status != GNTST_okay)) {
++                              WPRINTK("invalid kernel buffer -- could not remap it\n");
++                              ret = 1;
++                              map[i+1].handle = INVALID_GRANT_HANDLE;
++                      }
++
++                      pending_handle(mmap_idx, pending_idx, i/2).kernel 
++                              = map[i].handle;
++                      pending_handle(mmap_idx, pending_idx, i/2).user   
++                              = map[i+1].handle;
++
++                      if (ret)
++                              continue;
++
++                      pg = idx_to_page(mmap_idx, pending_idx, i/2);
++                      set_phys_to_machine(page_to_pfn(pg),
++                                          FOREIGN_FRAME(map[i].dev_bus_addr
++                                                        >> PAGE_SHIFT));
++                      offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
++                      info->foreign_map.map[offset] = pg;
++              }
++      } else {
++              for (i = 0; i < nseg; i++) {
++                      unsigned long uvaddr;
++                      unsigned long offset;
++                      struct page *pg;
++
++                      uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
++
++                      gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
++
++                      if (unlikely(map[i].status != GNTST_okay)) {
++                              WPRINTK("invalid kernel buffer -- could not remap it\n");
++                              ret = 1;
++                              map[i].handle = INVALID_GRANT_HANDLE;
++                      }
++
++                      pending_handle(mmap_idx, pending_idx, i).kernel 
++                              = map[i].handle;
++
++                      if (ret)
++                              continue;
++
++                      offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
++                      pg = idx_to_page(mmap_idx, pending_idx, i);
++                      info->foreign_map.map[offset] = pg;
++              }
++      }
++
++      if (ret)
++              goto fail_flush;
++
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              down_write(&mm->mmap_sem);
++      /* Mark mapped pages as reserved: */
++      for (i = 0; i < req->nr_segments; i++) {
++              struct page *pg;
++
++              pg = idx_to_page(mmap_idx, pending_idx, i);
++              SetPageReserved(pg);
++              if (xen_feature(XENFEAT_auto_translated_physmap)) {
++                      unsigned long uvaddr = MMAP_VADDR(info->user_vstart,
++                                                        usr_idx, i);
++                      if (vma && uvaddr >= vma->vm_end) {
++                              vma = vma->vm_next;
++                              if (vma &&
++                                  (uvaddr < vma->vm_start ||
++                                   uvaddr >= vma->vm_end))
++                                      vma = NULL;
++                      }
++                      if (vma == NULL) {
++                              vma = find_vma(mm, uvaddr);
++                              /* this virtual area was already munmapped.
++                                 so skip to next page */
++                              if (!vma)
++                                      continue;
++                      }
++                      ret = vm_insert_page(vma, uvaddr, pg);
++                      if (ret) {
++                              up_write(&mm->mmap_sem);
++                              goto fail_flush;
++                      }
++              }
++      }
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              up_write(&mm->mmap_sem);
++      
++      /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
++      info->idx_map[usr_idx].mem = mmap_idx;
++      info->idx_map[usr_idx].req = pending_idx;
++
++      blkif_get(blkif);
++      /* Finally, write the request message to the user ring. */
++      target = RING_GET_REQUEST(&info->ufe_ring,
++                                info->ufe_ring.req_prod_pvt);
++      memcpy(target, req, sizeof(*req));
++      target->id = usr_idx;
++      wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
++      info->ufe_ring.req_prod_pvt++;
++
++      switch (req->operation) {
++      case BLKIF_OP_READ:
++              blkif->st_rd_sect += nr_sects;
++              break;
++      case BLKIF_OP_WRITE:
++      case BLKIF_OP_WRITE_BARRIER:
++              blkif->st_wr_sect += nr_sects;
++              break;
++      }
++
++      return;
++
++ fail_flush:
++      WPRINTK("Reached Fail_flush\n");
++      fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num);
++ fail_response:
++      make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
++      free_req(pending_req);
++      msleep(1); /* back off a bit */
++}
++
++
++
++/******************************************************************
++ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
++ */
++
++
++/*
++ * Post a response for request @id on the frontend's response ring.
++ *
++ * @blkif: interface whose ring receives the response
++ * @id:    request identifier echoed back to the frontend
++ * @op:    operation code echoed back to the frontend
++ * @st:    status value stored in the response
++ *
++ * The response is copied into the next private response slot of the ring
++ * selected by blkif->blk_protocol, pushed, and the frontend is notified over
++ * blkif->irq when the ring macros request it.  If more requests are waiting,
++ * blkif_notify_work() is called.  Ring updates happen under
++ * blkif->blk_ring_lock.
++ */
++static void make_response(blkif_t *blkif, u64 id,
++                          unsigned short op, int st)
++{
++      blkif_response_t  resp;
++      unsigned long     flags;
++      blkif_back_rings_t *blk_rings = &blkif->blk_rings;
++      int more_to_do = 0;
++      int notify;
++
++      /*
++       * The whole structure is memcpy()ed into memory shared with the
++       * frontend, so clear it first: any padding between or after the
++       * members must not carry stale bytes from this kernel stack.
++       */
++      memset(&resp, 0, sizeof(resp));
++      resp.id        = id;
++      resp.operation = op;
++      resp.status    = st;
++
++      spin_lock_irqsave(&blkif->blk_ring_lock, flags);
++      /* Place on the response ring for the relevant domain. */
++      switch (blkif->blk_protocol) {
++      case BLKIF_PROTOCOL_NATIVE:
++              memcpy(RING_GET_RESPONSE(&blk_rings->native,
++                                       blk_rings->native.rsp_prod_pvt),
++                     &resp, sizeof(resp));
++              break;
++      case BLKIF_PROTOCOL_X86_32:
++              memcpy(RING_GET_RESPONSE(&blk_rings->x86_32,
++                                       blk_rings->x86_32.rsp_prod_pvt),
++                     &resp, sizeof(resp));
++              break;
++      case BLKIF_PROTOCOL_X86_64:
++              memcpy(RING_GET_RESPONSE(&blk_rings->x86_64,
++                                       blk_rings->x86_64.rsp_prod_pvt),
++                     &resp, sizeof(resp));
++              break;
++      default:
++              BUG();
++      }
++      blk_rings->common.rsp_prod_pvt++;
++      RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
++
++      if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
++              /*
++               * Tail check for pending requests. Allows frontend to avoid
++               * notifications if requests are already in flight (lower
++               * overheads and promotes batching).
++               */
++              RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
++      } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
++              more_to_do = 1;
++      }
++
++      spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
++      /* Kick the worker and the frontend only after dropping the lock. */
++      if (more_to_do)
++              blkif_notify_work(blkif);
++      if (notify)
++              notify_remote_via_irq(blkif->irq);
++}
++
++/*
++ * Module initialisation for the blktap backend.
++ *
++ * Calls req_increase() up to twice, initialises the interface cache and the
++ * xenbus backend, registers the "blktap" character device region with a
++ * dynamically allocated major, and creates the blktap0 control device when
++ * the xen class is available.
++ *
++ * Returns 0 on success, -ENODEV when not running on Xen, or the error code
++ * reported by req_increase() / __register_chrdev().
++ */
++static int __init blkif_init(void)
++{
++      int i, ret;
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      INIT_LIST_HEAD(&pending_free);
++      /* Only a failure of the very first req_increase() call is fatal. */
++      for (i = 0; i < 2; i++) {
++              ret = req_increase();
++              if (ret)
++                      break;
++      }
++      if (i == 0)
++              return ret;
++
++      tap_blkif_interface_init();
++
++      alloc_pending_reqs = 0;
++
++      tap_blkif_xenbus_init();
++
++      /* Dynamically allocate a major for this device */
++      ret = __register_chrdev(0, 0, MAX_TAP_DEV, "blktap", &blktap_fops);
++
++      if (ret < 0) {
++              WPRINTK("Couldn't register /dev/xen/blktap\n");
++              /* Hand back the real cause rather than a blanket -ENOMEM. */
++              return ret;
++      }
++
++      blktap_major = ret;
++
++      /* tapfds[0] is always NULL */
++      blktap_next_minor++;
++
++      DPRINTK("Created misc_dev %d:0 [/dev/xen/blktap0]\n", ret);
++
++      /* Make sure the xen class exists */
++      if (get_xen_class()) {
++              /*
++               * This will allow udev to create the blktap ctrl device.
++               * We only want to create blktap0 first.  We don't want
++               * to flood the sysfs system with needless blktap devices.
++               * We only create the device when a request of a new device is
++               * made.
++               */
++              xen_class_device_create(&blktap_type, NULL,
++                                      MKDEV(blktap_major, 0), NULL,
++                                      "blktap0");
++      } else {
++              /* this is bad, but not fatal */
++              WPRINTK("blktap: sysfs xen_class not created\n");
++      }
++
++      DPRINTK("Blktap device successfully created\n");
++
++      return 0;
++}
++
++module_init(blkif_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
++MODULE_ALIAS("devname:xen/blktap0");
diff --cc drivers/xen/blktap/blocktap.c

index 0000000,0000000..31973c0

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap/blocktap.c
@@@ -1,0 -1,0 +1,1 @@@
++#include "blktap.c"
diff --cc drivers/xen/blktap/common.h

index 0000000,0000000..a4cc54e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap/common.h
@@@ -1,0 -1,0 +1,114 @@@
++/* 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __BLKIF__BACKEND__COMMON_H__
++#define __BLKIF__BACKEND__COMMON_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/blkdev.h>
++#include <asm/hypervisor.h>
++#include <xen/blkif.h>
++#include <xen/xenbus.h>
++#include <xen/interface/event_channel.h>
++
++/* Debug message via pr_debug(), prefixed with the emitting file and line. */
++#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
++                                    __FILE__ , __LINE__ , ## _a )
++
++/* Warning message via pr_warning(), prefixed with "blktap: ". */
++#define WPRINTK(fmt, args...) pr_warning("blktap: " fmt, ##args)
++
++struct backend_info;
++
++/*
++ * Per-interface state of one blktap backend instance, i.e. one connection
++ * between a frontend domain and this backend.
++ */
++typedef struct blkif_st {
++      /* Unique identifier for this interface. */
++      domid_t           domid;        /* frontend domain id */
++      unsigned int      handle;
++      /* Physical parameters of the comms window. */
++      /* irq bound to the frontend's event channel; 0 while unconnected */
++      unsigned int      irq;
++      /* Comms information. */
++      enum blkif_protocol blk_protocol;  /* ring ABI chosen in connect_ring() */
++      blkif_back_rings_t blk_rings;
++      /* vm area returned by xenbus_map_ring_valloc() for the shared ring */
++      struct vm_struct *blk_ring_area;
++      /* Back pointer to the backend_info. */
++      struct backend_info *be;
++      /* Private fields. */
++      spinlock_t       blk_ring_lock;   /* held while writing responses */
++      /* starts at 1; changed via blkif_get()/blkif_put() */
++      atomic_t         refcnt;
++
++      wait_queue_head_t   wq;
++      /* kthread running tap_blkif_schedule(); NULL when not started */
++      struct task_struct  *xenblkd;
++      unsigned int        waiting_reqs;
++      struct request_queue *plug;
++
++      /* statistics */
++      unsigned long       st_print;     /* set to jiffies at allocation */
++      int                 st_rd_req;
++      int                 st_wr_req;
++      int                 st_oo_req;
++      int                 st_pk_req;
++      int                 st_rd_sect;
++      int                 st_wr_sect;
++
++      /* woken by blkif_put() when refcnt drops to zero */
++      wait_queue_head_t waiting_to_free;
++
++      /* value returned by dom_to_devid(); also passed to signal_tapdisk() */
++      int             dev_num;
++      /* read from the "sectors" xenstore node; 0 until then */
++      uint64_t        sectors;
++} blkif_t;
++
++blkif_t *tap_alloc_blkif(domid_t domid);
++void tap_blkif_free(blkif_t *, struct xenbus_device *);
++void tap_blkif_kmem_cache_free(blkif_t *blkif);
++int tap_blkif_map(blkif_t *, struct xenbus_device *, grant_ref_t,
++                evtchn_port_t);
++
++/* Take a reference on a blkif_t. */
++#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
++/*
++ * Drop a reference on a blkif_t.  When the count reaches zero, wake anyone
++ * sleeping on waiting_to_free (tap_blkif_free() waits there).
++ */
++#define blkif_put(_b)                                 \
++      do {                                            \
++              if (atomic_dec_and_test(&(_b)->refcnt)) \
++                      wake_up(&(_b)->waiting_to_free);\
++      } while (0)
++
++
++/*
++ * Description of a request against a physical block device.
++ * NOTE(review): no user of this structure is visible in the blktap sources
++ * reviewed here -- confirm it is still needed.
++ */
++struct phys_req {
++      unsigned short       dev;
++      unsigned short       nr_sects;
++      struct block_device *bdev;
++      blkif_sector_t       sector_number;
++};
++
++void tap_blkif_interface_init(void);
++
++void tap_blkif_xenbus_init(void);
++
++irqreturn_t tap_blkif_be_int(int irq, void *dev_id);
++int tap_blkif_schedule(void *arg);
++
++int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
++void signal_tapdisk(int idx);
++
++#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --cc drivers/xen/blktap/interface.c

index 0000000,0000000..155e767

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap/interface.c
@@@ -1,0 -1,0 +1,140 @@@
++/******************************************************************************
++ * drivers/xen/blktap/interface.c
++ * 
++ * Block-device interface management.
++ * 
++ * Copyright (c) 2004, Keir Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++
++ */
++
++#include "common.h"
++#include <xen/evtchn.h>
++#include <linux/vmalloc.h>
++
++static struct kmem_cache *blkif_cachep;
++
++/*
++ * Allocate and initialise a zeroed blkif_t for frontend domain @domid.
++ *
++ * The new object starts with a reference count of 1, initialised lock and
++ * wait queues, and st_print set to the current jiffies.
++ *
++ * Returns the new blkif_t, or ERR_PTR(-ENOMEM) if the cache is unavailable
++ * or the allocation fails.
++ */
++blkif_t *tap_alloc_blkif(domid_t domid)
++{
++      blkif_t *blkif;
++
++      /*
++       * tap_blkif_interface_init() does not verify that the cache was
++       * created; allocating from a NULL cache would oops.
++       */
++      if (!blkif_cachep)
++              return ERR_PTR(-ENOMEM);
++
++      blkif = kmem_cache_zalloc(blkif_cachep, GFP_KERNEL);
++      if (!blkif)
++              return ERR_PTR(-ENOMEM);
++
++      blkif->domid = domid;
++      spin_lock_init(&blkif->blk_ring_lock);
++      atomic_set(&blkif->refcnt, 1);
++      init_waitqueue_head(&blkif->wq);
++      blkif->st_print = jiffies;
++      init_waitqueue_head(&blkif->waiting_to_free);
++
++      return blkif;
++}
++
++/*
++ * Map the frontend's shared ring page and bind its event channel.
++ *
++ * @blkif:    interface being connected
++ * @dev:      xenbus device used for mapping/unmapping the ring
++ * @ring_ref: grant reference of the shared ring
++ * @evtchn:   frontend event channel port
++ *
++ * Returns 0 on success (or if the interface already has an irq), otherwise
++ * the negative error from the mapping or the event channel binding.
++ */
++int tap_blkif_map(blkif_t *blkif, struct xenbus_device *dev,
++                grant_ref_t ring_ref, evtchn_port_t evtchn)
++{
++      struct vm_struct *ring_area;
++      int irq;
++
++      /* A non-zero irq means an earlier call already connected us. */
++      if (blkif->irq != 0)
++              return 0;
++
++      ring_area = xenbus_map_ring_valloc(dev, ring_ref);
++      if (IS_ERR(ring_area))
++              return PTR_ERR(ring_area);
++      blkif->blk_ring_area = ring_area;
++
++      /* Initialise the back ring for the ABI recorded in blk_protocol. */
++      switch (blkif->blk_protocol) {
++      case BLKIF_PROTOCOL_NATIVE: {
++              blkif_sring_t *native_ring = (blkif_sring_t *)ring_area->addr;
++
++              BACK_RING_INIT(&blkif->blk_rings.native, native_ring, PAGE_SIZE);
++              break;
++      }
++      case BLKIF_PROTOCOL_X86_32: {
++              blkif_x86_32_sring_t *ring32 =
++                      (blkif_x86_32_sring_t *)ring_area->addr;
++
++              BACK_RING_INIT(&blkif->blk_rings.x86_32, ring32, PAGE_SIZE);
++              break;
++      }
++      case BLKIF_PROTOCOL_X86_64: {
++              blkif_x86_64_sring_t *ring64 =
++                      (blkif_x86_64_sring_t *)ring_area->addr;
++
++              BACK_RING_INIT(&blkif->blk_rings.x86_64, ring64, PAGE_SIZE);
++              break;
++      }
++      default:
++              BUG();
++      }
++
++      irq = bind_interdomain_evtchn_to_irqhandler(
++              blkif->domid, evtchn, tap_blkif_be_int,
++              0, "blkif-backend", blkif);
++      if (irq >= 0) {
++              blkif->irq = irq;
++              return 0;
++      }
++
++      /* Binding failed: undo the mapping and mark the ring as absent. */
++      xenbus_unmap_ring_vfree(dev, ring_area);
++      blkif->blk_rings.common.sring = NULL;
++      return irq;
++}
++
++/*
++ * Quiesce an interface and release its event channel and ring mapping.
++ *
++ * Despite the name, the blkif_t itself is not freed here; see
++ * tap_blkif_kmem_cache_free().  The irq and ring are only released when
++ * present, so calling this again on an already torn-down interface skips
++ * those steps.
++ */
++void tap_blkif_free(blkif_t *blkif, struct xenbus_device *dev)
++{
++      /*
++       * Drop our own reference, sleep until every other holder has called
++       * blkif_put(), then take our reference back so the count is 1 again.
++       */
++      atomic_dec(&blkif->refcnt);
++      wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
++      atomic_inc(&blkif->refcnt);
++
++      if (blkif->irq) {
++              unbind_from_irqhandler(blkif->irq, blkif);
++              blkif->irq = 0;
++      }
++
++      /* A non-NULL sring marks a ring that is still mapped. */
++      if (blkif->blk_rings.common.sring) {
++              xenbus_unmap_ring_vfree(dev, blkif->blk_ring_area);
++              blkif->blk_rings.common.sring = NULL;
++      }
++}
++
++/*
++ * Drop the final reference on @blkif and return it to the slab cache.
++ * It is a bug to call this while any other reference is still held.
++ */
++void tap_blkif_kmem_cache_free(blkif_t *blkif)
++{
++      int was_last_ref = atomic_dec_and_test(&blkif->refcnt);
++
++      if (!was_last_ref)
++              BUG();
++
++      kmem_cache_free(blkif_cachep, blkif);
++}
++
++/*
++ * Create the slab cache from which blkif_t objects are allocated.
++ *
++ * The function cannot return an error, so a failed cache creation is at
++ * least reported instead of being silently ignored.
++ */
++void __init tap_blkif_interface_init(void)
++{
++      blkif_cachep = kmem_cache_create("blktapif_cache", sizeof(blkif_t),
++                                       0, 0, NULL);
++      if (!blkif_cachep)
++              WPRINTK("failed to create blktapif_cache\n");
++}
diff --cc drivers/xen/blktap/xenbus.c

index 0000000,0000000..38378f5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap/xenbus.c
@@@ -1,0 -1,0 +1,531 @@@
++/* drivers/xen/blktap/xenbus.c
++ *
++ * Xenbus code for blktap
++ *
++ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
++ *
++ * Based on the blkback xenbus code:
++ *
++ * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
++ * Copyright (C) 2005 XenSource Ltd
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include <xen/xenbus.h>
++#include "common.h"
++#include "../core/domctl.h"
++
++
++/* Per-device backend state, stored as the xenbus device's driver data. */
++struct backend_info
++{
++      struct xenbus_device *dev;      /* owning xenbus device */
++      blkif_t *blkif;                 /* interface state; NULL if not allocated */
++      /* watch on the device's "info" node, see tap_backend_changed() */
++      struct xenbus_watch backend_watch;
++      int xenbus_id;                  /* id parsed from dev->nodename by get_id() */
++      int group_added;                /* non-zero while the sysfs group exists */
++};
++
++static void connect(struct backend_info *);
++static int connect_ring(struct backend_info *);
++static int blktap_remove(struct xenbus_device *dev);
++static int blktap_probe(struct xenbus_device *dev,
++                       const struct xenbus_device_id *id);
++static void tap_backend_changed(struct xenbus_watch *, const char **,
++                          unsigned int);
++static void tap_frontend_changed(struct xenbus_device *dev,
++                           enum xenbus_state frontend_state);
++
++/*
++ * Return the index in @str of the occurrence of @c that follows @len earlier
++ * occurrences (so len == 0 finds the first one).  If the string ends after
++ * exactly @len occurrences, the string length is returned; if it ends
++ * sooner, -ERANGE is returned.
++ */
++static int strsep_len(const char *str, char c, unsigned int len)
++{
++        unsigned int pos = 0;
++
++        while (str[pos] != '\0') {
++                if (str[pos] == c) {
++                        if (len == 0)
++                                return pos;
++                        len--;
++                }
++                pos++;
++        }
++
++        if (len != 0)
++                return -ERANGE;
++        return pos;
++}
++
++/*
++ * Parse the decimal id that follows the third '/' in @str (used on a xenbus
++ * node name by blktap_probe()).
++ *
++ * The number is parsed in place rather than copied into a small fixed-size
++ * buffer, so an overlong id can no longer overrun the stack.
++ *
++ * Returns the parsed value, or -1 if @str has no third '/'.
++ */
++static long get_id(const char *str)
++{
++        int len, end;
++        const char *num;
++
++        len = strsep_len(str, '/', 2);
++        end = strlen(str);
++        /*
++         * strsep_len() returns the string length when the name ends right
++         * after the second '/'; there is no id to parse in that case.
++         */
++        if ((len < 0) || (len >= end))
++                return -1;
++
++        num = str + len + 1;
++        DPRINTK("Get_id called for %s (%s)\n", str, num);
++
++        return simple_strtol(num, NULL, 10);
++}
++
++/*
++ * Build the worker thread name "blktap.<domid>.<dev>" in @buf, which must
++ * hold at least TASK_COMM_LEN bytes.  <dev> is the device's "dev" xenstore
++ * entry with everything up to and including "/dev/" removed, if present.
++ *
++ * Returns 0 on success or the error from reading xenstore.
++ */
++static int blktap_name(blkif_t *blkif, char *buf)
++{
++      struct xenbus_device *dev = blkif->be->dev;
++      char *path;
++      char *short_name;
++
++      path = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
++      if (IS_ERR(path))
++              return PTR_ERR(path);
++
++      /* Skip past "/dev/" when the path contains it. */
++      short_name = strstr(path, "/dev/");
++      if (short_name == NULL)
++              short_name = path;
++      else
++              short_name += strlen("/dev/");
++
++      snprintf(buf, TASK_COMM_LEN, "blktap.%d.%s", blkif->domid, short_name);
++      kfree(path);
++
++      return 0;
++}
++
++/****************************************************************
++ *  sysfs interface for I/O requests of blktap device
++ */
++
++/*
++ * VBD_SHOW(name, format, args...) defines show_<name>(), a read-only sysfs
++ * show handler, together with its dev_attr_<name> device attribute.
++ *
++ * The handler holds a reference on the device for the duration of the call
++ * and prints @args with @format into the sysfs buffer.  @args may refer to
++ * the local "be" (the device's backend_info).  If the device has no driver
++ * data, or the reference cannot be taken, the handler returns -ENODEV.
++ */
++#define VBD_SHOW(name, format, args...)                                       \
++      static ssize_t show_##name(struct device *_dev,                 \
++                                 struct device_attribute *attr,       \
++                                 char *buf)                           \
++      {                                                               \
++              ssize_t ret = -ENODEV;                                  \
++              struct xenbus_device *dev;                              \
++              struct backend_info *be;                                \
++                                                                      \
++              if (!get_device(_dev))                                  \
++                      return ret;                                     \
++              dev = to_xenbus_device(_dev);                           \
++              if ((be = dev_get_drvdata(&dev->dev)) != NULL)          \
++                      ret = sprintf(buf, format, ##args);             \
++              put_device(_dev);                                       \
++              return ret;                                             \
++      }                                                               \
++      static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
++
++/* One attribute per blkif_t statistics counter. */
++VBD_SHOW(oo_req,  "%d\n", be->blkif->st_oo_req);
++VBD_SHOW(rd_req,  "%d\n", be->blkif->st_rd_req);
++VBD_SHOW(wr_req,  "%d\n", be->blkif->st_wr_req);
++VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
++VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
++
++/* NULL-terminated list of the statistics attributes defined by VBD_SHOW(). */
++static struct attribute *tapstat_attrs[] = {
++      &dev_attr_oo_req.attr,
++      &dev_attr_rd_req.attr,
++      &dev_attr_wr_req.attr,
++      &dev_attr_rd_sect.attr,
++      &dev_attr_wr_sect.attr,
++      NULL
++};
++
++/* Attribute group named "statistics", created under the device's kobject. */
++static struct attribute_group tapstat_group = {
++      .name = "statistics",
++      .attrs = tapstat_attrs,
++};
++
++/*
++ * Create the "statistics" sysfs group for @dev and record that fact in the
++ * device's backend_info.  Returns 0 or the sysfs_create_group() error.
++ */
++int xentap_sysfs_addif(struct xenbus_device *dev)
++{
++      struct backend_info *be = dev_get_drvdata(&dev->dev);
++      int rc = sysfs_create_group(&dev->dev.kobj, &tapstat_group);
++
++      if (rc)
++              return rc;
++
++      be->group_added = 1;
++      return 0;
++}
++
++/*
++ * Remove the "statistics" sysfs group of @dev and clear the group_added
++ * marker in the device's backend_info.
++ */
++void xentap_sysfs_delif(struct xenbus_device *dev)
++{
++      struct backend_info *info = dev_get_drvdata(&dev->dev);
++
++      sysfs_remove_group(&dev->dev.kobj, &tapstat_group);
++      info->group_added = 0;
++}
++
++/*
++ * Remove callback; also used by blktap_probe() to unwind a failed probe.
++ *
++ * Tears down, in order and only where present: the sysfs statistics group,
++ * the backend watch, and the blkif (worker thread, tapdisk signal, ring/irq,
++ * slab object).  Finally frees the backend_info and clears the driver data.
++ * Always returns 0.
++ */
++static int blktap_remove(struct xenbus_device *dev)
++{
++      struct backend_info *be = dev_get_drvdata(&dev->dev);
++      blkif_t *blkif;
++
++      if (be->group_added)
++              xentap_sysfs_delif(be->dev);
++
++      if (be->backend_watch.node != NULL) {
++              unregister_xenbus_watch(&be->backend_watch);
++              kfree(be->backend_watch.node);
++              be->backend_watch.node = NULL;
++      }
++
++      blkif = be->blkif;
++      if (blkif != NULL) {
++              /* Stop the worker before releasing what it uses. */
++              if (blkif->xenblkd)
++                      kthread_stop(blkif->xenblkd);
++              signal_tapdisk(blkif->dev_num);
++              tap_blkif_free(blkif, dev);
++              tap_blkif_kmem_cache_free(blkif);
++              be->blkif = NULL;
++      }
++
++      kfree(be);
++      dev_set_drvdata(&dev->dev, NULL);
++      return 0;
++}
++
++/*
++ * Bring the interface fully up once both sides are ready.
++ *
++ * Nothing happens until the ring is bound (irq set) and the sector count is
++ * known, or if the device is already Connected.  Otherwise the device is
++ * switched to Connected, the sysfs statistics group is created if missing,
++ * and the worker thread running tap_blkif_schedule() is started.
++ */
++static void tap_update_blkif_status(blkif_t *blkif)
++{
++      struct backend_info *be = blkif->be;
++      char name[TASK_COMM_LEN];
++      int err;
++
++      /* Not ready to connect? */
++      if (blkif->irq == 0 || blkif->sectors == 0)
++              return;
++
++      /* Already connected? */
++      if (be->dev->state == XenbusStateConnected)
++              return;
++
++      /* connect() reports its own errors; success shows in dev->state. */
++      connect(be);
++      if (be->dev->state != XenbusStateConnected)
++              return;
++
++      err = blktap_name(blkif, name);
++      if (err) {
++              xenbus_dev_error(be->dev, err, "get blktap dev name");
++              return;
++      }
++
++      if (!be->group_added) {
++              err = xentap_sysfs_addif(be->dev);
++              if (err) {
++                      xenbus_dev_fatal(be->dev, err,
++                                       "creating sysfs entries");
++                      return;
++              }
++      }
++
++      blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif, name);
++      if (!IS_ERR(blkif->xenblkd)) {
++              DPRINTK("Thread started for domid %d, connected disk %d\n",
++                      blkif->domid, blkif->dev_num);
++              return;
++      }
++
++      /* Thread creation failed: leave xenblkd NULL so nobody stops it. */
++      err = PTR_ERR(blkif->xenblkd);
++      blkif->xenblkd = NULL;
++      xenbus_dev_fatal(be->dev, err, "start xenblkd");
++      WPRINTK("Error starting thread %s\n", name);
++}
++
++/**
++ * Probe callback, invoked when a new tap backend device is created.
++ *
++ * Allocates the backend_info and its blkif, installs a watch on the
++ * device's "info" xenstore node (handled by tap_backend_changed()) and
++ * switches the device to InitWait.  Once the backend_info is attached to
++ * the device, any failure is unwound through blktap_remove().
++ */
++static int blktap_probe(struct xenbus_device *dev,
++                       const struct xenbus_device_id *id)
++{
++      struct backend_info *be;
++      blkif_t *blkif;
++      int err;
++
++      be = kzalloc(sizeof(struct backend_info), GFP_KERNEL);
++      if (be == NULL) {
++              xenbus_dev_fatal(dev, -ENOMEM,
++                               "allocating backend structure");
++              return -ENOMEM;
++      }
++
++      be->dev = dev;
++      dev_set_drvdata(&dev->dev, be);
++      be->xenbus_id = get_id(dev->nodename);
++
++      blkif = tap_alloc_blkif(dev->otherend_id);
++      if (IS_ERR(blkif)) {
++              /* be->blkif stays NULL (kzalloc), so remove skips it. */
++              err = PTR_ERR(blkif);
++              xenbus_dev_fatal(dev, err, "creating block interface");
++              goto fail;
++      }
++      be->blkif = blkif;
++
++      /* setup back pointer */
++      blkif->be = be;
++      blkif->sectors = 0;
++
++      /* set a watch on disk info, waiting for userspace to update details */
++      err = xenbus_watch_path2(dev, dev->nodename, "info",
++                               &be->backend_watch, tap_backend_changed);
++      if (!err)
++              err = xenbus_switch_state(dev, XenbusStateInitWait);
++      if (!err)
++              return 0;
++
++fail:
++      DPRINTK("blktap probe failed\n");
++      blktap_remove(dev);
++      return err;
++}
++
++
++/**
++ * Watch callback for the device's "info" xenstore node, i.e. received when
++ * the user space code has placed the device information in xenstore.
++ *
++ * Reads "info" and "sectors", associates the interface with a tap device
++ * via dom_to_devid() and then tries to complete the connection.
++ */
++static void tap_backend_changed(struct xenbus_watch *watch,
++                          const char **vec, unsigned int len)
++{
++      int err;
++      unsigned long info;
++      struct backend_info *be
++              = container_of(watch, struct backend_info, backend_watch);
++      struct xenbus_device *dev = be->dev;
++      
++      /*
++       * Check to see whether userspace code has opened the image
++       * and written sector and disk info to xenstore.  A missing "info"
++       * node is not an error: there is simply nothing to do yet.
++       */
++      err = xenbus_gather(XBT_NIL, dev->nodename, "info", "%lu", &info, 
++                          NULL);
++      if (XENBUS_EXIST_ERR(err))
++              return;
++      if (err) {
++              xenbus_dev_error(dev, err, "getting info");
++              return;
++      }
++
++      /* The value of "info" itself is only used for this debug message. */
++      DPRINTK("Userspace update on disk info, %lu\n",info);
++
++      /*
++       * NOTE(review): the result of this read is not checked; if it fails,
++       * blkif->sectors keeps its previous value -- confirm this is intended.
++       */
++      err = xenbus_gather(XBT_NIL, dev->nodename, "sectors", "%llu", 
++                          &be->blkif->sectors, NULL);
++
++      /* Associate tap dev with domid*/
++      be->blkif->dev_num = dom_to_devid(be->blkif->domid, be->xenbus_id, 
++                                        be->blkif);
++
++      tap_update_blkif_status(be->blkif);
++}
++
++
++/*
++ * Stop the worker thread (if one is running) and release the interface's
++ * irq and ring mapping.  Safe to call repeatedly.
++ */
++static void blkif_disconnect(blkif_t *blkif)
++{
++      struct task_struct *worker = blkif->xenblkd;
++
++      if (worker != NULL) {
++              kthread_stop(worker);
++              blkif->xenblkd = NULL;
++      }
++
++      /* idempotent */
++      tap_blkif_free(blkif, blkif->be->dev);
++}
++
++/**
++ * Callback received when the frontend's state changes.
++ *
++ * Drives the backend's own xenbus state in response: reconnect preparation
++ * on Initialising, ring connection on Initialised/Connected, disconnect on
++ * Closing, and device removal once the frontend is Closed (and the device
++ * is no longer online) or Unknown.
++ */
++static void tap_frontend_changed(struct xenbus_device *dev,
++                           enum xenbus_state frontend_state)
++{
++      struct backend_info *be = dev_get_drvdata(&dev->dev);
++      int err;
++
++      DPRINTK("fe_changed(%s,%d)\n", dev->nodename, frontend_state);
++
++      switch (frontend_state) {
++      case XenbusStateInitialising:
++              /* A frontend restarting against a closed backend. */
++              if (dev->state == XenbusStateClosed) {
++                      pr_info("%s: %s: prepare for reconnect\n",
++                              __FUNCTION__, dev->nodename);
++                      xenbus_switch_state(dev, XenbusStateInitWait);
++              }
++              break;
++
++      case XenbusStateInitialised:
++      case XenbusStateConnected:
++              /* Ensure we connect even when two watches fire in
++                 close succession and we miss the intermediate value
++                 of frontend_state. */
++              if (dev->state == XenbusStateConnected)
++                      break;
++
++              /* Enforce precondition before potential leak point.
++               * blkif_disconnect() is idempotent.
++               */
++              blkif_disconnect(be->blkif);
++
++              /* connect_ring() reports its own errors via xenbus. */
++              err = connect_ring(be);
++              if (err)
++                      break;
++              tap_update_blkif_status(be->blkif);
++              break;
++
++      case XenbusStateClosing:
++              blkif_disconnect(be->blkif);
++              xenbus_switch_state(dev, XenbusStateClosing);
++              break;
++
++      case XenbusStateClosed:
++              xenbus_switch_state(dev, XenbusStateClosed);
++              if (xenbus_dev_is_online(dev))
++                      break;
++              /* fall through if not online */
++      case XenbusStateUnknown:
++              /* Implies the effects of blkif_disconnect() via
++               * blktap_remove().
++               */
++              device_unregister(&dev->dev);
++              break;
++
++      default:
++              xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++                               frontend_state);
++              break;
++      }
++}
++
++
++/**
++ * Advertise barrier support and switch to Connected state.
++ *
++ * Writes "feature-barrier" = "1" under the device's xenstore node inside a
++ * transaction (restarted on -EAGAIN) and then switches the device to
++ * XenbusStateConnected.  Every failure is reported with xenbus_dev_fatal();
++ * callers detect success by checking dev->state afterwards.
++ */
++static void connect(struct backend_info *be)
++{
++      int err;
++
++      struct xenbus_device *dev = be->dev;
++      struct xenbus_transaction xbt;
++
++      /* Write feature-barrier to xenstore */
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "starting transaction");
++              return;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "feature-barrier",  "1");
++      if (err) {
++              xenbus_dev_fatal(dev, err, "writing feature-barrier");
++              xenbus_transaction_end(xbt, 1);
++              return;
++      }
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err == -EAGAIN)
++              goto again;
++      /* Any other commit failure used to be ignored silently. */
++      if (err) {
++              xenbus_dev_fatal(dev, err, "ending transaction");
++              return;
++      }
++
++      /* Switch state */
++      err = xenbus_switch_state(dev, XenbusStateConnected);
++      if (err)
++              xenbus_dev_fatal(dev, err, "switching to Connected state");
++
++      return;
++}
++
++
++static int connect_ring(struct backend_info *be)
++{
++      struct xenbus_device *dev = be->dev;
++      unsigned long ring_ref;
++      unsigned int evtchn;
++      char protocol[64];
++      int err;
++
++      DPRINTK("%s\n", dev->otherend);
++
++      err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", 
++                          &ring_ref, "event-channel", "%u", &evtchn, NULL);
++      if (err) {
++              xenbus_dev_fatal(dev, err,
++                               "reading %s/ring-ref and event-channel",
++                               dev->otherend);
++              return err;
++      }
++
++      be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++      err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
++                          "%63s", protocol, NULL);
++      if (err) {
++              strcpy(protocol, "unspecified");
++              be->blkif->blk_protocol = xen_guest_blkif_protocol(be->blkif->domid);
++      }
++      else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
++              be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++      else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
++              be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
++      else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
++              be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
++#if 1 /* maintain compatibility with early sles10-sp1 and paravirt netware betas */
++      else if (0 == strcmp(protocol, "1"))
++              be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
++      else if (0 == strcmp(protocol, "2"))
++              be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
++#endif
++      else {
++              xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
++              return -1;
++      }
++      pr_info("blktap: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
++              ring_ref, evtchn, be->blkif->blk_protocol, protocol);
++
++      /* Map the shared frame, irq etc. */
++      err = tap_blkif_map(be->blkif, dev, ring_ref, evtchn);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
++                               ring_ref, evtchn);
++              return err;
++      } 
++
++      return 0;
++}
++
++
++/* ** Driver Registration ** */
++
++
++/* Device types handled by this driver; the empty entry ends the table. */
++static const struct xenbus_device_id blktap_ids[] = {
++      { "tap" },
++      { "" }
++};
++
++
++/* xenbus driver description registered by tap_blkif_xenbus_init(). */
++static struct xenbus_driver blktap = {
++      .name = "tap",
++      .ids = blktap_ids,
++      .probe = blktap_probe,
++      .remove = blktap_remove,
++      .otherend_changed = tap_frontend_changed
++};
++
++
++/*
++ * Register the "tap" xenbus backend driver.  Registration failure is
++ * treated as a kernel bug.
++ */
++void tap_blkif_xenbus_init(void)
++{
++      int err = xenbus_register_backend(&blktap);
++
++      if (err)
++              BUG();
++}
diff --cc drivers/xen/blktap2-new/Makefile

index 0000000,0000000..e196534

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2-new/Makefile
@@@ -1,0 -1,0 +1,4 @@@
++# Build blktap2-new.o when CONFIG_XEN_BLKDEV_TAP2 is set.
++obj-$(CONFIG_XEN_BLKDEV_TAP2) := blktap2-new.o
++
++# Objects linked into blktap2-new; sysfs.o is added only with CONFIG_SYSFS.
++blktap2-new-y := control.o ring.o device.o request.o
++blktap2-new-$(CONFIG_SYSFS) += sysfs.o
diff --cc drivers/xen/blktap2-new/blktap.h

index 0000000,0000000..05d5bcb

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2-new/blktap.h
@@@ -1,0 -1,0 +1,218 @@@
++#ifndef _BLKTAP_H_
++#define _BLKTAP_H_
++
++#include <linux/mm.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/cdev.h>
++#include <linux/init.h>
++#include <linux/scatterlist.h>
++#include <xen/blkif.h>
++
++extern int blktap_debug_level;
++extern int blktap_ring_major;
++extern int blktap_device_major;
++
++/*
++ * Core logging helper: prints "<function>: <message>" with the given
++ * printk tag when blktap_debug_level is greater than 'level'. Unless
++ * 'force' is non-zero the message is also subject to
++ * printk_ratelimit(). 'level' and 'force' are parenthesized so that
++ * expression arguments cannot change the precedence of the test.
++ */
++#define BTPRINTK(level, tag, force, _f, _a...)                                \
++      do {                                                            \
++              if (blktap_debug_level > (level) &&                     \
++                  ((force) || printk_ratelimit()))                    \
++                      printk(tag "%s: " _f, __func__, ##_a);          \
++      } while (0)
++
++#define BTDBG(_f, _a...)             BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
++#define BTINFO(_f, _a...)            BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
++#define BTWARN(_f, _a...)            BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
++#define BTERR(_f, _a...)             BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
++
++#define BLKTAP2_DEV_DIR "xen/blktap-2/"
++
++#define BLKTAP_DEVICE                4
++#define BLKTAP_DEVICE_CLOSED         5
++#define BLKTAP_SHUTDOWN_REQUESTED    8
++
++/* blktap IOCTLs: */
++#define BLKTAP2_IOCTL_KICK_FE        1
++#define BLKTAP2_IOCTL_ALLOC_TAP      200
++#define BLKTAP2_IOCTL_FREE_TAP       201
++#define BLKTAP2_IOCTL_CREATE_DEVICE  202
++#define BLKTAP2_IOCTL_REMOVE_DEVICE  207
++
++#define BLKTAP2_MAX_MESSAGE_LEN      256
++
++#define BLKTAP2_RING_MESSAGE_CLOSE   3
++
++#define BLKTAP_REQUEST_FREE          0
++#define BLKTAP_REQUEST_PENDING       1
++
++/*
++ * The maximum number of requests that can be outstanding at any time
++ * is determined by
++ *
++ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
++ *
++ * where mmap_alloc < MAX_DYNAMIC_MEM.
++ *
++ * TODO:
++ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
++ * sysfs.
++ */
++#define BLK_RING_SIZE         __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
++#define MAX_DYNAMIC_MEM               BLK_RING_SIZE
++#define MAX_PENDING_REQS      BLK_RING_SIZE
++#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
++/*
++ * Address of segment _seg of request _req inside a mapping that starts
++ * at _start: each request owns BLKIF_MAX_SEGMENTS_PER_REQUEST
++ * consecutive pages. _start is parenthesized so that an expression
++ * argument is evaluated as a unit.
++ */
++#define MMAP_VADDR(_start, _req, _seg)                                        \
++        ((_start) +                                                     \
++         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
++         ((_seg) * PAGE_SIZE))
++
++/* Two grant handles kept together; going by the field names, one for
++ * a kernel mapping and one for a user mapping (not used in the code
++ * visible here -- confirm against users). */
++struct grant_handle_pair {
++      grant_handle_t                 kernel;
++      grant_handle_t                 user;
++};
++#define INVALID_GRANT_HANDLE           0xFFFF
++
++/* Result of BLKTAP2_IOCTL_ALLOC_TAP as copied to user space: 'ring' is
++ * blktap_ring_major, 'device' is blktap_device_major and 'minor' is the
++ * minor assigned to the new tap. */
++struct blktap_handle {
++      unsigned int                   ring;
++      unsigned int                   device;
++      unsigned int                   minor;
++};
++
++/* Parameters for blktap_device_create(). 'name' must be NUL-terminated
++ * within the buffer; a non-empty name is copied into the tap.
++ * 'capacity' is handed to set_capacity() and must be non-zero.
++ * 'sector_size' must be a power of two from 512 to 4096. */
++struct blktap_params {
++      char                           name[BLKTAP2_MAX_MESSAGE_LEN];
++      unsigned long long             capacity;
++      unsigned long                  sector_size;
++};
++
++/* Block-device side of a tap. 'lock' is the spinlock handed to
++ * blk_init_queue() as the request-queue lock; 'gd' is the gendisk and
++ * is NULL while no disk exists. */
++struct blktap_device {
++      spinlock_t                     lock;
++      struct gendisk                *gd;
++};
++
++/* Ring (character-device) side of a tap. From the code visible here:
++ * 'pending' has one slot per ring entry (MAX_PENDING_REQS), 'poll_wait'
++ * is the wait queue blktap_device_destroy_sync() sleeps on, 'devno' is
++ * reported by blktap_control_debug() and 'dev' is the device used for
++ * dev_err() messages. The remaining fields are managed by ring.c
++ * (not shown here). */
++struct blktap_ring {
++      struct task_struct            *task;
++
++      struct vm_area_struct         *vma;
++      struct blkif_front_ring        ring;
++      unsigned long                  ring_vstart;
++      unsigned long                  user_vstart;
++
++      int                            n_pending;
++      struct blktap_request         *pending[MAX_PENDING_REQS];
++
++      wait_queue_head_t              poll_wait;
++
++      dev_t                          devno;
++      struct device                 *dev;
++};
++
++/* Per-tap counters. st_oo_req is bumped whenever
++ * blktap_device_make_request() has to stop the queue for lack of
++ * resources. The other fields look like read/write/packet request,
++ * sector and latency counters going by their names -- confirm against
++ * the code that updates them. */
++struct blktap_statistics {
++      unsigned long                  st_print;
++      int                            st_rd_req;
++      int                            st_wr_req;
++      int                            st_oo_req;
++      int                            st_pk_req;
++      int                            st_rd_sect;
++      int                            st_wr_sect;
++      s64                            st_rd_cnt;
++      s64                            st_rd_sum_usecs;
++      s64                            st_rd_max_usecs;
++      s64                            st_wr_cnt;
++      s64                            st_wr_sum_usecs;
++      s64                            st_wr_max_usecs; 
++};
++
++/* One in-flight request. 'rq' is the block-layer request being served,
++ * 'operation' one of the BLKIF_OP_* codes, 'sg_table' is filled by
++ * blk_rq_map_sg(), and 'nr_pages' is the number of valid entries
++ * walked by blktap_for_each_sg(); it must be 0 when
++ * blktap_request_get_pages() is called. */
++struct blktap_request {
++      struct blktap                 *tap;
++      struct request                *rq;
++      int                            usr_idx;
++
++      int                            operation;
++      struct timeval                 time;
++
++      struct scatterlist             sg_table[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++      struct page                   *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++      int                            nr_pages;
++};
++
++/* Iterate _sg over the first (_req)->nr_pages entries of
++ * (_req)->sg_table, with _i as the running index. */
++#define blktap_for_each_sg(_sg, _req, _i)     \
++      for (_sg = (_req)->sg_table, _i = 0;    \
++           _i < (_req)->nr_pages;             \
++           (_sg)++, (_i)++)
++
++/* One tap instance. 'minor' is its index in blktaps[]; 'dev_inuse' is
++ * a bit field using the BLKTAP_DEVICE / BLKTAP_DEVICE_CLOSED /
++ * BLKTAP_SHUTDOWN_REQUESTED bit numbers; 'pool' is the page pool whose
++ * kobject reference the tap holds; 'name' is copied from the creation
++ * parameters. */
++struct blktap {
++      int                            minor;
++      unsigned long                  dev_inuse;
++
++      struct blktap_ring             ring;
++      struct blktap_device           device;
++      struct blktap_page_pool       *pool;
++
++      wait_queue_head_t              remove_wait;
++      struct work_struct             remove_work;
++      char                           name[BLKTAP2_MAX_MESSAGE_LEN];
++
++      struct blktap_statistics       stats;
++};
++
++/* A page pool shared between taps. 'bufs' is the backing mempool,
++ * 'lock' serializes page allocation, 'kobj' carries the pool's name
++ * and reference count, and 'wait' is woken by __page_pool_wake(). */
++struct blktap_page_pool {
++      struct mempool_s              *bufs;
++      spinlock_t                     lock;
++      struct kobject                 kobj;
++      wait_queue_head_t              wait;
++};
++
++extern struct mutex blktap_lock;
++extern struct blktap **blktaps;
++extern int blktap_max_minor;
++
++int blktap_control_destroy_tap(struct blktap *);
++size_t blktap_control_debug(struct blktap *, char *, size_t);
++
++int blktap_ring_init(void);
++void blktap_ring_exit(void);
++size_t blktap_ring_debug(struct blktap *, char *, size_t);
++int blktap_ring_create(struct blktap *);
++int blktap_ring_destroy(struct blktap *);
++struct blktap_request *blktap_ring_make_request(struct blktap *);
++void blktap_ring_free_request(struct blktap *,struct blktap_request *);
++void blktap_ring_submit_request(struct blktap *, struct blktap_request *);
++int blktap_ring_map_request_segment(struct blktap *, struct blktap_request *, int);
++int blktap_ring_map_request(struct blktap *, struct blktap_request *);
++void blktap_ring_unmap_request(struct blktap *, struct blktap_request *);
++void blktap_ring_set_message(struct blktap *, int);
++void blktap_ring_kick_user(struct blktap *);
++
++/* sysfs hooks. Without CONFIG_SYSFS they collapse to inline no-ops
++ * that report success, so callers need no #ifdefs. */
++#ifdef CONFIG_SYSFS
++int blktap_sysfs_init(void);
++void blktap_sysfs_exit(void);
++int blktap_sysfs_create(struct blktap *);
++void blktap_sysfs_destroy(struct blktap *);
++#else
++static inline int blktap_sysfs_init(void) { return 0; }
++static inline void blktap_sysfs_exit(void) {}
++static inline int blktap_sysfs_create(struct blktap *tapdev) { return 0; }
++static inline void blktap_sysfs_destroy(struct blktap *tapdev) {}
++#endif
++
++int blktap_device_init(void);
++void blktap_device_exit(void);
++size_t blktap_device_debug(struct blktap *, char *, size_t);
++int blktap_device_create(struct blktap *, struct blktap_params *);
++int blktap_device_destroy(struct blktap *);
++void blktap_device_destroy_sync(struct blktap *);
++void blktap_device_run_queue(struct blktap *);
++void blktap_device_end_request(struct blktap *, struct blktap_request *, int);
++
++int blktap_page_pool_init(struct kobject *);
++void blktap_page_pool_exit(void);
++struct blktap_page_pool *blktap_page_pool_get(const char *);
++
++size_t blktap_request_debug(struct blktap *, char *, size_t);
++struct blktap_request *blktap_request_alloc(struct blktap *);
++int blktap_request_get_pages(struct blktap *, struct blktap_request *, int);
++void blktap_request_free(struct blktap *, struct blktap_request *);
++void blktap_request_bounce(struct blktap *, struct blktap_request *, int, int);
++
++
++#endif
diff --cc drivers/xen/blktap2-new/control.c

index 0000000,0000000..615df74

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2-new/control.c
@@@ -1,0 -1,0 +1,316 @@@
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/miscdevice.h>
++#include <linux/device.h>
++#include <asm/uaccess.h>
++
++#include "blktap.h"
++
++DEFINE_MUTEX(blktap_lock);
++
++struct blktap **blktaps;
++int blktap_max_minor;
++static struct blktap_page_pool *default_pool;
++
++/*
++ * Allocate a zeroed struct blktap and bind it to the lowest free slot
++ * of blktaps[], doubling the table (capped at
++ * CONFIG_XEN_NR_TAP2_DEVICES) when it is full. Takes a module
++ * reference on success.
++ *
++ * Returns the new tap, or NULL if memory is exhausted or all
++ * CONFIG_XEN_NR_TAP2_DEVICES minors are in use.
++ */
++static struct blktap *
++blktap_control_get_minor(void)
++{
++      int minor;
++      struct blktap *tap;
++
++      tap = kzalloc(sizeof(*tap), GFP_KERNEL);
++      if (unlikely(!tap))
++              return NULL;
++
++      mutex_lock(&blktap_lock);
++
++      /* First unused slot; minor == blktap_max_minor if none is free. */
++      for (minor = 0; minor < blktap_max_minor; minor++)
++              if (!blktaps[minor])
++                      break;
++
++      if (minor == CONFIG_XEN_NR_TAP2_DEVICES)
++              goto fail;
++
++      if (minor == blktap_max_minor) {
++              void *p;
++              int n;
++
++              n = min(2 * blktap_max_minor, CONFIG_XEN_NR_TAP2_DEVICES);
++              p = krealloc(blktaps, n * sizeof(blktaps[0]), GFP_KERNEL);
++              if (!p)
++                      goto fail;
++
++              blktaps          = p;
++              blktap_max_minor = n;
++
++              /* krealloc() does not clear the newly added tail. */
++              memset(&blktaps[minor], 0, (n - minor) * sizeof(blktaps[0]));
++      }
++
++      tap->minor = minor;
++      blktaps[minor] = tap;
++
++      __module_get(THIS_MODULE);
++out:
++      mutex_unlock(&blktap_lock);
++      return tap;
++
++fail:
++      /*
++       * blktap_lock is still held here and is released exactly once at
++       * 'out'; unlocking here as well would be a double unlock.
++       */
++      kfree(tap);
++      tap = NULL;
++      goto out;
++}
++
++/* Release a tap obtained from blktap_control_get_minor(): clear its
++ * blktaps[] slot, free it and drop the module reference. Note that
++ * blktap_lock is not taken in this function. */
++static void
++blktap_control_put_minor(struct blktap* tap)
++{
++      blktaps[tap->minor] = NULL;
++      kfree(tap);
++
++      module_put(THIS_MODULE);
++}
++
++/*
++ * Create a fully set-up tap: allocate a minor, take a reference on the
++ * current default page pool, then create the ring and the sysfs
++ * entries. On any failure everything acquired so far is released
++ * again, including the pool reference.
++ *
++ * Returns the new tap or NULL on failure.
++ */
++static struct blktap*
++blktap_control_create_tap(void)
++{
++      struct blktap *tap;
++      int err;
++
++      tap = blktap_control_get_minor();
++      if (!tap)
++              return NULL;
++
++      kobject_get(&default_pool->kobj);
++      tap->pool = default_pool;
++
++      err = blktap_ring_create(tap);
++      if (err)
++              goto fail_tap;
++
++      err = blktap_sysfs_create(tap);
++      if (err)
++              goto fail_ring;
++
++      return tap;
++
++fail_ring:
++      blktap_ring_destroy(tap);
++fail_tap:
++      /* Balance the kobject_get() above, as destroy_tap() does. */
++      kobject_put(&tap->pool->kobj);
++      blktap_control_put_minor(tap);
++
++      return NULL;
++}
++
++/* Tear down a tap: destroy the ring first (its error, if any, aborts
++ * the teardown and is returned), then drop the pool reference, remove
++ * the sysfs entries and release the minor. Returns 0 on success. */
++int
++blktap_control_destroy_tap(struct blktap *tap)
++{
++      int err;
++
++      err = blktap_ring_destroy(tap);
++      if (err)
++              return err;
++
++      kobject_put(&tap->pool->kobj);
++
++      blktap_sysfs_destroy(tap);
++
++      blktap_control_put_minor(tap);
++
++      return 0;
++}
++
++/*
++ * ioctl handler of the blktap control node.
++ *
++ * BLKTAP2_IOCTL_ALLOC_TAP: create a tap and copy a struct
++ *   blktap_handle (ring major, device major, minor) to the user
++ *   pointer in 'arg'. Returns -ENOMEM if creation fails and -EFAULT,
++ *   after destroying the new tap, if the copy fails.
++ * BLKTAP2_IOCTL_FREE_TAP: 'arg' is the minor to destroy. Returns
++ *   -EINVAL for a minor outside blktaps[], -ENODEV for an unused
++ *   slot, otherwise the result of blktap_control_destroy_tap().
++ * Any other command yields -ENOIOCTLCMD.
++ */
++static long
++blktap_control_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
++{
++      struct blktap *tap;
++
++      switch (cmd) {
++      case BLKTAP2_IOCTL_ALLOC_TAP: {
++              struct blktap_handle h;
++              void __user *ptr = (void __user*)arg;
++
++              tap = blktap_control_create_tap();
++              if (!tap)
++                      return -ENOMEM;
++
++              h.ring   = blktap_ring_major;
++              h.device = blktap_device_major;
++              h.minor  = tap->minor;
++
++              if (copy_to_user(ptr, &h, sizeof(h))) {
++                      blktap_control_destroy_tap(tap);
++                      return -EFAULT;
++              }
++
++              return 0;
++      }
++
++      case BLKTAP2_IOCTL_FREE_TAP: {
++              /*
++               * blktaps[] holds exactly blktap_max_minor entries, which
++               * may be fewer than CONFIG_XEN_NR_TAP2_DEVICES. Compare
++               * the raw unsigned argument so that negative or
++               * truncated values cannot index outside the table.
++               * NOTE(review): the lookup is not done under blktap_lock.
++               */
++              if (arg >= (unsigned long)blktap_max_minor)
++                      return -EINVAL;
++
++              tap = blktaps[arg];
++              if (!tap)
++                      return -ENODEV;
++
++              return blktap_control_destroy_tap(tap);
++      }
++      }
++
++      return -ENOIOCTLCMD;
++}
++
++/* The control node only implements ioctl. */
++static const struct file_operations blktap_control_file_operations = {
++      .owner    = THIS_MODULE,
++      .unlocked_ioctl = blktap_control_ioctl,
++};
++
++/* Misc device backing the control node; it gets a dynamic minor and
++ * the node name BLKTAP2_DEV_DIR "control". */
++static struct miscdevice blktap_control = {
++      .minor    = MISC_DYNAMIC_MINOR,
++      .name     = "blktap-control",
++      .nodename = BLKTAP2_DEV_DIR "control",
++      .fops     = &blktap_control_file_operations,
++};
++
++static struct device *control_device;
++
++/* sysfs 'show' for default_pool: writes the kobject name of the
++ * current default pool into buf and returns its length. */
++static ssize_t
++blktap_control_show_default_pool(struct device *device,
++                               struct device_attribute *attr,
++                               char *buf)
++{
++      return sprintf(buf, "%s", kobject_name(&default_pool->kobj));
++}
++
++/*
++ * sysfs 'store' for default_pool: look up (or create) the pool named
++ * by buf and make it the default for taps created from now on. The
++ * reference held on the previous default pool is dropped.
++ *
++ * Returns size on success or a negative errno. A NULL result from
++ * blktap_page_pool_get() is rejected as well, because
++ * blktap_control_init() treats NULL as a failure of the same call and
++ * a NULL default_pool would be dereferenced by the next tap creation.
++ */
++static ssize_t
++blktap_control_store_default_pool(struct device *device,
++                                struct device_attribute *attr,
++                                const char *buf, size_t size)
++{
++      struct blktap_page_pool *pool, *tmp = default_pool;
++
++      pool = blktap_page_pool_get(buf);
++      if (IS_ERR(pool))
++              return PTR_ERR(pool);
++      if (!pool)
++              return -ENOENT;
++
++      default_pool = pool;
++      kobject_put(&tmp->kobj);
++
++      return size;
++}
++
++static DEVICE_ATTR(default_pool, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
++                 blktap_control_show_default_pool,
++                 blktap_control_store_default_pool);
++
++/* Format a one-line summary of the tap (ring device number, name and
++ * dev_inuse flags) into buf, which holds size bytes. Returns the
++ * value reported by snprintf(), i.e. the length the line would have
++ * had if it had not been truncated. */
++size_t
++blktap_control_debug(struct blktap *tap, char *buf, size_t size)
++{
++      char *s = buf, *end = buf + size;
++
++      s += snprintf(s, end - s,
++                    "tap %u:%u name:'%s' flags:%#08lx\n",
++                    MAJOR(tap->ring.devno), MINOR(tap->ring.devno),
++                    tap->name, tap->dev_inuse);
++
++      return s - buf;
++}
++
++/*
++ * Set up the control side: register the misc device, allocate the
++ * initial minor table (at most 64 entries), initialise the page pools
++ * below the control device, obtain the "default" pool and expose the
++ * default_pool attribute.
++ *
++ * Returns 0 or a negative errno. Nothing is unwound here on failure;
++ * the caller runs blktap_control_exit(), which checks each resource.
++ */
++static int __init
++blktap_control_init(void)
++{
++      int err;
++
++      err = misc_register(&blktap_control);
++      if (err)
++              return err;
++
++      control_device = blktap_control.this_device;
++
++      blktap_max_minor = min(64, CONFIG_XEN_NR_TAP2_DEVICES);
++      blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
++      if (!blktaps) {
++              BTERR("failed to allocate blktap minor map");
++              return -ENOMEM;
++      }
++
++      err = blktap_page_pool_init(&control_device->kobj);
++      if (err)
++              return err;
++
++      default_pool = blktap_page_pool_get("default");
++      if (!default_pool || IS_ERR(default_pool)) {
++              /*
++               * The sysfs store path checks this call with IS_ERR(), so
++               * handle both failure forms. Never leave an error pointer
++               * behind: blktap_control_exit() dereferences any non-NULL
++               * default_pool.
++               */
++              err = default_pool ? PTR_ERR(default_pool) : -ENOMEM;
++              default_pool = NULL;
++              return err;
++      }
++
++      err = device_create_file(control_device, &dev_attr_default_pool);
++      if (err)
++              return err;
++
++      return 0;
++}
++
++/* Undo blktap_control_init(). Every step tolerates a resource that
++ * was never set up, so this also serves as the error path of a
++ * partially completed init. */
++static void
++blktap_control_exit(void)
++{
++      if (default_pool)
++              kobject_put(&default_pool->kobj);
++      default_pool = NULL;
++
++      blktap_page_pool_exit();
++
++      /* kfree(NULL) is a no-op, so no guard is needed. */
++      kfree(blktaps);
++      blktaps = NULL;
++
++      if (control_device)
++              misc_deregister(&blktap_control);
++      control_device = NULL;
++}
++
++/* Module teardown: shuts the sub-systems down in the reverse order of
++ * blktap_init(). Also called by blktap_init() when a step fails. */
++static void
++blktap_exit(void)
++{
++      blktap_control_exit();
++      blktap_ring_exit();
++      blktap_sysfs_exit();
++      blktap_device_exit();
++}
++
++/* Module entry point: bring up device, ring, sysfs and control in that
++ * order. If any step fails, the complete blktap_exit() sequence is run
++ * (including for steps that were never reached) and the error of the
++ * failing step is returned. */
++static int __init
++blktap_init(void)
++{
++      int err;
++
++      err = blktap_device_init();
++      if (err)
++              goto fail;
++
++      err = blktap_ring_init();
++      if (err)
++              goto fail;
++
++      err = blktap_sysfs_init();
++      if (err)
++              goto fail;
++
++      err = blktap_control_init();
++      if (err)
++              goto fail;
++
++      return 0;
++
++fail:
++      blktap_exit();
++      return err;
++}
++
++module_init(blktap_init);
++module_exit(blktap_exit);
++MODULE_LICENSE("Dual BSD/GPL");
++MODULE_ALIAS("devname:" BLKTAP2_DEV_DIR "control");
diff --cc drivers/xen/blktap2-new/device.c

index 0000000,0000000..3e62986

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2-new/device.c
@@@ -1,0 -1,0 +1,570 @@@
++#include <linux/fs.h>
++#include <linux/blkdev.h>
++#include <linux/cdrom.h>
++#include <linux/hdreg.h>
++#include <scsi/scsi.h>
++#include <scsi/scsi_ioctl.h>
++
++#include "blktap.h"
++
++int blktap_device_major;
++
++#define dev_to_blktap(_dev) container_of(_dev, struct blktap, device)
++
++/* Block-device open: succeeds unless the disk's private_data has
++ * already been cleared (as blktap_device_destroy() does), in which
++ * case -ENXIO is returned. */
++static int
++blktap_device_open(struct block_device *bdev, fmode_t mode)
++{
++      struct gendisk *disk = bdev->bd_disk;
++      struct blktap_device *tapdev = disk->private_data;
++
++      if (!tapdev)
++              return -ENXIO;
++
++      /* NB. we might have bounced a bd trylock by tapdisk. when
++       * failing for reasons not !tapdev, make sure to kick tapdisk
++       * out of destroy wait state again. */
++
++      return 0;
++}
++
++/*
++ * Block-device release: when no openers remain on the whole-disk
++ * bdev, flag the tap as BLKTAP_DEVICE_CLOSED and kick the ring user.
++ * Always returns 0.
++ */
++static int
++blktap_device_release(struct gendisk *disk, fmode_t mode)
++{
++      struct blktap_device *tapdev = disk->private_data;
++      struct block_device *bdev = bdget_disk(disk, 0);
++      struct blktap *tap = dev_to_blktap(tapdev);
++      int closed;
++
++      /*
++       * Sample bd_openers while the reference from bdget_disk() is
++       * still held; bdev must not be touched after bdput().
++       * NOTE(review): as before, a NULL return from bdget_disk() is
++       * not handled here.
++       */
++      closed = !bdev->bd_openers;
++      bdput(bdev);
++
++      if (closed) {
++              set_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse);
++              blktap_ring_kick_user(tap);
++      }
++
++      return 0;
++}
++
++/* Report a synthetic geometry of 255 heads and 63 sectors, with the
++ * cylinder count derived from the disk capacity. If the capacity does
++ * not fit that geometry the cylinder count is set to 0xffff. Always
++ * returns 0. */
++static int
++blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
++{
++      /* We don't have real geometry info, but let's at least return
++         values consistent with the size of the device */
++      sector_t nsect = get_capacity(bd->bd_disk);
++      sector_t cylinders = nsect;
++
++      hg->heads = 0xff;
++      hg->sectors = 0x3f;
++      sector_div(cylinders, hg->heads * hg->sectors);
++      hg->cylinders = cylinders;
++      if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
++              hg->cylinders = 0xffff;
++      return 0;
++}
++
++/* Block-device ioctl. CDROMMULTISESSION zero-fills the user's struct
++ * cdrom_multisession; SCSI_IOCTL_GET_IDLUN stores 0 in both fields of
++ * the user's struct scsi_idlun; both return -EFAULT when the user
++ * buffer cannot be written. Every other command returns -EINVAL. */
++static int
++blktap_device_ioctl(struct block_device *bd, fmode_t mode,
++                  unsigned command, unsigned long argument)
++{
++      int i;
++
++      switch (command) {
++      case CDROMMULTISESSION:
++              BTDBG("FIXME: support multisession CDs later\n");
++              for (i = 0; i < sizeof(struct cdrom_multisession); i++)
++                      if (put_user(0, (char __user *)(argument + i)))
++                              return -EFAULT;
++              return 0;
++
++      case SCSI_IOCTL_GET_IDLUN:
++              if (!access_ok(VERIFY_WRITE, argument, 
++                      sizeof(struct scsi_idlun)))
++                      return -EFAULT;
++
++              /* return 0 for now. */
++              __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
++              __put_user(0, 
++                      &((struct scsi_idlun __user *)argument)->host_unique_id);
++              return 0;
++
++      default:
++              /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
++                command);*/
++              return -EINVAL; /* same return as native Linux */
++      }
++
++      /* Not reached: every case above returns. */
++      return 0;
++}
++
++/* Block-device operations installed on every tap disk. */
++static const struct block_device_operations blktap_device_file_operations = {
++      .owner     = THIS_MODULE,
++      .open      = blktap_device_open,
++      .release   = blktap_device_release,
++      .ioctl     = blktap_device_ioctl,
++      .getgeo    = blktap_device_getgeo
++};
++
++/* NB. __blktap_* helpers expect the caller to hold the queue lock; blktap_* helpers are called unlocked. */
++
++/* Return the request at the head of q without dequeuing it, as
++ * reported by blk_peek_request(). */
++static inline struct request*
++__blktap_next_queued_rq(struct request_queue *q)
++{
++      return blk_peek_request(q);
++}
++
++/* Dequeue rq and mark it started (blk_start_request()). */
++static inline void
++__blktap_dequeue_rq(struct request *rq)
++{
++      blk_start_request(rq);
++}
++
++/* NB. err == 0 indicates success, failures < 0 */
++
++/* Complete a request that is still on the queue: start it, then end
++ * all of its bytes with status err. */
++static inline void
++__blktap_end_queued_rq(struct request *rq, int err)
++{
++      blk_start_request(rq);
++      __blk_end_request(rq, err, blk_rq_bytes(rq));
++}
++
++/* End all bytes of an already started request with status err. */
++static inline void
++__blktap_end_rq(struct request *rq, int err)
++{
++      __blk_end_request(rq, err, blk_rq_bytes(rq));
++}
++
++/* Unlocked variant of __blktap_end_rq(): takes the request's queue
++ * lock (with interrupts disabled) around the completion. */
++static inline void
++blktap_end_rq(struct request *rq, int err)
++{
++      struct request_queue *q = rq->q;
++
++      spin_lock_irq(q->queue_lock);
++      __blktap_end_rq(rq, err);
++      spin_unlock_irq(q->queue_lock);
++}
++
++/* Finish a tap request: unmap it, release the blktap_request, then
++ * complete the underlying block request with 'error'. */
++void
++blktap_device_end_request(struct blktap *tap,
++                        struct blktap_request *request,
++                        int error)
++{
++      struct blktap_device *tapdev = &tap->device;
++      /* Fetched up front: 'request' is freed before rq is completed. */
++      struct request *rq = request->rq;
++
++      blktap_ring_unmap_request(tap, request);
++
++      blktap_ring_free_request(tap, request);
++
++      dev_dbg(disk_to_dev(tapdev->gd),
++              "end_request: op=%d error=%d bytes=%d\n",
++              rq_data_dir(rq), error, blk_rq_bytes(rq));
++
++      blktap_end_rq(rq, error);
++}
++
++/* Turn block request rq into a tap request and submit it on the ring.
++ *
++ * Returns 0 once the request has been submitted. Returns -EBUSY when
++ * resources are short (no ring request available with -ENOSPC or
++ * -ENOMEM, or any failure to obtain pages); st_oo_req is incremented
++ * in that case. Any other failure is logged (rate limited) and its
++ * error code returned. A partially built tap request is released on
++ * every failure path. */
++int
++blktap_device_make_request(struct blktap *tap, struct request *rq)
++{
++      struct blktap_device *tapdev = &tap->device;
++      struct blktap_request *request;
++      int write, nsegs;
++      int err;
++
++      request = blktap_ring_make_request(tap);
++      if (IS_ERR(request)) {
++              err = PTR_ERR(request);
++              /* Nothing to release at _out. */
++              request = NULL;
++
++              if (err == -ENOSPC || err == -ENOMEM)
++                      goto stop;
++
++              goto fail;
++      }
++
++      write = rq_data_dir(rq) == WRITE;
++      nsegs = blk_rq_map_sg(rq->q, rq, request->sg_table);
++
++      dev_dbg(disk_to_dev(tapdev->gd),
++              "make_request: op=%c bytes=%d nsegs=%d\n",
++              write ? 'w' : 'r', blk_rq_bytes(rq), nsegs);
++
++      request->rq = rq;
++      request->operation = write ? BLKIF_OP_WRITE : BLKIF_OP_READ;
++      if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
++              request->operation = BLKIF_OP_PACKET;
++
++      err = blktap_request_get_pages(tap, request, nsegs);
++      if (err)
++              goto stop;
++
++      err = blktap_ring_map_request(tap, request);
++      if (err)
++              goto fail;
++
++      blktap_ring_submit_request(tap, request);
++
++      return 0;
++
++stop:
++      /* Out of resources: the caller stops the queue on -EBUSY. */
++      tap->stats.st_oo_req++;
++      err = -EBUSY;
++
++_out:
++      if (request)
++              blktap_ring_free_request(tap, request);
++
++      return err;
++fail:
++      if (printk_ratelimit())
++              dev_warn(disk_to_dev(tapdev->gd),
++                       "make request: %d, failing\n", err);
++      goto _out;
++}
++
++/*
++ * called from tapdisk context
++ *
++ * Drain the disk's request queue: clear the STOPPED flag, then peek
++ * requests one at a time. Requests that are not REQ_TYPE_FS are failed
++ * with -EOPNOTSUPP. For the others the queue lock is dropped around
++ * blktap_device_make_request(); -EBUSY stops the queue and leaves the
++ * request queued, any other result dequeues the request, and a
++ * failure additionally completes it with that error.
++ */
++void
++blktap_device_run_queue(struct blktap *tap)
++{
++      struct blktap_device *tapdev = &tap->device;
++      struct request_queue *q;
++      struct request *rq;
++      int err;
++
++      if (!tapdev->gd)
++              return;
++
++      q = tapdev->gd->queue;
++
++      spin_lock_irq(&tapdev->lock);
++      queue_flag_clear(QUEUE_FLAG_STOPPED, q);
++
++      do {
++              rq = __blktap_next_queued_rq(q);
++              if (!rq)
++                      break;
++
++              if (rq->cmd_type != REQ_TYPE_FS) {
++                      __blktap_end_queued_rq(rq, -EOPNOTSUPP);
++                      continue;
++              }
++
++              spin_unlock_irq(&tapdev->lock);
++
++              err = blktap_device_make_request(tap, rq);
++
++              spin_lock_irq(&tapdev->lock);
++
++              if (err == -EBUSY) {
++                      blk_stop_queue(q);
++                      break;
++              }
++
++              __blktap_dequeue_rq(rq);
++
++              if (unlikely(err))
++                      __blktap_end_rq(rq, err);
++      } while (1);
++
++      spin_unlock_irq(&tapdev->lock);
++}
++
++/* Request function of the queue: does no I/O itself, it only kicks
++ * the ring user of the owning tap. */
++static void
++blktap_device_do_request(struct request_queue *rq)
++{
++      struct blktap_device *tapdev = rq->queuedata;
++      struct blktap *tap = dev_to_blktap(tapdev);
++
++      blktap_ring_kick_user(tap);
++}
++
++/* Apply the creation parameters and the fixed queue limits to the
++ * tap's disk, all under the device lock. */
++static void
++blktap_device_configure(struct blktap *tap,
++                      struct blktap_params *params)
++{
++      struct blktap_device *tapdev = &tap->device;
++      struct request_queue *q = tapdev->gd->queue;
++
++      spin_lock_irq(&tapdev->lock);
++
++      set_capacity(tapdev->gd, params->capacity);
++
++      /* Hard sector size and max sectors impersonate the equiv. hardware. */
++      blk_queue_logical_block_size(q, params->sector_size);
++      blk_queue_max_hw_sectors(q, 512);
++
++      /* Each segment in a request is up to an aligned page in size. */
++      blk_queue_segment_boundary(q, PAGE_SIZE - 1);
++      blk_queue_max_segment_size(q, PAGE_SIZE);
++
++      /* Ensure a merged request will fit in a single I/O ring slot. */
++      blk_queue_max_segments(q, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++      /* Make sure buffer addresses are sector-aligned. */
++      blk_queue_dma_alignment(q, 511);
++
++      spin_unlock_irq(&tapdev->lock);
++}
++
++/*
++ * Check device creation parameters.
++ *
++ * Accepts a sector size that is a power of two from 512 to 4096, a
++ * non-zero capacity whose byte size does not overflow 64 bits, and a
++ * name that is NUL-terminated within both params->name and tap->name.
++ *
++ * Returns 0 if valid. Otherwise the name is forcibly terminated, the
++ * offending values are logged and -EINVAL is returned.
++ */
++static int
++blktap_device_validate_params(struct blktap *tap,
++                            struct blktap_params *params)
++{
++      struct device *dev = tap->ring.dev;
++      /*
++       * Computed up front: the 'fail' path uses it to terminate the
++       * name, and is reachable from every check below.
++       */
++      size_t name_sz = min(sizeof(params->name), sizeof(tap->name));
++      int sector_order;
++
++      sector_order = ffs(params->sector_size) - 1;
++
++      if (sector_order <  9 ||
++          sector_order > 12 ||
++          params->sector_size != 1U<<sector_order)
++              goto fail;
++
++      if (!params->capacity ||
++          (params->capacity > ULLONG_MAX >> sector_order))
++              goto fail;
++
++      if (strnlen(params->name, name_sz) >= name_sz)
++              goto fail;
++
++      return 0;
++
++fail:
++      /* Make the name safe to print with %s. */
++      params->name[name_sz-1] = 0;
++      dev_err(dev, "capacity: %llu, sector-size: %lu, name: %s\n",
++              params->capacity, params->sector_size, params->name);
++      return -EINVAL;
++}
++
++/* Try to remove the tap's disk.
++ *
++ * Returns 0 if there is no disk or it was removed, and -EBUSY if
++ * bd_mutex could not be taken without blocking or the device still
++ * has openers. On success the gendisk and its queue are released,
++ * private_data and tapdev->gd are cleared and the BLKTAP_DEVICE bit
++ * is dropped.
++ * NOTE(review): the result of bdget_disk() is not checked for NULL. */
++int
++blktap_device_destroy(struct blktap *tap)
++{
++      struct blktap_device *tapdev = &tap->device;
++      struct block_device *bdev;
++      struct gendisk *gd;
++      int err;
++
++      gd = tapdev->gd;
++      if (!gd)
++              return 0;
++
++      bdev = bdget_disk(gd, 0);
++
++      err = !mutex_trylock(&bdev->bd_mutex);
++      if (err) {
++              /* NB. avoid a deadlock. the last opener syncs the
++               * bdev holding bd_mutex. */
++              err = -EBUSY;
++              goto out_nolock;
++      }
++
++      if (bdev->bd_openers) {
++              err = -EBUSY;
++              goto out;
++      }
++
++      del_gendisk(gd);
++      /* Makes subsequent blktap_device_open() calls fail with -ENXIO. */
++      gd->private_data = NULL;
++
++      blk_cleanup_queue(gd->queue);
++
++      put_disk(gd);
++      tapdev->gd = NULL;
++
++      clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++      err = 0;
++out:
++      mutex_unlock(&bdev->bd_mutex);
++out_nolock:
++      bdput(bdev);
++
++      return err;
++}
++
++/* Clear the queue's STOPPED flag and complete every request still
++ * queued with -EIO, under the device lock. */
++static void
++blktap_device_fail_queue(struct blktap *tap)
++{
++      struct blktap_device *tapdev = &tap->device;
++      struct request_queue *q = tapdev->gd->queue;
++
++      spin_lock_irq(&tapdev->lock);
++      queue_flag_clear(QUEUE_FLAG_STOPPED, q);
++
++      do {
++              struct request *rq = __blktap_next_queued_rq(q);
++              if (!rq)
++                      break;
++
++              __blktap_end_queued_rq(rq, -EIO);
++      } while (1);
++
++      spin_unlock_irq(&tapdev->lock);
++}
++
++/* One destroy attempt: if the disk cannot be removed yet, fail all
++ * queued requests with -EIO. Returns the result of
++ * blktap_device_destroy(). */
++static int
++blktap_device_try_destroy(struct blktap *tap)
++{
++      int err = blktap_device_destroy(tap);
++
++      if (!err)
++              return 0;
++
++      blktap_device_fail_queue(tap);
++      return err;
++}
++
++/* Sleep on the ring's poll_wait queue until a destroy attempt
++ * succeeds; an attempt is made each time the queue is woken. */
++void
++blktap_device_destroy_sync(struct blktap *tap)
++{
++      wait_event(tap->ring.poll_wait,
++                 !blktap_device_try_destroy(tap));
++}
++
++/* gendisk devnode callback: returns a kasprintf()-allocated node name
++ * BLKTAP2_DEV_DIR "tapdev<first_minor>" (NULL if allocation fails).
++ * 'mode' is left untouched. */
++static char *blktap_devnode(struct gendisk *gd, mode_t *mode)
++{
++      return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "tapdev%u",
++                       gd->first_minor);
++}
++
++/*
++ * Create and register the block device of a tap.
++ *
++ * The disk is named "td" followed by a letter suffix derived from the
++ * tap minor (tda..tdz, tdaa.., tdaaa..), uses blktap_device_major and
++ * the tap minor, and runs a noop-elevator queue protected by
++ * tapdev->lock. A non-empty params->name is copied into tap->name.
++ *
++ * Returns 0 on success, -EEXIST if the tap already has a device,
++ * -EINVAL for invalid parameters and -ENOMEM if the disk or its queue
++ * cannot be allocated.
++ */
++int
++blktap_device_create(struct blktap *tap, struct blktap_params *params)
++{
++      int minor, err;
++      struct gendisk *gd;
++      struct request_queue *rq;
++      struct blktap_device *tapdev;
++
++      gd     = NULL;
++      rq     = NULL;
++      tapdev = &tap->device;
++      minor  = tap->minor;
++
++      if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++              return -EEXIST;
++
++      if (blktap_device_validate_params(tap, params))
++              return -EINVAL;
++
++      gd = alloc_disk(1);
++      if (!gd) {
++              err = -ENOMEM;
++              goto fail;
++      }
++
++      if (minor < 26) {
++              sprintf(gd->disk_name, "td%c", 'a' + minor % 26);
++      } else if (minor < (26 + 1) * 26) {
++              sprintf(gd->disk_name, "td%c%c",
++                      'a' + minor / 26 - 1,'a' + minor % 26);
++      } else {
++              const unsigned int m1 = (minor / 26 - 1) / 26 - 1;
++              const unsigned int m2 = (minor / 26 - 1) % 26;
++              const unsigned int m3 =  minor % 26;
++              sprintf(gd->disk_name, "td%c%c%c",
++                      'a' + m1, 'a' + m2, 'a' + m3);
++      }
++
++      gd->major = blktap_device_major;
++      gd->first_minor = minor;
++      gd->devnode = blktap_devnode;
++      gd->fops = &blktap_device_file_operations;
++      gd->private_data = tapdev;
++
++      spin_lock_init(&tapdev->lock);
++      rq = blk_init_queue(blktap_device_do_request, &tapdev->lock);
++      if (!rq) {
++              err = -ENOMEM;
++              goto fail;
++      }
++      elevator_init(rq, "noop");
++
++      gd->queue     = rq;
++      rq->queuedata = tapdev;
++      tapdev->gd    = gd;
++
++      blktap_device_configure(tap, params);
++      add_disk(gd);
++
++      if (params->name[0])
++              strncpy(tap->name, params->name, sizeof(tap->name)-1);
++
++      set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++
++      dev_info(disk_to_dev(gd), "sector-size: %u capacity: %llu\n",
++               queue_logical_block_size(rq),
++               (unsigned long long)get_capacity(gd));
++
++      return 0;
++
++fail:
++      /*
++       * Every jump here happens before add_disk(), so the disk was
++       * never registered: drop the alloc_disk() reference with
++       * put_disk() rather than calling del_gendisk().
++       */
++      if (rq)
++              blk_cleanup_queue(rq);
++      if (gd)
++              put_disk(gd);
++
++      return err;
++}
++
++/* Format a summary of the tap's disk (capacity, sector size, queue
++ * flags, opener count and closed flag) into buf, which holds size
++ * bytes. Returns 0 when the tap has no disk, otherwise the sum of the
++ * snprintf() return values. */
++size_t
++blktap_device_debug(struct blktap *tap, char *buf, size_t size)
++{
++      struct gendisk *disk = tap->device.gd;
++      struct request_queue *q;
++      struct block_device *bdev;
++      char *s = buf, *end = buf + size;
++
++      if (!disk)
++              return 0;
++
++      q = disk->queue;
++
++      s += snprintf(s, end - s,
++                    "disk capacity:%llu sector size:%u\n",
++                    (unsigned long long)get_capacity(disk),
++                    queue_logical_block_size(q));
++
++      s += snprintf(s, end - s,
++                    "queue flags:%#lx stopped:%d\n",
++                    q->queue_flags,
++                    blk_queue_stopped(q));
++
++      bdev = bdget_disk(disk, 0);
++      if (bdev) {
++              s += snprintf(s, end - s,
++                            "bdev openers:%d closed:%d\n",
++                            bdev->bd_openers,
++                            test_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse));
++              bdput(bdev);
++      }
++
++      return s - buf;
++}
++
++/*
++ * Register the "tapdev" block major (dynamically allocated) and record
++ * it in blktap_device_major.
++ *
++ * Returns 0 on success or the negative errno reported by
++ * register_blkdev().
++ */
++int __init
++blktap_device_init(void)
++{
++      int major;
++
++      /* Dynamically allocate a major for this device */
++      major = register_blkdev(0, "tapdev");
++      if (major < 0) {
++              BTERR("Couldn't register blktap device\n");
++              /* Report the real cause instead of a blanket -ENOMEM. */
++              return major;
++      }
++
++      blktap_device_major = major;
++      BTINFO("blktap device major %d\n", major);
++
++      return 0;
++}
++
++/* Unregister the "tapdev" block major, if one was registered
++ * (blktap_device_major is non-zero). */
++void
++blktap_device_exit(void)
++{
++      if (blktap_device_major)
++              unregister_blkdev(blktap_device_major, "tapdev");
++}
diff --cc drivers/xen/blktap2-new/request.c

index 0000000,0000000..9bef48c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2-new/request.c
@@@ -1,0 -1,0 +1,418 @@@
++#include <linux/mempool.h>
++#include <linux/spinlock.h>
++#include <linux/mutex.h>
++#include <linux/sched.h>
++#include <linux/device.h>
++
++#include "blktap.h"
++
++/* max pages per shared pool. just to prevent accidental dos. */
++#define POOL_MAX_PAGES           (256*BLKIF_MAX_SEGMENTS_PER_REQUEST)
++
++/* default page pool size. when considering to shrink a shared pool,
++ * note that paused tapdisks may grab a whole lot of pages for a long
++ * time. */
++#define POOL_DEFAULT_PAGES       (2 * MMAP_PAGES)
++
++/* max number of pages allocatable per request. */
++#define POOL_MAX_REQUEST_PAGES   BLKIF_MAX_SEGMENTS_PER_REQUEST
++
++/* min request structs per pool. These grow dynamically. */
++#define POOL_MIN_REQS            BLK_RING_SIZE
++
++static struct kset *pool_set;
++
++#define kobj_to_pool(_kobj) \
++      container_of(_kobj, struct blktap_page_pool, kobj)
++
++static struct kmem_cache *request_cache;
++static mempool_t *request_pool;
++
++/*
++ * Wake everyone sleeping on the pool's wait queue, but only once the
++ * mempool holds at least POOL_MAX_REQUEST_PAGES free elements.
++ */
++static void
++__page_pool_wake(struct blktap_page_pool *pool)
++{
++      /*
++        NB. slightly wasteful to always wait for a full segment
++        set. but this ensures the next disk makes
++        progress. presently, the repeated request struct
++        alloc/release cycles would otherwise keep everyone spinning.
++      */
++
++      if (pool->bufs->curr_nr < POOL_MAX_REQUEST_PAGES)
++              return;
++
++      wake_up(&pool->wait);
++}
++
++/*
++ * Take @nr_pages pages out of the tap's page pool and attach them to
++ * @request. The request must not hold pages yet and @nr_pages must not
++ * exceed POOL_MAX_REQUEST_PAGES; both are enforced with BUG_ON().
++ *
++ * Returns 0 on success, or -ENOMEM when the pool holds fewer than
++ * @nr_pages free elements, in which case @request is left untouched.
++ */
++int
++blktap_request_get_pages(struct blktap *tap,
++                       struct blktap_request *request, int nr_pages)
++{
++      struct blktap_page_pool *pool = tap->pool;
++      mempool_t *mem = pool->bufs;
++      struct page *page;
++
++      BUG_ON(request->nr_pages != 0);
++      BUG_ON(nr_pages > POOL_MAX_REQUEST_PAGES);
++
++      /* Unlocked early-out; the same check is repeated under the lock. */
++      if (mem->curr_nr < nr_pages)
++              return -ENOMEM;
++
++      /* NB. avoid thundering herds of tapdisks colliding. */
++      spin_lock(&pool->lock);
++
++      if (mem->curr_nr < nr_pages) {
++              spin_unlock(&pool->lock);
++              return -ENOMEM;
++      }
++
++      /* curr_nr was just checked under the lock, so failure here is a bug. */
++      while (request->nr_pages < nr_pages) {
++              page = mempool_alloc(mem, GFP_NOWAIT);
++              BUG_ON(!page);
++              request->pages[request->nr_pages++] = page;
++      }
++
++      spin_unlock(&pool->lock);
++
++      return 0;
++}
++
++/*
++ * Hand every page held by @request back to the tap's page pool, last page
++ * first, leaving request->nr_pages at zero.
++ */
++static void
++blktap_request_put_pages(struct blktap *tap,
++                       struct blktap_request *request)
++{
++      struct blktap_page_pool *pool = tap->pool;
++
++      while (request->nr_pages > 0) {
++              int last = --request->nr_pages;
++
++              mempool_free(request->pages[last], pool->bufs);
++      }
++}
++
++/*
++ * Format the name, configured size (min_nr) and free element count
++ * (curr_nr) of the tap's page pool into @buf, at most @size bytes.
++ *
++ * Returns the number of bytes actually written. scnprintf() keeps that
++ * count within @size even when the output is truncated, so the caller can
++ * safely advance its own cursor by the return value.
++ */
++size_t
++blktap_request_debug(struct blktap *tap, char *buf, size_t size)
++{
++      struct blktap_page_pool *pool = tap->pool;
++      mempool_t *mem = pool->bufs;
++      char *s = buf, *end = buf + size;
++
++      /* Write through the cursor, like the other *_debug() helpers. */
++      s += scnprintf(s, end - s,
++                     "pool:%s pages:%d free:%d\n",
++                     kobject_name(&pool->kobj),
++                     mem->min_nr, mem->curr_nr);
++
++      return s - buf;
++}
++
++/*
++ * Get a request structure from the global request mempool without
++ * sleeping (GFP_NOWAIT) and bind it to @tap.
++ *
++ * Returns the request, or NULL when none could be allocated.
++ */
++struct blktap_request*
++blktap_request_alloc(struct blktap *tap)
++{
++      struct blktap_request *request = mempool_alloc(request_pool, GFP_NOWAIT);
++
++      if (!request)
++              return NULL;
++
++      request->tap = tap;
++      return request;
++}
++
++/*
++ * Release @request: return its pages to the tap's page pool, return the
++ * request structure to the request mempool, then wake pool waiters if
++ * enough pages are free again.
++ */
++void
++blktap_request_free(struct blktap *tap,
++                  struct blktap_request *request)
++{
++      blktap_request_put_pages(tap, request);
++
++      mempool_free(request, request_pool);
++
++      /* Runs after the pages went back, so the wake check sees them. */
++      __page_pool_wake(tap->pool);
++}
++
++/*
++ * Copy one segment between the request's scatterlist entry and the
++ * matching pool page, at the same offset and length as the sg entry.
++ *
++ * @write != 0: scatterlist buffer -> pool page.
++ * @write == 0: pool page -> scatterlist buffer.
++ *
++ * @seg must be below request->nr_pages (BUG_ON otherwise).
++ */
++void
++blktap_request_bounce(struct blktap *tap,
++                    struct blktap_request *request,
++                    int seg, int write)
++{
++      struct scatterlist *sg = &request->sg_table[seg];
++      void *sg_buf, *pool_buf;
++
++      BUG_ON(seg >= request->nr_pages);
++
++      sg_buf   = sg_virt(sg);
++      pool_buf = page_address(request->pages[seg]) + sg->offset;
++
++      if (!write) {
++              memcpy(sg_buf, pool_buf, sg->length);
++              return;
++      }
++
++      memcpy(pool_buf, sg_buf, sg->length);
++}
++
++/*
++ * Slab constructor for struct blktap_request: zero the whole object and
++ * initialise its embedded scatterlist table.
++ */
++static void
++blktap_request_ctor(void *obj)
++{
++      struct blktap_request *request = obj;
++
++      memset(request, 0, sizeof(*request));
++      sg_init_table(request->sg_table, ARRAY_SIZE(request->sg_table));
++}
++
++/*
++ * Resize the pool's mempool to @target elements (raised to at least 1)
++ * and, on success, wake waiters if enough pages are now free.
++ *
++ * Returns 0 on success or the error from mempool_resize().
++ */
++static int
++blktap_page_pool_resize(struct blktap_page_pool *pool, int target)
++{
++      int err;
++
++      /* NB. mempool asserts min_nr >= 1 */
++      if (target < 1)
++              target = 1;
++
++      err = mempool_resize(pool->bufs, target, GFP_KERNEL);
++      if (!err)
++              __page_pool_wake(pool);
++
++      return err;
++}
++
++/*
++ * Sysfs attribute of a page pool: the generic struct attribute plus typed
++ * callbacks that receive the pool directly. A callback may be NULL (the
++ * "free" attribute in this file is declared with a NULL store).
++ */
++struct pool_attribute {
++      struct attribute attr;
++
++      ssize_t (*show)(struct blktap_page_pool *pool,
++                      char *buf);
++
++      ssize_t (*store)(struct blktap_page_pool *pool,
++                       const char *buf, size_t count);
++};
++
++#define kattr_to_pool_attr(_kattr) \
++      container_of(_kattr, struct pool_attribute, attr)
++
++/* sysfs "size" show: print the mempool's configured element count (min_nr). */
++static ssize_t
++blktap_page_pool_show_size(struct blktap_page_pool *pool,
++                         char *buf)
++{
++      return sprintf(buf, "%d", pool->bufs->min_nr);
++}
++
++/*
++ * sysfs "size" store: parse a number from @buf (base auto-detected by
++ * simple_strtoul), clamp it to [POOL_MAX_REQUEST_PAGES, POOL_MAX_PAGES] and
++ * resize the pool to that many pages.
++ *
++ * Returns @size on success, or the error from blktap_page_pool_resize().
++ */
++static ssize_t
++blktap_page_pool_store_size(struct blktap_page_pool *pool,
++                          const char *buf, size_t size)
++{
++      int target;
++
++      /*
++       * NB. target fixup to avoid undesired results. less than a
++       * full segment set can wedge the disk. much more than a
++       * couple times the physical queue depth is rarely useful.
++       */
++
++      target = simple_strtoul(buf, NULL, 0);
++      target = max(POOL_MAX_REQUEST_PAGES, target);
++      target = min(target, POOL_MAX_PAGES);
++
++      /* GNU "?:" shorthand: a non-zero error is returned, otherwise size. */
++      return blktap_page_pool_resize(pool, target) ? : size;
++}
++
++/* "size" attribute: world-readable, writable by the owner. */
++static struct pool_attribute blktap_page_pool_attr_size =
++      __ATTR(size, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
++             blktap_page_pool_show_size,
++             blktap_page_pool_store_size);
++
++/* sysfs "free" show: print how many elements the mempool currently holds. */
++static ssize_t
++blktap_page_pool_show_free(struct blktap_page_pool *pool,
++                         char *buf)
++{
++      return sprintf(buf, "%d", pool->bufs->curr_nr);
++}
++
++/* "free" attribute: read-only, so it has no store callback. */
++static struct pool_attribute blktap_page_pool_attr_free =
++      __ATTR(free, S_IRUSR|S_IRGRP|S_IROTH,
++             blktap_page_pool_show_free,
++             NULL);
++
++/* NULL-terminated default attribute list used by blktap_page_pool_ktype. */
++static struct attribute *blktap_page_pool_attrs[] = {
++      &blktap_page_pool_attr_size.attr,
++      &blktap_page_pool_attr_free.attr,
++      NULL,
++};
++
++/*
++ * Look up a kobject by name in @kset. The list is walked under
++ * kset->list_lock.
++ *
++ * Returns the matching kobject with a reference taken via kobject_get(),
++ * or NULL when no member carries that name.
++ */
++static inline struct kobject*
++__blktap_kset_find_obj(struct kset *kset, const char *name)
++{
++      struct kobject *k;
++      struct kobject *ret = NULL;
++
++      spin_lock(&kset->list_lock);
++      list_for_each_entry(k, &kset->list, entry) {
++              if (kobject_name(k) && !strcmp(kobject_name(k), name)) {
++                      ret = kobject_get(k);
++                      break;
++              }
++      }
++      spin_unlock(&kset->list_lock);
++      return ret;
++}
++
++/*
++ * sysfs_ops show dispatcher: forward the read to the attribute's typed show
++ * callback. Returns -EIO when the attribute has no show callback.
++ */
++static ssize_t
++blktap_page_pool_show_attr(struct kobject *kobj, struct attribute *kattr,
++                         char *buf)
++{
++      struct pool_attribute *attr = kattr_to_pool_attr(kattr);
++
++      if (!attr->show)
++              return -EIO;
++
++      return attr->show(kobj_to_pool(kobj), buf);
++}
++
++/*
++ * sysfs_ops store dispatcher: forward the write to the attribute's typed
++ * store callback.
++ *
++ * Returns whatever the callback returns, or -EIO when the attribute has no
++ * store callback. The guard must test ->store, not ->show: an attribute
++ * with a show callback but a NULL store would otherwise be called through
++ * a NULL function pointer.
++ */
++static ssize_t
++blktap_page_pool_store_attr(struct kobject *kobj, struct attribute *kattr,
++                          const char *buf, size_t size)
++{
++      struct blktap_page_pool *pool = kobj_to_pool(kobj);
++      struct pool_attribute *attr = kattr_to_pool_attr(kattr);
++
++      if (attr->store)
++              return attr->store(pool, buf, size);
++
++      return -EIO;
++}
++
++/* Generic sysfs entry points; they dispatch to the pool_attribute callbacks. */
++static struct sysfs_ops blktap_page_pool_sysfs_ops = {
++      .show           = blktap_page_pool_show_attr,
++      .store          = blktap_page_pool_store_attr,
++};
++
++/*
++ * kobj_type release callback: destroy the pool's mempool and free the pool
++ * structure that embeds @kobj.
++ */
++static void
++blktap_page_pool_release(struct kobject *kobj)
++{
++      struct blktap_page_pool *pool = kobj_to_pool(kobj);
++      mempool_destroy(pool->bufs);
++      kfree(pool);
++}
++
++/* kobject type of a page pool: release hook, sysfs ops, default attributes. */
++struct kobj_type blktap_page_pool_ktype = {
++      .release       = blktap_page_pool_release,
++      .sysfs_ops     = &blktap_page_pool_sysfs_ops,
++      .default_attrs = blktap_page_pool_attrs,
++};
++
++/*
++ * mempool element allocator: allocate one page and mark it reserved.
++ *
++ * Returns NULL without trying to allocate when @gfp_mask lacks
++ * __GFP_WAIT, and NULL when alloc_page() fails. @pool_data is unused.
++ */
++static void*
++__mempool_page_alloc(gfp_t gfp_mask, void *pool_data)
++{
++      struct page *page;
++
++      /* Non-blocking callers never get a fresh page from here. */
++      if (!(gfp_mask & __GFP_WAIT))
++              return NULL;
++
++      page = alloc_page(gfp_mask);
++      if (page)
++              SetPageReserved(page);
++
++      return page;
++}
++
++/*
++ * mempool element destructor: undo __mempool_page_alloc() by clearing the
++ * reserved flag and dropping the page reference. @pool_data is unused.
++ */
++static void
++__mempool_page_free(void *element, void *pool_data)
++{
++      struct page *page = element;
++
++      ClearPageReserved(page);
++      put_page(page);
++}
++
++/*
++ * Allocate a page pool named @name with @nr_pages preallocated pages and
++ * register its kobject in pool_set.
++ *
++ * Returns the pool's embedded kobject, or NULL on any failure.
++ *
++ * Once kobject_init() has run, the pool is owned by its kobject. A failed
++ * kobject_add() is therefore unwound with kobject_put(), which frees the
++ * kobject's name and calls blktap_page_pool_release() to destroy the
++ * mempool and free the pool. Freeing the pool directly would leak the name.
++ */
++static struct kobject*
++blktap_page_pool_create(const char *name, int nr_pages)
++{
++      struct blktap_page_pool *pool;
++      int err;
++
++      pool = kzalloc(sizeof(*pool), GFP_KERNEL);
++      if (!pool)
++              goto fail;
++
++      spin_lock_init(&pool->lock);
++      init_waitqueue_head(&pool->wait);
++
++      pool->bufs = mempool_create(nr_pages,
++                                  __mempool_page_alloc, __mempool_page_free,
++                                  pool);
++      if (!pool->bufs)
++              goto fail_pool;
++
++      kobject_init(&pool->kobj, &blktap_page_pool_ktype);
++      pool->kobj.kset = pool_set;
++      err = kobject_add(&pool->kobj, &pool_set->kobj, "%s", name);
++      if (err) {
++              /* Drops the last reference; release() frees bufs and pool. */
++              kobject_put(&pool->kobj);
++              return NULL;
++      }
++
++      return &pool->kobj;
++
++fail_pool:
++      kfree(pool);
++fail:
++      return NULL;
++}
++
++/*
++ * Return the page pool called @name, creating it with POOL_DEFAULT_PAGES
++ * pages when no pool of that name exists in pool_set yet.
++ *
++ * Returns the pool, or ERR_PTR(-ENOMEM) when it could not be created.
++ */
++struct blktap_page_pool*
++blktap_page_pool_get(const char *name)
++{
++      struct kobject *kobj = __blktap_kset_find_obj(pool_set, name);
++
++      if (kobj)
++              return kobj_to_pool(kobj);
++
++      kobj = blktap_page_pool_create(name, POOL_DEFAULT_PAGES);
++      if (kobj)
++              return kobj_to_pool(kobj);
++
++      return ERR_PTR(-ENOMEM);
++}
++
++/*
++ * Set up the global request infrastructure: the "blktap-request" slab
++ * cache, a mempool of POOL_MIN_REQS requests on top of it, and the "pools"
++ * kset under @parent.
++ *
++ * Returns 0 on success or -ENOMEM when any step fails.
++ *
++ * NOTE(review): earlier allocations are not undone on a later failure;
++ * presumably the caller runs blktap_page_pool_exit() on error -- confirm.
++ */
++int __init
++blktap_page_pool_init(struct kobject *parent)
++{
++      request_cache =
++              kmem_cache_create("blktap-request",
++                                sizeof(struct blktap_request), 0,
++                                0, blktap_request_ctor);
++      if (!request_cache)
++              return -ENOMEM;
++
++      request_pool =
++              mempool_create_slab_pool(POOL_MIN_REQS, request_cache);
++      if (!request_pool)
++              return -ENOMEM;
++
++      pool_set = kset_create_and_add("pools", NULL, parent);
++      if (!pool_set)
++              return -ENOMEM;
++
++      return 0;
++}
++
++/*
++ * Tear down what blktap_page_pool_init() created, in reverse order. Each
++ * step is skipped when its object is NULL and each pointer is reset to
++ * NULL afterwards. The kset must be empty by now (BUG_ON otherwise).
++ */
++void
++blktap_page_pool_exit(void)
++{
++      if (pool_set) {
++              BUG_ON(!list_empty(&pool_set->list));
++              kset_unregister(pool_set);
++              pool_set = NULL;
++      }
++
++      if (request_pool) {
++              mempool_destroy(request_pool);
++              request_pool = NULL;
++      }
++
++      if (request_cache) {
++              kmem_cache_destroy(request_cache);
++              request_cache = NULL;
++      }
++}
diff --cc drivers/xen/blktap2-new/ring.c

index 0000000,0000000..bdf194b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2-new/ring.c
@@@ -1,0 -1,0 +1,547 @@@
++
++#include <linux/device.h>
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/poll.h>
++#include <linux/blkdev.h>
++
++#include "blktap.h"
++
++int blktap_ring_major;
++
++ /* 
++  * BLKTAP - immediately before the mmap area,
++  * we have a bunch of pages reserved for shared memory rings.
++  */
++#define RING_PAGES 1
++
++/*
++ * Handle one response read from the shared ring.
++ *
++ * rsp->id is taken as an index into ring->pending. The response is
++ * rejected with a warning when the index is out of range (-ERANGE), when
++ * no request is pending in that slot (-ESRCH), or when the operation does
++ * not match the pending request (-EINVAL). Only in the last case is there
++ * a request to complete, and it is ended with that error.
++ *
++ * A valid response ends the request with 0 for BLKIF_RSP_OKAY and -EIO for
++ * any other status.
++ */
++static void
++blktap_ring_read_response(struct blktap *tap,
++                   const struct blkif_response *rsp)
++{
++      struct blktap_ring *ring = &tap->ring;
++      struct blktap_request *request;
++      int usr_idx, err;
++
++      request = NULL;
++
++      usr_idx = rsp->id;
++      if (usr_idx < 0 || usr_idx >= MAX_PENDING_REQS) {
++              err = -ERANGE;
++              goto invalid;
++      }
++
++      request = ring->pending[usr_idx];
++
++      if (!request) {
++              err = -ESRCH;
++              goto invalid;
++      }
++
++      if (rsp->operation != request->operation) {
++              err = -EINVAL;
++              goto invalid;
++      }
++
++      dev_dbg(ring->dev,
++              "request %d [%p] response: %d\n",
++              request->usr_idx, request, rsp->status);
++
++      err = rsp->status == BLKIF_RSP_OKAY ? 0 : -EIO;
++end_request:
++      blktap_device_end_request(tap, request, err);
++      return;
++
++invalid:
++      /*
++       * request is still NULL for -ERANGE and -ESRCH, so it must not be
++       * dereferenced unconditionally; -1 is logged as its operation then.
++       */
++      dev_warn(ring->dev,
++               "invalid response, idx:%d status:%d op:%d/%d: err %d\n",
++               usr_idx, rsp->status,
++               rsp->operation, request ? request->operation : -1,
++               err);
++      if (request)
++              goto end_request;
++}
++
++/*
++ * Consume all responses currently published on the shared ring.
++ *
++ * Runs with current->mm->mmap_sem held for reading and does nothing when
++ * the ring is not mapped (ring->vma is NULL). Each response is copied to
++ * the stack before being handled, then rsp_cons is advanced past
++ * everything that was processed.
++ */
++static void
++blktap_read_ring(struct blktap *tap)
++{
++      struct blktap_ring *ring = &tap->ring;
++      struct blkif_response rsp;
++      RING_IDX rc, rp;
++
++      down_read(&current->mm->mmap_sem);
++      if (!ring->vma) {
++              up_read(&current->mm->mmap_sem);
++              return;
++      }
++
++      /* for each outstanding message on the ring  */
++      rp = ring->ring.sring->rsp_prod;
++      /* Read barrier between sampling rsp_prod and reading the entries. */
++      rmb();
++
++      for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
++              memcpy(&rsp, RING_GET_RESPONSE(&ring->ring, rc), sizeof(rsp));
++              blktap_ring_read_response(tap, &rsp);
++      }
++
++      ring->ring.rsp_cons = rc;
++
++      up_read(&current->mm->mmap_sem);
++}
++
++/* Fault handler of the ring mapping: every fault is answered with SIGBUS. */
++static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++      return VM_FAULT_SIGBUS;
++}
++
++/* End every request still pending on the tap's ring with -EIO. */
++static void
++blktap_ring_fail_pending(struct blktap *tap)
++{
++      struct blktap_ring *ring = &tap->ring;
++      int slot;
++
++      for (slot = 0; slot < MAX_PENDING_REQS; slot++) {
++              struct blktap_request *request = ring->pending[slot];
++
++              if (request)
++                      blktap_device_end_request(tap, request, -EIO);
++      }
++}
++
++/*
++ * vm_operations close hook of the ring mapping.
++ *
++ * Fails all pending requests, unmaps and frees the shared ring page, clears
++ * ring->vma and, if a shutdown was requested for this tap, destroys it.
++ */
++static void
++blktap_ring_vm_close(struct vm_area_struct *vma)
++{
++      struct blktap *tap = vma->vm_private_data;
++      struct blktap_ring *ring = &tap->ring;
++      struct page *page = virt_to_page(ring->ring.sring);
++
++      blktap_ring_fail_pending(tap);
++
++      /* Counterpart of the page setup done in blktap_ring_mmap(). */
++      zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
++      ClearPageReserved(page);
++      __free_page(page);
++
++      ring->vma = NULL;
++
++      if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++              blktap_control_destroy_tap(tap);
++}
++
++/* VMA callbacks installed on the ring mapping by blktap_ring_mmap(). */
++static struct vm_operations_struct blktap_ring_vm_operations = {
++      .close    = blktap_ring_vm_close,
++      .fault    = blktap_ring_fault,
++};
++
++/*
++ * Insert the pool page backing segment @seg of @request into the ring VMA,
++ * at the user address MMAP_VADDR() yields for (request->usr_idx, @seg).
++ *
++ * Returns the result of vm_insert_page().
++ */
++int
++blktap_ring_map_segment(struct blktap *tap,
++                      struct blktap_request *request,
++                      int seg)
++{
++      struct blktap_ring *ring = &tap->ring;
++
++      return vm_insert_page(ring->vma,
++                            MMAP_VADDR(ring->user_vstart,
++                                       request->usr_idx, seg),
++                            request->pages[seg]);
++}
++
++/*
++ * Map all pages of @request into the ring VMA. For every operation other
++ * than BLKIF_OP_READ the segment data is first copied into the pool page.
++ *
++ * Returns 0 on success. On the first mapping failure the request is
++ * unmapped again and that error is returned.
++ */
++int
++blktap_ring_map_request(struct blktap *tap,
++                      struct blktap_request *request)
++{
++      int seg, err = 0;
++      int write;
++
++      write = request->operation != BLKIF_OP_READ;
++
++      for (seg = 0; seg < request->nr_pages; seg++) {
++              if (write)
++                      blktap_request_bounce(tap, request, seg, 1);
++
++              err = blktap_ring_map_segment(tap, request, seg);
++              if (err)
++                      break;
++      }
++
++      if (err)
++              blktap_ring_unmap_request(tap, request);
++
++      return err;
++}
++
++/*
++ * Remove @request's pages from the ring VMA. For every operation other
++ * than BLKIF_OP_WRITE the pool pages are first copied back into the
++ * request's scatterlist buffers.
++ */
++void
++blktap_ring_unmap_request(struct blktap *tap,
++                        struct blktap_request *request)
++{
++      struct blktap_ring *ring = &tap->ring;
++      unsigned long uaddr;
++      unsigned size;
++      int seg, read;
++
++      uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0);
++      size  = request->nr_pages << PAGE_SHIFT;
++      read  = request->operation != BLKIF_OP_WRITE;
++
++      if (read)
++              for (seg = 0; seg < request->nr_pages; seg++)
++                      blktap_request_bounce(tap, request, seg, 0);
++
++      /* One zap covers the whole request, starting at segment 0. */
++      zap_page_range(ring->vma, uaddr, size, NULL);
++}
++
++/*
++ * Remove @request from the ring's pending table, decrement the pending
++ * counter and free the request together with its pages.
++ */
++void
++blktap_ring_free_request(struct blktap *tap,
++                       struct blktap_request *request)
++{
++      struct blktap_ring *ring = &tap->ring;
++
++      ring->pending[request->usr_idx] = NULL;
++      ring->n_pending--;
++
++      blktap_request_free(tap, request);
++}
++
++/*
++ * Allocate a request and claim the first free slot in the ring's pending
++ * table for it.
++ *
++ * Returns the request, ERR_PTR(-ENOSPC) when the ring is full, or
++ * ERR_PTR(-ENOMEM) when no request structure could be allocated.
++ */
++struct blktap_request*
++blktap_ring_make_request(struct blktap *tap)
++{
++      struct blktap_ring *ring = &tap->ring;
++      struct blktap_request *request;
++      int usr_idx;
++
++      if (RING_FULL(&ring->ring))
++              return ERR_PTR(-ENOSPC);
++
++      request = blktap_request_alloc(tap);
++      if (!request)
++              return ERR_PTR(-ENOMEM);
++
++      for (usr_idx = 0; usr_idx < BLK_RING_SIZE; usr_idx++)
++              if (!ring->pending[usr_idx])
++                      break;
++
++      /* The code treats a missing free slot on a non-full ring as a bug. */
++      BUG_ON(usr_idx >= BLK_RING_SIZE);
++
++      request->tap     = tap;
++      request->usr_idx = usr_idx;
++
++      ring->pending[usr_idx] = request;
++      ring->n_pending++;
++
++      return request;
++}
++
++/*
++ * Write @request into the next private producer slot of the ring, record
++ * the submission time and update the tap's per-operation statistics.
++ *
++ * Only req_prod_pvt is advanced here; the requests are published by the
++ * RING_PUSH_REQUESTS() call in blktap_ring_poll().
++ */
++void
++blktap_ring_submit_request(struct blktap *tap,
++                         struct blktap_request *request)
++{
++      struct blktap_ring *ring = &tap->ring;
++      struct blkif_request *breq;
++      struct scatterlist *sg;
++      int i, nsecs = 0;
++
++      dev_dbg(ring->dev,
++              "request %d [%p] submit\n", request->usr_idx, request);
++
++      breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
++
++      breq->id            = request->usr_idx;
++      breq->sector_number = blk_rq_pos(request->rq);
++      breq->handle        = 0;
++      breq->operation     = request->operation;
++      breq->nr_segments   = request->nr_pages;
++
++      blktap_for_each_sg(sg, request, i) {
++              struct blkif_request_segment *seg = &breq->seg[i];
++              int first, count;
++
++              /* ">> 9" converts bytes to 512-byte sectors. */
++              count = sg->length >> 9;
++              first = sg->offset >> 9;
++
++              seg->first_sect = first;
++              seg->last_sect  = first + count - 1;
++
++              nsecs += count;
++      }
++
++      ring->ring.req_prod_pvt++;
++
++      do_gettimeofday(&request->time);
++
++
++      switch (request->operation) {
++      case BLKIF_OP_WRITE:
++              tap->stats.st_wr_sect += nsecs;
++              tap->stats.st_wr_req++;
++              break;
++
++      case BLKIF_OP_READ:
++              tap->stats.st_rd_sect += nsecs;
++              tap->stats.st_rd_req++;
++              break;
++
++      case BLKIF_OP_PACKET:
++              tap->stats.st_pk_req++;
++              break;
++      }
++}
++
++/*
++ * open() on a ring device: bind the tap selected by the inode's minor
++ * number to this file and record the opening task as the ring owner.
++ *
++ * Returns 0 on success, -ENXIO when no tap exists for the minor or the tap
++ * is shutting down, and -EBUSY when another task already owns the ring.
++ */
++static int
++blktap_ring_open(struct inode *inode, struct file *filp)
++{
++      struct blktap *tap = NULL;
++      int minor;
++
++      minor = iminor(inode);
++
++      if (minor < blktap_max_minor)
++              tap = blktaps[minor];
++
++      if (!tap)
++              return -ENXIO;
++
++      if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++              return -ENXIO;
++
++      if (tap->ring.task)
++              return -EBUSY;
++
++      filp->private_data = tap;
++      tap->ring.task = current;
++
++      return 0;
++}
++
++/*
++ * release() on a ring device: synchronously destroy the block device, drop
++ * ring ownership, and destroy the tap if a shutdown was requested.
++ */
++static int
++blktap_ring_release(struct inode *inode, struct file *filp)
++{
++      struct blktap *tap = filp->private_data;
++
++      blktap_device_destroy_sync(tap);
++
++      tap->ring.task = NULL;
++
++      if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++              blktap_control_destroy_tap(tap);
++
++      return 0;
++}
++
++/*
++ * mmap() on a ring device: allocate a zeroed page for the shared ring,
++ * insert it at the start of @vma and initialise the front ring on it. The
++ * user data area is defined to start one page after the ring page.
++ *
++ * Returns 0 on success, -EBUSY when the ring is already mapped, -ENOMEM
++ * when no page is available, or the error from vm_insert_page().
++ */
++static int
++blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
++{
++      struct blktap *tap = filp->private_data;
++      struct blktap_ring *ring = &tap->ring;
++      struct blkif_sring *sring;
++      struct page *page = NULL;
++      int err;
++
++      if (ring->vma)
++              return -EBUSY;
++
++      page = alloc_page(GFP_KERNEL|__GFP_ZERO);
++      if (!page)
++              return -ENOMEM;
++
++      SetPageReserved(page);
++
++      err = vm_insert_page(vma, vma->vm_start, page);
++      if (err)
++              goto fail;
++
++      sring = page_address(page);
++      SHARED_RING_INIT(sring);
++      FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
++
++      ring->ring_vstart = vma->vm_start;
++      ring->user_vstart = ring->ring_vstart + PAGE_SIZE;
++
++      vma->vm_private_data = tap;
++
++      vma->vm_flags |= VM_DONTCOPY;
++      vma->vm_flags |= VM_RESERVED;
++
++      vma->vm_ops = &blktap_ring_vm_operations;
++
++      ring->vma = vma;
++      return 0;
++
++fail:
++      /* page is always non-NULL here; the only jump is after allocation. */
++      if (page) {
++              zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
++              ClearPageReserved(page);
++              __free_page(page);
++      }
++
++      return err;
++}
++
++/*
++ * ioctl() on a ring device. Refused with -EACCES unless the ring is mapped
++ * and the caller's mm is the one that holds the mapping.
++ *
++ * BLKTAP2_IOCTL_KICK_FE:       process responses on the ring, returns 0.
++ * BLKTAP2_IOCTL_CREATE_DEVICE: copy a struct blktap_params from user space
++ *                              and create the block device from it.
++ * BLKTAP2_IOCTL_REMOVE_DEVICE: destroy the block device.
++ *
++ * Any other command returns -ENOIOCTLCMD.
++ */
++static long
++blktap_ring_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
++{
++      struct blktap *tap = filp->private_data;
++      struct blktap_ring *ring = &tap->ring;
++
++      BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
++
++      if (!ring->vma || ring->vma->vm_mm != current->mm)
++              return -EACCES;
++
++      switch(cmd) {
++      case BLKTAP2_IOCTL_KICK_FE:
++
++              blktap_read_ring(tap);
++              return 0;
++
++      case BLKTAP2_IOCTL_CREATE_DEVICE: {
++              struct blktap_params params;
++              void __user *ptr = (void *)arg;
++
++              if (!arg)
++                      return -EINVAL;
++
++              if (copy_from_user(&params, ptr, sizeof(params)))
++                      return -EFAULT;
++
++              return blktap_device_create(tap, &params);
++      }
++
++      case BLKTAP2_IOCTL_REMOVE_DEVICE:
++
++              return blktap_device_destroy(tap);
++      }
++
++      return -ENOIOCTLCMD;
++}
++
++/*
++ * poll() on a ring device. Registers on both the page pool and the ring
++ * wait queues, runs the block device queue if the ring is mapped and a
++ * gendisk exists, then publishes newly produced requests.
++ *
++ * Reports POLLIN | POLLRDNORM when requests were published, when a message
++ * is set in the shared ring's private area, or when the device-closed flag
++ * was set (the flag is cleared by this check). Returns 0 otherwise.
++ */
++static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
++{
++      struct blktap *tap = filp->private_data;
++      struct blktap_ring *ring = &tap->ring;
++      int work;
++
++      poll_wait(filp, &tap->pool->wait, wait);
++      poll_wait(filp, &ring->poll_wait, wait);
++
++      down_read(&current->mm->mmap_sem);
++      if (ring->vma && tap->device.gd)
++              blktap_device_run_queue(tap);
++      up_read(&current->mm->mmap_sem);
++
++      /* Must be sampled before the push below updates req_prod. */
++      work = ring->ring.req_prod_pvt - ring->ring.sring->req_prod;
++      RING_PUSH_REQUESTS(&ring->ring);
++
++      if (work ||
++          ring->ring.sring->private.tapif_user.msg ||
++          test_and_clear_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse))
++              return POLLIN | POLLRDNORM;
++
++      return 0;
++}
++
++/* File operations of the ring character devices. */
++static const struct file_operations blktap_ring_file_operations = {
++      .owner    = THIS_MODULE,
++      .open     = blktap_ring_open,
++      .release  = blktap_ring_release,
++      .unlocked_ioctl = blktap_ring_ioctl,
++      .mmap     = blktap_ring_mmap,
++      .poll     = blktap_ring_poll,
++};
++
++/* Wake whoever sleeps on the ring's poll wait queue. */
++void
++blktap_ring_kick_user(struct blktap *tap)
++{
++      wake_up(&tap->ring.poll_wait);
++}
++
++/*
++ * Check whether the ring may be torn down.
++ *
++ * Returns 0 when no task owns the ring and it is not mapped, and -EBUSY
++ * otherwise. Nothing is released here.
++ */
++int
++blktap_ring_destroy(struct blktap *tap)
++{
++      if (!tap->ring.task && !tap->ring.vma)
++              return 0;
++
++      return -EBUSY;
++}
++
++/*
++ * Initialise the ring part of @tap: the poll wait queue and the device
++ * number built from blktap_ring_major and the tap's minor. Returns 0.
++ */
++int
++blktap_ring_create(struct blktap *tap)
++{
++      struct blktap_ring *ring = &tap->ring;
++
++      init_waitqueue_head(&ring->poll_wait);
++      ring->devno = MKDEV(blktap_ring_major, tap->minor);
++
++      return 0;
++}
++
++/*
++ * Format the list of pending ring requests into @buf, at most @size bytes:
++ * a "begin pending" header with the count, one line per occupied slot, and
++ * an "end pending" trailer.
++ *
++ * Returns the number of bytes written. scnprintf() keeps the cursor inside
++ * the buffer when the output is truncated; snprintf() would return the
++ * untruncated length and push the cursor past @end.
++ */
++size_t
++blktap_ring_debug(struct blktap *tap, char *buf, size_t size)
++{
++      struct blktap_ring *ring = &tap->ring;
++      char *s = buf, *end = buf + size;
++      int usr_idx;
++
++      s += scnprintf(s, end - s,
++                     "begin pending:%d\n", ring->n_pending);
++
++      for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
++              struct blktap_request *request;
++              struct timeval *time;
++              char op = '?';
++
++              request = ring->pending[usr_idx];
++              if (!request)
++                      continue;
++
++              switch (request->operation) {
++              case BLKIF_OP_WRITE:  op = 'W'; break;
++              case BLKIF_OP_READ:   op = 'R'; break;
++              case BLKIF_OP_PACKET: op = 'P'; break;
++              }
++              time  = &request->time;
++
++              /*
++               * tv_usec counts microseconds, so the fraction is six
++               * digits wide; nine digits would read as nanoseconds.
++               */
++              s += scnprintf(s, end - s,
++                             "%02d: usr_idx:%02d "
++                             "op:%c nr_pages:%02d time:%lu.%06lu\n",
++                             usr_idx, request->usr_idx,
++                             op, request->nr_pages,
++                             time->tv_sec, time->tv_usec);
++      }
++
++      s += scnprintf(s, end - s, "end pending\n");
++
++      return s - buf;
++}
++
++
++/*
++ * Register the "blktap2" character device range with a dynamically chosen
++ * major and remember that major in blktap_ring_major.
++ *
++ * Returns 0 on success or the negative error from __register_chrdev().
++ */
++int __init
++blktap_ring_init(void)
++{
++      int major;
++
++      major = __register_chrdev(0, 0, CONFIG_XEN_NR_TAP2_DEVICES, "blktap2",
++                                &blktap_ring_file_operations);
++      if (major >= 0) {
++              blktap_ring_major = major;
++              BTINFO("blktap ring major: %d\n", blktap_ring_major);
++              return 0;
++      }
++
++      BTERR("error registering ring devices: %d\n", major);
++      return major;
++}
++
++/* Unregister the "blktap2" character devices if a major was registered. */
++void
++blktap_ring_exit(void)
++{
++      if (blktap_ring_major) {
++              __unregister_chrdev(blktap_ring_major, 0,
++                                  CONFIG_XEN_NR_TAP2_DEVICES, "blktap2");
++              blktap_ring_major = 0;
++      }
++}
diff --cc drivers/xen/blktap2-new/sysfs.c

index 0000000,0000000..056b0dc

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2-new/sysfs.c
@@@ -1,0 -1,0 +1,299 @@@
++#include <linux/types.h>
++#include <linux/device.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/genhd.h>
++#include <linux/blkdev.h>
++
++#include "blktap.h"
++
++int blktap_debug_level = 1;
++
++static struct class *class;
++
++/*
++ * sysfs "name" store: set the tap's name to the written string.
++ *
++ * Returns @size on success, 0 when the device has no tap attached,
++ * -ENAMETOOLONG when @size reaches BLKTAP2_MAX_MESSAGE_LEN, and -EINVAL
++ * when the input contains a NUL byte within its first @size bytes.
++ */
++static ssize_t
++blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size)
++{
++      struct blktap *tap;
++
++      tap = dev_get_drvdata(dev);
++      if (!tap)
++              return 0;
++
++      if (size >= BLKTAP2_MAX_MESSAGE_LEN)
++              return -ENAMETOOLONG;
++
++      if (strnlen(buf, size) != size)
++              return -EINVAL;
++
++      /*
++       * NOTE(review): relies on buf being NUL-terminated right after size
++       * bytes and on tap->name holding at least BLKTAP2_MAX_MESSAGE_LEN
++       * bytes -- confirm against the struct definition.
++       */
++      strcpy(tap->name, buf);
++
++      return size;
++}
++
++/*
++ * sysfs "name" show: print the tap's name, or its minor number when no
++ * name has been set. Prints nothing when the device has no tap attached.
++ */
++static ssize_t
++blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf)
++{
++      struct blktap *tap = dev_get_drvdata(dev);
++
++      if (!tap)
++              return 0;
++
++      if (!tap->name[0])
++              return sprintf(buf, "%d\n", tap->minor);
++
++      return sprintf(buf, "%s\n", tap->name);
++}
++static DEVICE_ATTR(name, S_IRUGO|S_IWUSR,
++                 blktap_sysfs_get_name, blktap_sysfs_set_name);
++
++/* Work item scheduled by the "remove" attribute: destroy the owning tap. */
++static void
++blktap_sysfs_remove_work(struct work_struct *work)
++{
++      struct blktap *tap
++              = container_of(work, struct blktap, remove_work);
++      blktap_control_destroy_tap(tap);
++}
++
++/*
++ * sysfs "remove" store: request shutdown of the tap and wait until it is
++ * gone.
++ *
++ * The first writer sets BLKTAP_SHUTDOWN_REQUESTED. If the ring is mapped, a
++ * CLOSE message is posted in the shared ring and the ring's poll waiters
++ * are woken; otherwise destruction is scheduled as a work item. Every
++ * writer then sleeps until the device's drvdata has been cleared.
++ *
++ * Returns @size, or the error from wait_event_interruptible() when the
++ * wait was interrupted.
++ */
++static ssize_t
++blktap_sysfs_remove_device(struct device *dev,
++                         struct device_attribute *attr,
++                         const char *buf, size_t size)
++{
++      struct blktap *tap;
++      int err;
++
++      tap = dev_get_drvdata(dev);
++      if (!tap)
++              return size;
++
++      /* Shutdown already requested by someone else: just wait for it. */
++      if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++              goto wait;
++
++      if (tap->ring.vma) {
++              struct blkif_sring *sring = tap->ring.ring.sring;
++              sring->private.tapif_user.msg = BLKTAP2_RING_MESSAGE_CLOSE;
++              blktap_ring_kick_user(tap);
++      } else {
++              INIT_WORK(&tap->remove_work, blktap_sysfs_remove_work);
++              schedule_work(&tap->remove_work);
++      }
++wait:
++      err = wait_event_interruptible(tap->remove_wait,
++                                     !dev_get_drvdata(dev));
++      if (err)
++              return err;
++
++      return size;
++}
++static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
++
++/*
++ * sysfs "debug" show: concatenate the control, request, device and ring
++ * debug dumps into @buf, limited to PAGE_SIZE bytes in total.
++ *
++ * Returns the number of bytes produced, or 0 when no tap is attached.
++ */
++static ssize_t
++blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf)
++{
++      struct blktap *tap;
++      char *s = buf, *end = buf + PAGE_SIZE;
++
++      tap = dev_get_drvdata(dev);
++      if (!tap)
++              return 0;
++
++      /* Each helper receives the space left and returns what it used. */
++      s += blktap_control_debug(tap, s, end - s);
++
++      s += blktap_request_debug(tap, s, end - s);
++
++      s += blktap_device_debug(tap, s, end - s);
++
++      s += blktap_ring_debug(tap, s, end - s);
++
++      return s - buf;
++}
++static DEVICE_ATTR(debug, S_IRUGO, blktap_sysfs_debug_device, NULL);
++
++/*
++ * sysfs "task" show: print the pid of the task that owns the ring. Prints
++ * nothing when no tap is attached or the ring has no owner.
++ */
++static ssize_t
++blktap_sysfs_show_task(struct device *dev, struct device_attribute *attr, char *buf)
++{
++      struct blktap *tap = dev_get_drvdata(dev);
++
++      if (!tap || !tap->ring.task)
++              return 0;
++
++      return sprintf(buf, "%d\n", tap->ring.task->pid);
++}
++static DEVICE_ATTR(task, S_IRUGO, blktap_sysfs_show_task, NULL);
++
++/*
++ * sysfs "pool" show: print the name of the page pool the tap uses.
++ *
++ * Prints nothing when the device has no tap attached, as the other show
++ * handlers in this file do; blktap_sysfs_destroy() clears the drvdata
++ * before the device is unregistered.
++ */
++static ssize_t
++blktap_sysfs_show_pool(struct device *dev,
++                     struct device_attribute *attr,
++                     char *buf)
++{
++      struct blktap *tap = dev_get_drvdata(dev);
++
++      if (!tap)
++              return 0;
++
++      return sprintf(buf, "%s", kobject_name(&tap->pool->kobj));
++}
++
++/*
++ * sysfs "pool" store: switch the tap to the page pool named by @buf,
++ * creating that pool if necessary, and drop the reference on the old one.
++ *
++ * Returns @size on success, -ENODEV when the device has no tap attached
++ * (blktap_sysfs_destroy() clears the drvdata), -EBUSY while a gendisk
++ * exists, or the error from blktap_page_pool_get().
++ */
++static ssize_t
++blktap_sysfs_store_pool(struct device *dev,
++                      struct device_attribute *attr,
++                      const char *buf, size_t size)
++{
++      struct blktap *tap = dev_get_drvdata(dev);
++      struct blktap_page_pool *pool, *tmp;
++
++      /* Check before touching tap->pool. */
++      if (!tap)
++              return -ENODEV;
++
++      if (tap->device.gd)
++              return -EBUSY;
++
++      pool = blktap_page_pool_get(buf);
++      if (IS_ERR(pool))
++              return PTR_ERR(pool);
++
++      tmp = tap->pool;
++      tap->pool = pool;
++      kobject_put(&tmp->kobj);
++
++      return size;
++}
++static DEVICE_ATTR(pool, S_IRUSR|S_IWUSR,
++                 blktap_sysfs_show_pool, blktap_sysfs_store_pool);
++
++/*
++ * Create the sysfs device "blktap<minor>" for @tap and its attribute files
++ * (name, remove, debug, task, pool). On success the device is stored in
++ * tap->ring.dev.
++ *
++ * Returns 0 on success or a negative error. When device_create() itself
++ * fails there is no device to unregister: its return value is an ERR_PTR
++ * and must not be handed to device_unregister(). When an attribute file
++ * cannot be created, the device is unregistered again.
++ */
++int
++blktap_sysfs_create(struct blktap *tap)
++{
++      struct blktap_ring *ring = &tap->ring;
++      struct device *dev;
++      int err;
++
++      init_waitqueue_head(&tap->remove_wait);
++
++      dev = device_create(class, NULL, ring->devno,
++                          tap, "blktap%d", tap->minor);
++      if (IS_ERR(dev))
++              return PTR_ERR(dev);
++
++      err = device_create_file(dev, &dev_attr_name);
++      if (!err)
++              err = device_create_file(dev, &dev_attr_remove);
++      if (!err)
++              err = device_create_file(dev, &dev_attr_debug);
++      if (!err)
++              err = device_create_file(dev, &dev_attr_task);
++      if (!err)
++              err = device_create_file(dev, &dev_attr_pool);
++      if (err) {
++              device_unregister(dev);
++              return err;
++      }
++
++      ring->dev = dev;
++
++      return 0;
++}
++
++/*
++ * Remove the tap's sysfs device. The drvdata is cleared and remove_wait
++ * waiters are woken before the device is unregistered. Does nothing when
++ * no device was created.
++ */
++void
++blktap_sysfs_destroy(struct blktap *tap)
++{
++      struct blktap_ring *ring = &tap->ring;
++      struct device *dev;
++
++      dev = ring->dev;
++
++      if (!dev)
++              return;
++
++      /* Satisfies the wait condition in blktap_sysfs_remove_device(). */
++      dev_set_drvdata(dev, NULL);
++      wake_up(&tap->remove_wait);
++
++      device_unregister(dev);
++      ring->dev = NULL;
++}
++
++/* Class attribute "verbosity" show: print the current blktap_debug_level. */
++static ssize_t
++blktap_sysfs_show_verbosity(struct class *class, struct class_attribute *attr,
++                          char *buf)
++{
++      return sprintf(buf, "%d\n", blktap_debug_level);
++}
++
++/*
++ * Class attribute "verbosity" store: parse a decimal integer from @buf and
++ * make it the new blktap_debug_level.
++ *
++ * Returns @size on success or -EINVAL when no integer could be parsed.
++ */
++static ssize_t
++blktap_sysfs_set_verbosity(struct class *class, struct class_attribute *attr,
++                         const char *buf, size_t size)
++{
++      int level;
++
++      if (sscanf(buf, "%d", &level) != 1)
++              return -EINVAL;
++
++      blktap_debug_level = level;
++      return size;
++}
++static CLASS_ATTR(verbosity, S_IRUGO|S_IWUSR,
++                blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
++
++/*
++ * Class attribute "devices" show: print one "<minor> <name>" line for each
++ * tap that has the BLKTAP_DEVICE bit set, under blktap_lock.
++ *
++ * Returns the number of bytes written. Output is bounded to PAGE_SIZE with
++ * scnprintf(); the line count scales with blktap_max_minor, so an
++ * unbounded sprintf() could run past the end of the buffer.
++ */
++static ssize_t
++blktap_sysfs_show_devices(struct class *class, struct class_attribute *attr,
++                        char *buf)
++{
++      int i, ret;
++      struct blktap *tap;
++
++      mutex_lock(&blktap_lock);
++
++      ret = 0;
++      for (i = 0; i < blktap_max_minor; i++) {
++              tap = blktaps[i];
++              if (!tap)
++                      continue;
++
++              if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++                      continue;
++
++              ret += scnprintf(buf + ret, PAGE_SIZE - ret,
++                               "%d %s\n", tap->minor, tap->name);
++      }
++
++      mutex_unlock(&blktap_lock);
++
++      return ret;
++}
++static CLASS_ATTR(devices, S_IRUGO, blktap_sysfs_show_devices, NULL);
++
++/*
++ * Class devnode callback: build the device node path
++ * BLKTAP2_DEV_DIR "blktap<minor>" in a kasprintf()-allocated string.
++ * @mode is not touched.
++ */
++static char *blktap_devnode(struct device *dev, mode_t *mode)
++{
++      return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "blktap%u",
++                       MINOR(dev->devt));
++}
++
++/* Destroy the blktap2 device class if it was created. */
++void
++blktap_sysfs_exit(void)
++{
++      if (!class)
++              return;
++
++      class_destroy(class);
++}
++
++/*
++ * Create the "blktap2" device class, install the devnode callback and add
++ * the class attributes "verbosity" and "devices". The class is published
++ * in the file-scope "class" variable only when every step succeeded.
++ *
++ * Returns 0 on success or a negative error.
++ */
++int __init
++blktap_sysfs_init(void)
++{
++      struct class *cls;
++      int err = 0;
++
++      cls = class_create(THIS_MODULE, "blktap2");
++      if (IS_ERR(cls))
++              err = PTR_ERR(cls);
++      else
++              cls->devnode = blktap_devnode;
++      if (!err)
++              err = class_create_file(cls, &class_attr_verbosity);
++      if (!err)
++              err = class_create_file(cls, &class_attr_devices);
++      if (!err)
++              class = cls;
++      else
++              /*
++               * NOTE(review): cls is an ERR_PTR here when class_create()
++               * failed; confirm class_destroy() tolerates that.
++               */
++              class_destroy(cls);
++
++      return err;
++}
diff --cc drivers/xen/blktap2/Makefile

index 0000000,0000000..8bb330c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2/Makefile
@@@ -1,0 -1,0 +1,4 @@@
++# blktap2 is a composite object; sysfs.o is only linked in with CONFIG_SYSFS.
++obj-$(CONFIG_XEN_BLKDEV_TAP2) := blktap2.o
++
++blktap2-y := control.o ring.o wait_queue.o device.o request.o
++blktap2-$(CONFIG_SYSFS) += sysfs.o
diff --cc drivers/xen/blktap2/blktap.h

index 0000000,0000000..4726348

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2/blktap.h
@@@ -1,0 -1,0 +1,264 @@@
++#ifndef _BLKTAP_H_
++#define _BLKTAP_H_
++
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/poll.h>
++#include <linux/cdev.h>
++#include <linux/init.h>
++#include <linux/scatterlist.h>
++#include <xen/blkif.h>
++#include <xen/gnttab.h>
++
++//#define ENABLE_PASSTHROUGH
++
++/* Threshold consulted by BTPRINTK(); defined outside this header. */
++extern int blktap_debug_level;
++
++/*
++ * Logging helper.  A message is printed only when blktap_debug_level is
++ * strictly greater than @level; unless @force is non-zero it must also pass
++ * printk_ratelimit().  Each message is prefixed with the caller's function
++ * name.
++ */
++#define BTPRINTK(level, tag, force, _f, _a...)                                \
++      do {                                                            \
++              if (blktap_debug_level > level &&                       \
++                  (force || printk_ratelimit()))                      \
++                      printk(tag "%s: " _f, __func__, ##_a);          \
++      } while (0)
++
++/* BTDBG needs a debug level above 8 and is never rate limited;
++ * the other three need a level above 0 and are rate limited. */
++#define BTDBG(_f, _a...)             BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
++#define BTINFO(_f, _a...)            BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
++#define BTWARN(_f, _a...)            BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
++#define BTERR(_f, _a...)             BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
++
++/* Prefix of the device node names, e.g. BLKTAP2_DEV_DIR "control". */
++#define BLKTAP2_DEV_DIR "xen/blktap-2/"
++
++/* Bit numbers within struct blktap.dev_inuse (set_bit()/test_bit()). */
++#define BLKTAP_CONTROL               1
++#define BLKTAP_RING_FD               2
++#define BLKTAP_RING_VMA              3
++#define BLKTAP_DEVICE                4
++#define BLKTAP_SYSFS                 5
++#define BLKTAP_PAUSE_REQUESTED       6
++#define BLKTAP_PAUSED                7
++#define BLKTAP_SHUTDOWN_REQUESTED    8
++#define BLKTAP_PASSTHROUGH           9
++#define BLKTAP_DEFERRED              10
++
++/* blktap IOCTLs: */
++#define BLKTAP2_IOCTL_KICK_FE        1
++#define BLKTAP2_IOCTL_ALLOC_TAP            200
++#define BLKTAP2_IOCTL_FREE_TAP       201
++#define BLKTAP2_IOCTL_CREATE_DEVICE  202
++#define BLKTAP2_IOCTL_SET_PARAMS     203
++#define BLKTAP2_IOCTL_PAUSE          204
++#define BLKTAP2_IOCTL_REOPEN         205
++#define BLKTAP2_IOCTL_RESUME         206
++
++/* Size in bytes of blktap_params.name, terminating NUL included. */
++#define BLKTAP2_MAX_MESSAGE_LEN      256
++
++/* Ring message codes.  NOTE(review): their users are not in this header;
++ * presumably exchanged with the ring's user-space side -- confirm in ring.c. */
++#define BLKTAP2_RING_MESSAGE_PAUSE   1
++#define BLKTAP2_RING_MESSAGE_RESUME  2
++#define BLKTAP2_RING_MESSAGE_CLOSE   3
++
++/* Values of blktap_request.status. */
++#define BLKTAP_REQUEST_FREE          0
++#define BLKTAP_REQUEST_PENDING       1
++
++/*
++ * The maximum number of requests that can be outstanding at any time
++ * is determined by
++ *
++ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
++ *
++ * where mmap_alloc < MAX_DYNAMIC_MEM.
++ *
++ * TODO:
++ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
++ * sysfs.
++ */
++#define BLK_RING_SIZE         __CONST_RING_SIZE(blkif, PAGE_SIZE)
++#define MAX_DYNAMIC_MEM               BLK_RING_SIZE
++#define MAX_PENDING_REQS      BLK_RING_SIZE
++#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
++
++/*
++ * Address of segment @_seg of request slot @_req inside a mapping that
++ * starts at @_start: every request slot owns
++ * BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive pages.  All three arguments
++ * are parenthesised so that arbitrary expressions may be passed.
++ */
++#define MMAP_VADDR(_start, _req, _seg)                                        \
++        ((_start) +                                                     \
++         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
++         ((_seg) * PAGE_SIZE))
++
++/*
++ * Reference counting on struct blktap.refcnt.  Dropping the last reference
++ * wakes up whoever sleeps on the tap's wait queue.  Note that blktap_put()
++ * evaluates its argument twice.
++ */
++#define blktap_get(_b) (atomic_inc(&(_b)->refcnt))
++#define blktap_put(_b)                                        \
++      do {                                            \
++              if (atomic_dec_and_test(&(_b)->refcnt)) \
++                      wake_up(&(_b)->wq);             \
++      } while (0)
++
++struct blktap;
++
++/*
++ * Grant handles of one request segment: one for the kernel-address mapping
++ * and one for the user-address mapping.  INVALID_GRANT_HANDLE marks a
++ * handle that currently refers to no mapping.
++ */
++struct grant_handle_pair {
++      grant_handle_t                 kernel;
++      grant_handle_t                 user;
++};
++#define INVALID_GRANT_HANDLE           0xFFFF
++
++/*
++ * Reply of BLKTAP2_IOCTL_ALLOC_TAP as copied to user space: the ring major,
++ * the block device major and the minor of the newly allocated tap.
++ */
++struct blktap_handle {
++      unsigned int                   ring;
++      unsigned int                   device;
++      unsigned int                   minor;
++};
++
++/*
++ * Device parameters.  name is forced to be NUL-terminated by
++ * blktap_validate_params().  NOTE(review): the units of capacity and
++ * sector_size are not stated here -- presumably sectors and bytes; confirm
++ * against the code that consumes them.
++ */
++struct blktap_params {
++      char                           name[BLKTAP2_MAX_MESSAGE_LEN];
++      unsigned long long             capacity;
++      unsigned long                  sector_size;
++};
++
++/*
++ * Block-device half of a tap: users counts opens of the block device,
++ * lock is taken (irq-disabling) around updates to the gendisk's queue,
++ * gd is the gendisk itself.  bdev only exists for passthrough builds.
++ */
++struct blktap_device {
++      int                            users;
++      spinlock_t                     lock;
++      struct gendisk                *gd;
++
++#ifdef ENABLE_PASSTHROUGH
++      struct block_device           *bdev;
++#endif
++};
++
++/*
++ * Ring half of a tap.  vma is the user mapping of the ring; request data
++ * pages live at MMAP_VADDR(user_vstart, slot, seg) inside it, and
++ * foreign_map.map is indexed by page offset from vma->vm_start.
++ * NOTE(review): the roles of ring_vstart, response and the sysfs_* fields
++ * are not visible from this header -- see ring.c / sysfs.c.
++ */
++struct blktap_ring {
++      struct vm_area_struct         *vma;
++      blkif_front_ring_t             ring;
++      struct vm_foreign_map          foreign_map;
++      unsigned long                  ring_vstart;
++      unsigned long                  user_vstart;
++
++      int                            response;
++
++      wait_queue_head_t              poll_wait;
++
++      dev_t                          devno;
++      struct device                 *dev;
++      atomic_t                       sysfs_refcnt;
++      struct mutex                   sysfs_mutex;
++};
++
++/*
++ * Per-tap I/O counters.  NOTE(review): nothing in this header shows who
++ * updates them; going by the names they count read/write requests, sectors
++ * and latencies in microseconds -- confirm against the updating code.
++ */
++struct blktap_statistics {
++      unsigned long                  st_print;
++      int                            st_rd_req;
++      int                            st_wr_req;
++      int                            st_oo_req;
++      int                            st_pk_req;
++      int                            st_rd_sect;
++      int                            st_wr_sect;
++      s64                            st_rd_cnt;
++      s64                            st_rd_sum_usecs;
++      s64                            st_rd_max_usecs;
++      s64                            st_wr_cnt;
++      s64                            st_wr_sum_usecs;
++      s64                            st_wr_max_usecs; 
++};
++
++/*
++ * One in-flight request.  id carries the originating struct request
++ * pointer (cast through unsigned long), usr_idx is the request's slot in
++ * blktap.pending_requests[] and in the user mapping, status is one of
++ * BLKTAP_REQUEST_*, nr_pages is the number of segments in use and
++ * operation a BLKIF_OP_* code.  handles[] holds the grant handles of each
++ * segment.
++ */
++struct blktap_request {
++      uint64_t                       id;
++      uint16_t                       usr_idx;
++
++      uint8_t                        status;
++      atomic_t                       pendcnt;
++      uint8_t                        nr_pages;
++      unsigned short                 operation;
++
++      struct timeval                 time;
++      struct grant_handle_pair       handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++      struct list_head               free_list;
++};
++
++/*
++ * One tap instance.  minor is its index in blktaps[]; dev_inuse is a
++ * bitmask of the BLKTAP_* bit numbers and is zero when the tap may be
++ * reused; refcnt/wq implement blktap_get()/blktap_put().  tap_sem must be
++ * held by the request map/unmap paths (see device.c).
++ */
++struct blktap {
++      int                            minor;
++      pid_t                          pid;
++      atomic_t                       refcnt;
++      unsigned long                  dev_inuse;
++
++      struct blktap_params           params;
++
++      struct rw_semaphore            tap_sem;
++
++      struct blktap_ring             ring;
++      struct blktap_device           device;
++
++      int                            pending_cnt;
++      struct blktap_request         *pending_requests[MAX_PENDING_REQS];
++      struct scatterlist             sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++
++      wait_queue_head_t              wq;
++      struct list_head               deferred_queue;
++
++      struct blktap_statistics       stats;
++};
++
++/* Table of all taps, indexed by minor; the array itself lives in control.c. */
++extern struct blktap *blktaps[];
++
++/* A tap is considered active while BLKTAP_RING_VMA is set in dev_inuse. */
++static inline int
++blktap_active(struct blktap *tap)
++{
++      const int ring_mapped = test_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
++
++      return ring_mapped;
++}
++
++/*
++ * Check @params before use.  At present this merely forces
++ * @params->name to be NUL-terminated and logs the values; the result is
++ * always 0.  @tap is unused.
++ */
++static inline int
++blktap_validate_params(struct blktap *tap, struct blktap_params *params)
++{
++      const size_t last = sizeof(params->name) - 1;
++
++      /* TODO: sanity check */
++      params->name[last] = '\0';
++
++      BTINFO("%s: capacity: %llu, sector-size: %lu\n",
++             params->name, params->capacity, params->sector_size);
++
++      return 0;
++}
++
++/* control.c: tear a tap down; finish_destroy returns non-zero once
++ * dev_inuse has dropped to zero. */
++int blktap_control_destroy_device(struct blktap *);
++int blktap_control_finish_destroy(struct blktap *);
++
++/* Ring interface.  blktap_ring_init() is handed the address of the ring
++ * major by blktap_init() -- presumably to fill it in; confirm in ring.c. */
++int blktap_ring_init(int *);
++int blktap_ring_free(void);
++int blktap_ring_create(struct blktap *);
++int blktap_ring_destroy(struct blktap *);
++int blktap_ring_pause(struct blktap *);
++int blktap_ring_resume(struct blktap *);
++void blktap_ring_kick_user(struct blktap *);
++
++/*
++ * sysfs interface.  Without CONFIG_SYSFS every entry point collapses to a
++ * no-op stub of the same name and signature, so callers need no #ifdefs.
++ */
++#ifdef CONFIG_SYSFS
++int blktap_sysfs_init(void);
++void blktap_sysfs_free(void);
++int blktap_sysfs_create(struct blktap *);
++int blktap_sysfs_destroy(struct blktap *);
++#else
++static inline int blktap_sysfs_init(void) { return 0; }
++/* name must match the prototype above: blktap_exit() calls blktap_sysfs_free() */
++static inline void blktap_sysfs_free(void) {}
++static inline int blktap_sysfs_create(struct blktap *tapdev) { return 0; }
++static inline int blktap_sysfs_destroy(struct blktap *tapdev) { return 0; }
++#endif
++
++/* Block device interface (device.c).  blktap_device_init() is handed the
++ * address of the device major by blktap_init(). */
++int blktap_device_init(int *);
++void blktap_device_free(void);
++int blktap_device_create(struct blktap *);
++int blktap_device_destroy(struct blktap *);
++int blktap_device_pause(struct blktap *);
++int blktap_device_resume(struct blktap *);
++void blktap_device_restart(struct blktap *);
++/* Complete @request with the status carried by the response; the caller
++ * holds tap->tap_sem. */
++void blktap_device_finish_request(struct blktap *,
++                                blkif_response_t *,
++                                struct blktap_request *);
++/* Fail every pending request with -ENODEV and detach the queue. */
++void blktap_device_fail_pending_requests(struct blktap *);
++#ifdef ENABLE_PASSTHROUGH
++int blktap_device_enable_passthrough(struct blktap *,
++                                   unsigned, unsigned);
++#endif
++
++/* Deferred work.  NOTE(review): implementation not visible here. */
++void blktap_defer(struct blktap *);
++void blktap_run_deferred(void);
++
++/* Request pool: allocation of struct blktap_request and their pages. */
++int blktap_request_pool_init(void);
++void blktap_request_pool_free(void);
++int blktap_request_pool_grow(void);
++int blktap_request_pool_shrink(void);
++struct blktap_request *blktap_request_allocate(struct blktap *);
++void blktap_request_free(struct blktap *, struct blktap_request *);
++/* Page backing segment @seg (second argument) of a request. */
++struct page *request_to_page(struct blktap_request *, int);
++
++/* Kernel virtual address of the page backing segment @seg of @req. */
++static inline unsigned long
++request_to_kaddr(struct blktap_request *req, int seg)
++{
++      struct page *page = request_to_page(req, seg);
++
++      return (unsigned long)pfn_to_kaddr(page_to_pfn(page));
++}
++
++#endif
diff --cc drivers/xen/blktap2/control.c

index 0000000,0000000..f447143

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2/control.c
@@@ -1,0 -1,0 +1,285 @@@
++#include <linux/module.h>
++#include <linux/miscdevice.h>
++
++#include "blktap.h"
++
++/* Held while blktaps[] is scanned for, or assigned, a slot. */
++static DEFINE_SPINLOCK(blktap_control_lock);
++/* All taps, indexed by minor number. */
++struct blktap *blktaps[CONFIG_XEN_NR_TAP2_DEVICES];
++
++/* Passed by address to blktap_ring_init() / blktap_device_init() and
++ * reported to user space by BLKTAP2_IOCTL_ALLOC_TAP. */
++static int ring_major;
++static int device_major;
++/* Set once misc_register() has succeeded for the control device. */
++static int blktap_control_registered;
++
++/*
++ * Bring @tap into its freshly-allocated state: everything is zeroed except
++ * the minor number, BLKTAP_CONTROL is set in dev_inuse and the semaphore,
++ * wait queue, scatterlist and reference count are initialised.
++ */
++static void
++blktap_control_initialize_tap(struct blktap *tap)
++{
++      const int saved_minor = tap->minor;
++
++      memset(tap, 0, sizeof(*tap));
++      tap->minor = saved_minor;
++
++      set_bit(BLKTAP_CONTROL, &tap->dev_inuse);
++      atomic_set(&tap->refcnt, 0);
++
++      init_rwsem(&tap->tap_sem);
++      init_waitqueue_head(&tap->wq);
++      sg_init_table(tap->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++}
++
++/*
++ * Allocate a new tap and install it in the first free slot of blktaps[].
++ *
++ * The structure is zero-allocated because
++ * blktap_control_initialize_tap() reads tap->minor before clearing the
++ * structure; with plain kmalloc() that read hit uninitialised memory.
++ *
++ * Returns the new tap, or NULL if the allocation fails or every slot is
++ * already occupied.
++ */
++static struct blktap *
++blktap_control_create_tap(void)
++{
++      int minor;
++      struct blktap *tap;
++
++      tap = kzalloc(sizeof(*tap), GFP_KERNEL);
++      if (unlikely(!tap))
++              return NULL;
++
++      blktap_control_initialize_tap(tap);
++
++      spin_lock_irq(&blktap_control_lock);
++
++      for (minor = 0; minor < CONFIG_XEN_NR_TAP2_DEVICES; minor++)
++              if (!blktaps[minor])
++                      break;
++
++      if (minor < CONFIG_XEN_NR_TAP2_DEVICES) {
++              tap->minor = minor;
++              blktaps[minor] = tap;
++      }
++
++      spin_unlock_irq(&blktap_control_lock);
++
++      if (minor == CONFIG_XEN_NR_TAP2_DEVICES) {
++              /* table full: drop the unused tap, outside the spinlock */
++              kfree(tap);
++              return NULL;
++      }
++
++      return tap;
++}
++
++/*
++ * Find or create a tap and attach a ring to it.
++ *
++ * blktaps[] is scanned under blktap_control_lock.  The scan ends at the
++ * first empty slot, or at the first existing tap whose dev_inuse is zero,
++ * which is re-initialised and reused.  If nothing was picked up for reuse
++ * a fresh tap comes from blktap_control_create_tap().
++ *
++ * Returns the tap, or NULL on failure.  If ring creation fails the
++ * BLKTAP_CONTROL bit is cleared again before returning NULL.
++ */
++static struct blktap *
++blktap_control_allocate_tap(void)
++{
++      int err, minor;
++      struct blktap *tap = NULL;
++
++      /*
++       * This is called only from the ioctl, which
++       * means we should always have interrupts enabled.
++       */
++      BUG_ON(irqs_disabled());
++
++      spin_lock_irq(&blktap_control_lock);
++      for (minor = 0; minor < CONFIG_XEN_NR_TAP2_DEVICES; minor++) {
++              struct blktap *slot = blktaps[minor];
++
++              if (!slot)
++                      break;
++
++              if (!slot->dev_inuse) {
++                      blktap_control_initialize_tap(slot);
++                      tap = slot;
++                      break;
++              }
++      }
++      spin_unlock_irq(&blktap_control_lock);
++
++      if (!tap)
++              tap = blktap_control_create_tap();
++      if (!tap)
++              return NULL;
++
++      err = blktap_ring_create(tap);
++      if (!err) {
++              BTINFO("allocated tap %p\n", tap);
++              return tap;
++      }
++
++      BTERR("ring creation failed: %d\n", err);
++      clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
++      return NULL;
++}
++
++/*
++ * ioctl handler of the control device.
++ *
++ * BLKTAP2_IOCTL_ALLOC_TAP: allocate a tap and copy a struct blktap_handle
++ *   (ring major, device major, minor) to the user pointer in @arg.
++ *   Returns -ENOMEM if no tap could be allocated and -EFAULT (after
++ *   destroying the tap again) if the copy fails.
++ * BLKTAP2_IOCTL_FREE_TAP: destroy the tap whose minor is @arg; -EINVAL if
++ *   the minor is out of range or unused.
++ * Anything else yields -ENOIOCTLCMD.
++ */
++static long
++blktap_control_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
++{
++      if (cmd == BLKTAP2_IOCTL_ALLOC_TAP) {
++              struct blktap_handle h;
++              struct blktap *tap = blktap_control_allocate_tap();
++
++              if (!tap) {
++                      BTERR("error allocating device\n");
++                      return -ENOMEM;
++              }
++
++              h.ring   = ring_major;
++              h.device = device_major;
++              h.minor  = tap->minor;
++
++              if (!copy_to_user((struct blktap_handle __user *)arg,
++                                &h, sizeof(h)))
++                      return 0;
++
++              blktap_control_destroy_device(tap);
++              return -EFAULT;
++      }
++
++      if (cmd == BLKTAP2_IOCTL_FREE_TAP) {
++              unsigned long dev = arg;
++
++              if (dev >= CONFIG_XEN_NR_TAP2_DEVICES || !blktaps[dev])
++                      return -EINVAL;
++
++              blktap_control_destroy_device(blktaps[dev]);
++              return 0;
++      }
++
++      return -ENOIOCTLCMD;
++}
++
++/* The control device only implements ioctl(). */
++static const struct file_operations blktap_control_file_operations = {
++      .owner    = THIS_MODULE,
++      .unlocked_ioctl = blktap_control_ioctl,
++};
++
++/* Control device: a misc device with a dynamically assigned minor whose
++ * node name is BLKTAP2_DEV_DIR "control". */
++static struct miscdevice blktap_misc = {
++      .minor    = MISC_DYNAMIC_MINOR,
++      .name     = "blktap-control",
++      .nodename = BLKTAP2_DEV_DIR "control",
++      .fops     = &blktap_control_file_operations,
++};
++
++/*
++ * Tear down @tap: block device first, then ring, then sysfs entries.
++ *
++ * BLKTAP_SHUTDOWN_REQUESTED is set for the duration of the attempt.  When
++ * one of the destroy steps fails, the function sleeps (interruptibly) until
++ * dev_inuse differs from the snapshot taken just before that step and then
++ * restarts from the first step.  A signal ends the loop early.
++ *
++ * Returns 0 for a NULL @tap or when blktap_control_finish_destroy()
++ * reports that dev_inuse reached zero; otherwise the result of the last
++ * destroy step that was attempted.
++ */
++int
++blktap_control_destroy_device(struct blktap *tap)
++{
++      int err;
++      unsigned long inuse;
++
++      if (!tap)
++              return 0;
++
++      set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
++
++      for (;;) {
++              /* snapshot dev_inuse before each step so that "wait" below
++               * can detect any change made since the failing step */
++              inuse = tap->dev_inuse;
++              err   = blktap_device_destroy(tap);
++              if (err)
++                      goto wait;
++
++              inuse = tap->dev_inuse;
++              err   = blktap_ring_destroy(tap);
++              if (err)
++                      goto wait;
++
++              inuse = tap->dev_inuse;
++              err   = blktap_sysfs_destroy(tap);
++              if (err)
++                      goto wait;
++
++              break;
++
++      wait:
++              BTDBG("inuse: 0x%lx, dev_inuse: 0x%lx\n",
++                    inuse, tap->dev_inuse);
++              /* non-zero means the sleep was interrupted: give up */
++              if (wait_event_interruptible(tap->wq, tap->dev_inuse != inuse))
++                      break;
++      }
++
++      clear_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
++
++      if (blktap_control_finish_destroy(tap))
++              err = 0;
++
++      return err;
++}
++
++/*
++ * Last step of tearing a tap down: if BLKTAP_CONTROL is the only bit left
++ * in dev_inuse, clear it as well.  Returns non-zero when dev_inuse is zero
++ * afterwards, i.e. the tap is completely released.
++ */
++int
++blktap_control_finish_destroy(struct blktap *tap)
++{
++      const unsigned long control_only = 1UL << BLKTAP_CONTROL;
++
++      if (tap->dev_inuse == control_only)
++              clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
++
++      return !tap->dev_inuse;
++}
++
++/*
++ * Register the control misc device and remember that this succeeded so
++ * that blktap_control_free() knows whether to deregister it.
++ * Returns 0 or the error from misc_register().
++ */
++static int __init
++blktap_control_init(void)
++{
++      int err;
++
++      err = misc_register(&blktap_misc);
++      if (err) {
++              /* terminate the line so the next log message starts fresh */
++              BTERR("misc_register failed for control device\n");
++              return err;
++      }
++
++      blktap_control_registered = 1;
++      return 0;
++}
++
++/*
++ * Destroy every tap in blktaps[] (empty slots are skipped by
++ * blktap_control_destroy_device()) and deregister the control device if
++ * it had been registered.
++ */
++static void
++blktap_control_free(void)
++{
++      int i;
++
++      for (i = 0; i < CONFIG_XEN_NR_TAP2_DEVICES; i++)
++              blktap_control_destroy_device(blktaps[i]);
++
++      if (!blktap_control_registered)
++              return;
++
++      /* terminate the line so the next log message starts fresh */
++      if (misc_deregister(&blktap_misc) < 0)
++              BTERR("misc_deregister failed for control device\n");
++}
++
++/*
++ * Module teardown; also used by blktap_init() to unwind a partially
++ * completed initialisation, so each *_free() is called regardless of
++ * whether its *_init() counterpart ran.
++ */
++static void
++blktap_exit(void)
++{
++      blktap_control_free();
++      blktap_ring_free();
++      blktap_sysfs_free();
++      blktap_device_free();
++      blktap_request_pool_free();
++}
++
++/*
++ * Module initialisation: request pool, block device layer, ring layer,
++ * sysfs and finally the control device, in that order.  A failing request
++ * pool returns straight away; any later failure unwinds through
++ * blktap_exit().  Returns 0 or the first error encountered.
++ */
++static int __init
++blktap_init(void)
++{
++      int err;
++
++      err = blktap_request_pool_init();
++      if (err)
++              return err;
++
++      err = blktap_device_init(&device_major);
++      if (!err)
++              err = blktap_ring_init(&ring_major);
++      if (!err)
++              err = blktap_sysfs_init();
++      if (!err)
++              err = blktap_control_init();
++
++      if (err)
++              blktap_exit();
++
++      return err;
++}
++
++module_init(blktap_init);
++module_exit(blktap_exit);
++MODULE_LICENSE("Dual BSD/GPL");
++MODULE_ALIAS("devname:" BLKTAP2_DEV_DIR "control");
diff --cc drivers/xen/blktap2/device.c

index 0000000,0000000..926306c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2/device.c
@@@ -1,0 -1,0 +1,1196 @@@
++#include <linux/fs.h>
++#include <linux/blkdev.h>
++#include <linux/cdrom.h>
++#include <linux/hdreg.h>
++#include <linux/module.h>
++#include <linux/version.h>
++#include <asm/tlbflush.h>
++
++#include <scsi/scsi.h>
++#include <scsi/scsi_ioctl.h>
++
++#include <xen/xenbus.h>
++#include <xen/interface/io/blkif.h>
++
++#include "blktap.h"
++
++#include "../blkback/blkback-pagemap.h"
++
++/* ioctl tracing; compiled out unless the "#if 0" below is flipped. */
++#if 0
++#define DPRINTK_IOCTL(_f, _a...) pr_alert(_f, ## _a)
++#else
++#define DPRINTK_IOCTL(_f, _a...) ((void)0)
++#endif
++
++/*
++ * Batch of grant-map operations for one request.  Sized for two operations
++ * per segment because blktap_prep_foreign() may queue both a kernel-address
++ * and a user-pte mapping for the same segment.  cnt is the number of
++ * entries of grants[] in use.
++ */
++struct blktap_grant_table {
++      int cnt;
++      struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
++};
++
++/* Major number of the tap block devices (used in log messages below). */
++static int blktap_device_major;
++
++/* Map an embedded struct blktap_device back to the tap that contains it. */
++static inline struct blktap *
++dev_to_blktap(struct blktap_device *dev)
++{
++      struct blktap *owner = container_of(dev, struct blktap, device);
++
++      return owner;
++}
++
++/*
++ * Block device open().  Fails with -ENOENT when the disk carries no tap
++ * device, when the tap is not active, or when a shutdown has been
++ * requested; otherwise the open count is incremented and 0 returned.
++ */
++static int
++blktap_device_open(struct block_device *bd, fmode_t mode)
++{
++      struct blktap_device *tapdev = bd->bd_disk->private_data;
++      struct blktap *tap;
++
++      if (!tapdev)
++              return -ENOENT;
++
++      tap = dev_to_blktap(tapdev);
++
++      if (!blktap_active(tap))
++              return -ENOENT;
++
++      if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++              return -ENOENT;
++
++      tapdev->users++;
++      return 0;
++}
++
++/*
++ * Block device release(): drop one open count and, if a shutdown is
++ * pending, retry destroying the device.  Always returns 0.
++ */
++static int
++blktap_device_release(struct gendisk *disk, fmode_t mode)
++{
++      struct blktap_device *tapdev = disk->private_data;
++      struct blktap *tap = dev_to_blktap(tapdev);
++
++      tapdev->users--;
++
++      if (!test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++              return 0;
++
++      blktap_device_destroy(tap);
++      return 0;
++}
++
++/*
++ * Report a synthetic geometry for @bd: 255 heads, 63 sectors per track and
++ * as many cylinders as the capacity yields.  If the cylinder count as
++ * stored in @hg cannot cover the capacity it is set to 0xffff.
++ * Always returns 0.
++ */
++static int
++blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
++{
++      /* We don't have real geometry info, but let's at least return
++         values consistent with the size of the device */
++      const sector_t nsect = get_capacity(bd->bd_disk);
++      sector_t cyl = nsect;
++
++      hg->heads   = 0xff;
++      hg->sectors = 0x3f;
++
++      /* sector_div() divides cyl in place */
++      sector_div(cyl, hg->heads * hg->sectors);
++      hg->cylinders = cyl;
++
++      if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
++              hg->cylinders = 0xffff;
++
++      return 0;
++}
++
++static int
++blktap_device_ioctl(struct block_device *bd, fmode_t mode,
++                  unsigned command, unsigned long argument)
++{
++      int i;
++
++      DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx\n",
++                    command, (long)argument);
++
++      switch (command) {
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
++      case HDIO_GETGEO: {
++              struct hd_geometry geo;
++              int ret;
++
++                if (!argument)
++                        return -EINVAL;
++
++              geo.start = get_start_sect(bd);
++              ret = blktap_device_getgeo(bd, &geo);
++              if (ret)
++                      return ret;
++
++              if (copy_to_user((struct hd_geometry __user *)argument, &geo,
++                               sizeof(geo)))
++                        return -EFAULT;
++
++                return 0;
++      }
++#endif
++      case CDROMMULTISESSION:
++              BTDBG("FIXME: support multisession CDs later\n");
++              for (i = 0; i < sizeof(struct cdrom_multisession); i++)
++                      if (put_user(0, (char __user *)(argument + i)))
++                              return -EFAULT;
++              return 0;
++
++      case SCSI_IOCTL_GET_IDLUN:
++              if (!access_ok(VERIFY_WRITE, argument, 
++                      sizeof(struct scsi_idlun)))
++                      return -EFAULT;
++
++              /* return 0 for now. */
++              __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
++              __put_user(0, 
++                      &((struct scsi_idlun __user *)argument)->host_unique_id);
++              return 0;
++
++      default:
++              return -EINVAL; /* same return as native Linux */
++      }
++
++      return 0;
++}
++
++/* Block device methods; .getgeo only exists on kernels >= 2.6.16, older
++ * ones get the geometry through HDIO_GETGEO in blktap_device_ioctl(). */
++static const struct block_device_operations blktap_device_file_operations = {
++      .owner     = THIS_MODULE,
++      .open      = blktap_device_open,
++      .release   = blktap_device_release,
++      .ioctl     = blktap_device_ioctl,
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
++      .getgeo    = blktap_device_getgeo
++#endif
++};
++
++/*
++ * apply_to_page_range() callback: install the pte that @data points to at
++ * @ptep.  Always returns 0.
++ */
++static int
++blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
++                  unsigned long addr, void *data)
++{
++      pte_t *new_pte = data;
++
++      BTDBG("ptep %p -> %012llx\n", ptep,
++            (unsigned long long)pte_val(*new_pte));
++      set_pte(ptep, *new_pte);
++
++      return 0;
++}
++
++/*
++ * Install @pte for the single page at @address, in @vma's mm or, when
++ * @vma is NULL, with a NULL mm passed to apply_to_page_range().
++ * Returns the result of apply_to_page_range().
++ */
++static int
++blktap_map_uaddr(struct vm_area_struct *vma, unsigned long address, pte_t pte)
++{
++      struct mm_struct *mm = NULL;
++
++      if (vma)
++              mm = vma->vm_mm;
++
++      return apply_to_page_range(mm, address, PAGE_SIZE,
++                                 blktap_map_uaddr_fn, &pte);
++}
++
++/*
++ * apply_to_page_range() callback: clear the pte at @ptep.  @data is the
++ * vma (possibly NULL) handed through by blktap_umap_uaddr().
++ * Always returns 0.
++ */
++static int
++blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
++                   unsigned long addr, void *data)
++{
++      struct vm_area_struct *area = data;
++
++      BTDBG("ptep %p\n", ptep);
++      xen_ptep_get_and_clear_full(area, addr, ptep, 1);
++
++      return 0;
++}
++
++/*
++ * Remove the mapping of the single page at @address.
++ *
++ * With a NULL @vma on x86 the mapping is cleared directly through
++ * HYPERVISOR_update_va_mapping() (which also invalidates the TLB entry)
++ * and 1 is returned.  In every other case the pte is cleared via
++ * apply_to_page_range() -- on @vma's mm, or on a NULL mm when @vma is NULL
++ * on a non-x86 build -- and that call's result is returned.
++ * Callers use a return value of 0 as the cue to flush the TLB themselves.
++ */
++static int
++blktap_umap_uaddr(struct vm_area_struct *vma, unsigned long address)
++{
++      struct mm_struct *mm = NULL;
++
++      if (!vma) {
++#ifdef CONFIG_X86
++              if (HYPERVISOR_update_va_mapping(address, __pte(0),
++                                               UVMF_INVLPG|UVMF_ALL))
++                      BUG();
++              return 1;
++#endif
++      } else
++              mm = vma->vm_mm;
++      return apply_to_page_range(mm, address,
++                                 PAGE_SIZE, blktap_umap_uaddr_fn, vma);
++}
++
++/* Flush the TLB entry of the single kernel page at @kvaddr. */
++static inline void
++flush_tlb_kernel_page(unsigned long kvaddr)
++{
++#ifdef CONFIG_X86
++      xen_invlpg_all(kvaddr);
++#else
++      flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE);
++#endif
++}
++
++/*
++ * Undo the grant mappings of every segment of @request and clear the
++ * corresponding foreign_map entries of the ring.
++ *
++ * Does nothing if the ring has no vma.  Unmap operations for the kernel
++ * and user handles of all segments are collected and issued in a single
++ * GNTTABOP_unmap_grant_ref call.  The user range of the request is zapped
++ * before that on auto-translated guests and after it otherwise.
++ *
++ * tap->tap_sem held on entry
++ */
++static void
++blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
++{
++      uint64_t ptep;
++      int ret, usr_idx;
++      unsigned int i, cnt;
++      struct page **map, *page;
++      struct blktap_ring *ring;
++      struct grant_handle_pair *khandle;
++      unsigned long kvaddr, uvaddr, offset;
++      struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
++      grant_handle_t self_gref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++      int self_gref_nr = 0;
++
++      cnt     = 0;
++      ring    = &tap->ring;
++      usr_idx = request->usr_idx;
++      map     = ring->foreign_map.map;
++
++      if (!ring->vma)
++              return;
++
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              zap_page_range(ring->vma, 
++                             MMAP_VADDR(ring->user_vstart, usr_idx, 0),
++                             request->nr_pages << PAGE_SHIFT, NULL);
++
++      for (i = 0; i < request->nr_pages; i++) {
++              kvaddr = request_to_kaddr(request, i);
++              uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++
++              khandle = request->handles + i;
++
++              /* kernel-side mapping: queue the unmap and drop the p2m entry */
++              if (khandle->kernel != INVALID_GRANT_HANDLE) {
++                      gnttab_set_unmap_op(&unmap[cnt], kvaddr,
++                                          GNTMAP_host_map, khandle->kernel);
++                      cnt++;
++                      set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++                                          INVALID_P2M_ENTRY);
++              }
++
++              /* user-side mapping: unmapped through the address of its pte */
++              if (khandle->user != INVALID_GRANT_HANDLE) {
++                      BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++                      if (create_lookup_pte_addr(ring->vma->vm_mm,
++                                                 uvaddr, &ptep) != 0) {
++                              BTERR("Couldn't get a pte addr!\n");
++                              /* NOTE(review): returning here skips the
++                               * unmap operations queued so far */
++                              return;
++                      }
++
++                      gnttab_set_unmap_op(&unmap[cnt], ptep,
++                                          GNTMAP_host_map
++                                          | GNTMAP_application_map
++                                          | GNTMAP_contains_pte,
++                                          khandle->user);
++                      cnt++;
++              }
++
++              /* index of this page in the ring's foreign map */
++              offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
++
++              BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
++                    "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
++                    "0x%08lx, handle: %u\n", offset, map[offset], request,
++                    usr_idx, i, kvaddr, khandle->kernel, uvaddr,
++                    khandle->user);
++
++              page = map[offset];
++              if (page) {
++                      if (PageBlkback(page)) {
++                              ClearPageBlkback(page);
++                              set_page_private(page, 0);
++                      } else if (
++                              xen_feature(XENFEAT_auto_translated_physmap)) {
++                              /* remembered here, ended after the unmap */
++                              self_gref[self_gref_nr] = khandle->kernel;
++                              self_gref_nr++;
++                      }
++              }
++              map[offset] = NULL;
++
++              khandle->kernel = INVALID_GRANT_HANDLE;
++              khandle->user   = INVALID_GRANT_HANDLE;
++      }
++
++      if (cnt) {
++              ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++                                              unmap, cnt);
++              BUG_ON(ret);
++      }
++
++      if (!xen_feature(XENFEAT_auto_translated_physmap))
++              zap_page_range(ring->vma, 
++                             MMAP_VADDR(ring->user_vstart, usr_idx, 0), 
++                             request->nr_pages << PAGE_SHIFT, NULL);
++      else {
++              for (i = 0; i < self_gref_nr; i++) {
++                      gnttab_end_foreign_access_ref(self_gref[i]);
++              }
++      }
++}
++
++/*
++ * Release all mappings of @request while holding the mmap_sem of the
++ * ring's mm for writing.  On guests that are not auto-translated, segments
++ * without a kernel grant handle first get their kernel mapping removed
++ * (with a TLB flush if blktap_umap_uaddr() returned 0) and their p2m entry
++ * invalidated; the rest is left to blktap_device_fast_flush().
++ *
++ * tap->tap_sem held on entry
++ */
++static void
++blktap_unmap(struct blktap *tap, struct blktap_request *request)
++{
++      const int slot = request->usr_idx;
++      int seg;
++
++      down_write(&tap->ring.vma->vm_mm->mmap_sem);
++
++      for (seg = 0; seg < request->nr_pages; seg++) {
++              struct grant_handle_pair *pair = &request->handles[seg];
++              unsigned long kvaddr = request_to_kaddr(request, seg);
++
++              BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
++                    "uvaddr: 0x%08lx, uhandle: %u\n", request, seg,
++                    kvaddr, pair->kernel,
++                    MMAP_VADDR(tap->ring.user_vstart, slot, seg),
++                    pair->user);
++
++              if (xen_feature(XENFEAT_auto_translated_physmap))
++                      continue;
++              if (pair->kernel != INVALID_GRANT_HANDLE)
++                      continue;
++
++              if (blktap_umap_uaddr(NULL, kvaddr) == 0)
++                      flush_tlb_kernel_page(kvaddr);
++              set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++                                  INVALID_P2M_ENTRY);
++      }
++
++      blktap_device_fast_flush(tap, request);
++
++      up_write(&tap->ring.vma->vm_mm->mmap_sem);
++}
++
++/*
++ * called if the tapdisk process dies unexpectedly.
++ *
++ * Every request still marked BLKTAP_REQUEST_PENDING is unmapped, completed
++ * with -ENODEV and freed (under tap->tap_sem, held for writing).  After
++ * that the queue's queuedata is cleared and the queue restarted so that
++ * future requests fail.  Nothing happens unless BLKTAP_DEVICE is set.
++ */
++void
++blktap_device_fail_pending_requests(struct blktap *tap)
++{
++      struct blktap_device *tapdev = &tap->device;
++      int slot;
++
++      if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++              return;
++
++      down_write(&tap->tap_sem);
++
++      for (slot = 0; slot < MAX_PENDING_REQS; slot++) {
++              struct blktap_request *pending = tap->pending_requests[slot];
++              struct request *rq;
++              const char *what;
++
++              if (!pending)
++                      continue;
++              if (pending->status != BLKTAP_REQUEST_PENDING)
++                      continue;
++
++              if (pending->operation == BLKIF_OP_PACKET)
++                      what = "packet";
++              else if (pending->operation == BLKIF_OP_READ)
++                      what = "read";
++              else
++                      what = "write";
++
++              BTERR("%u:%u: failing pending %s of %d pages\n",
++                    blktap_device_major, tap->minor, what,
++                    pending->nr_pages);
++
++              blktap_unmap(tap, pending);
++              rq = (struct request *)(unsigned long)pending->id;
++              blk_end_request_all(rq, -ENODEV);
++              blktap_request_free(tap, pending);
++      }
++
++      up_write(&tap->tap_sem);
++
++      spin_lock_irq(&tapdev->lock);
++
++      /* fail any future requests */
++      tapdev->gd->queue->queuedata = NULL;
++      blk_start_queue(tapdev->gd->queue);
++
++      spin_unlock_irq(&tapdev->lock);
++}
++
++/*
++ * Complete @request according to the response @res: unmap its pages, end
++ * the originating block request with 0 (BLKIF_RSP_OKAY) or -EIO (anything
++ * else) and give the request back to the pool.  An operation other than
++ * read, write or packet is a BUG().
++ *
++ * tap->tap_sem held on entry
++ */
++void
++blktap_device_finish_request(struct blktap *tap,
++                           blkif_response_t *res,
++                           struct blktap_request *request)
++{
++      struct request *rq;
++      int error;
++
++      blktap_unmap(tap, request);
++
++      rq = (struct request *)(unsigned long)request->id;
++
++      BTDBG("req %p res status %d operation %d/%d id %lld\n", rq,
++            res->status, res->operation, request->operation,
++            (unsigned long long)res->id);
++
++      switch (request->operation) {
++      case BLKIF_OP_READ:
++      case BLKIF_OP_WRITE:
++      case BLKIF_OP_PACKET:
++              break;
++      default:
++              BUG();
++      }
++
++      error = (res->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
++      if (unlikely(error))
++              BTERR("Bad return from device data "
++                      "request: %x\n", res->status);
++      blk_end_request_all(rq, error);
++
++      blktap_request_free(tap, request);
++}
++
++/*
++ * Queue the grant-map operations for segment @seg of @request, whose data
++ * lives in the foreign (blkback-owned) @page.
++ *
++ * The grant reference recorded in the blkback pagemap for @page is copied
++ * into @blkif_req and a kernel-address mapping is appended to @table
++ * (read-only for BLKIF_OP_WRITE).  The tap's own page inherits @page's
++ * private value and is flagged PageBlkback so that tap devices can be
++ * chained.  Unless the guest is auto-translated, a second operation that
++ * maps the same grant at the user pte is appended as well.
++ *
++ * Returns 0 on success and -1 if the user pte cannot be looked up; in the
++ * latter case the kernel-address operation has already been queued.
++ */
++static int
++blktap_prep_foreign(struct blktap *tap,
++                  struct blktap_request *request,
++                  blkif_request_t *blkif_req,
++                  unsigned int seg, struct page *page,
++                  struct blktap_grant_table *table)
++{
++      uint64_t ptep;
++      uint32_t flags;
++      struct page *tap_page;
++      struct blktap_ring *ring;
++      struct blkback_pagemap map;
++      unsigned long uvaddr, kvaddr;
++
++      ring = &tap->ring;
++      map  = blkback_pagemap_read(page);
++      blkif_req->seg[seg].gref = map.gref;
++
++      uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
++      kvaddr = request_to_kaddr(request, seg);
++      flags  = GNTMAP_host_map |
++              (request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);
++
++      /* first operation: map the grant at the kernel address */
++      gnttab_set_map_op(&table->grants[table->cnt],
++                        kvaddr, flags, map.gref, map.domid);
++      table->cnt++;
++
++      /* enable chained tap devices */
++      tap_page = request_to_page(request, seg);
++      set_page_private(tap_page, page_private(page));
++      SetPageBlkback(tap_page);
++
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              return 0;
++
++      if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
++              BTERR("couldn't get a pte addr!\n");
++              return -1;
++      }
++
++      /* second operation: map the same grant at the user pte */
++      flags |= GNTMAP_application_map | GNTMAP_contains_pte;
++      gnttab_set_map_op(&table->grants[table->cnt],
++                        ptep, flags, map.gref, map.domid);
++      table->cnt++;
++
++      return 0;
++}
++
++/*
++ * Execute the grant-map operations queued in @table and record the
++ * resulting handles in @request.
++ *
++ * Returns 0 if @table is empty or every mapping succeeded, non-zero if a
++ * grant status was not GNTST_okay or vm_insert_page() failed.
++ */
++static int
++blktap_map_foreign(struct blktap *tap,
++                 struct blktap_request *request,
++                 blkif_request_t *blkif_req,
++                 struct blktap_grant_table *table)
++{
++      struct page *page;
++      int i, grant, err, usr_idx;
++      struct blktap_ring *ring;
++      unsigned long uvaddr, foreign_mfn;
++
++      if (!table->cnt)
++              return 0;
++
++      err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++                                      table->grants, table->cnt);
++      BUG_ON(err);
++
++      /* 'grant' walks table->grants[] in step with the segment loop. */
++      grant   = 0;
++      usr_idx = request->usr_idx;
++      ring    = &tap->ring;
++
++      for (i = 0; i < request->nr_pages; i++) {
++              /*
++               * gref is left at 0 for segments that were not prepared as
++               * foreign; those queued no grant ops.
++               */
++              if (!blkif_req->seg[i].gref)
++                      continue;
++
++              uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++
++              /* First op of the segment: the kernel mapping. */
++              if (unlikely(table->grants[grant].status != GNTST_okay)) {
++                      BTERR("invalid kernel buffer: could not remap it\n");
++                      /* This should never happen: blkback should handle eagain first */
++                      BUG_ON(table->grants[grant].status == GNTST_eagain);
++                      err |= 1;
++                      table->grants[grant].handle = INVALID_GRANT_HANDLE;
++              }
++
++              request->handles[i].kernel = table->grants[grant].handle;
++              foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
++              grant++;
++
++              /* Without auto-translation a second op holds the user mapping. */
++              if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                      if (unlikely(table->grants[grant].status != GNTST_okay)) {
++                              /* This should never happen: blkback should handle eagain first */
++                              WARN_ON(table->grants[grant].status == GNTST_eagain);
++                              BTERR("invalid user buffer: could not remap it\n");
++                              err |= 1;
++                              table->grants[grant].handle = INVALID_GRANT_HANDLE;
++                      }
++                      request->handles[i].user = table->grants[grant].handle;
++                      grant++;
++              }
++
++              /*
++               * err is sticky: after the first failure the handles of the
++               * remaining segments are still recorded, but the p2m / vma
++               * updates below are skipped.
++               */
++              if (err)
++                      continue;
++
++              page = request_to_page(request, i);
++
++              if (!xen_feature(XENFEAT_auto_translated_physmap))
++                      set_phys_to_machine(page_to_pfn(page),
++                                          FOREIGN_FRAME(foreign_mfn));
++              else if (vm_insert_page(ring->vma, uvaddr, page))
++                      err |= 1;
++
++              BTDBG("pending_req: %p, seg: %d, page: %p, "
++                    "kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, "
++                    "uhandle: %u\n", request, i, page,
++                    pfn_to_kaddr(page_to_pfn(page)),
++                    request->handles[i].kernel,
++                    uvaddr, request->handles[i].user);
++      }
++
++      return err;
++}
++
++/*
++ * Map @page as segment @seg of @request, both at the segment's user
++ * address in the ring vma and at the request's kernel address.
++ *
++ * Without an auto-translated physmap the PTEs are installed directly; with
++ * one, the page is granted to domain 0 and mapped through the grant table.
++ *
++ * Returns 0 on success, or the vm_insert_page() error in the
++ * auto-translated case.
++ */
++static int
++blktap_map(struct blktap *tap,
++         struct blktap_request *request,
++         unsigned int seg, struct page *page)
++{
++      pte_t pte;
++      int usr_idx;
++      struct blktap_ring *ring;
++      unsigned long uvaddr, kvaddr;
++      int err = 0;
++
++      ring    = &tap->ring;
++      usr_idx = request->usr_idx;
++      uvaddr  = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
++      kvaddr  = request_to_kaddr(request, seg);
++
++      if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++              /* Install the user PTE first, then the kernel alias. */
++              pte = mk_pte(page, ring->vma->vm_page_prot);
++              blktap_map_uaddr(ring->vma, uvaddr,
++                               pte_mkspecial(pte_mkwrite(pte)));
++              flush_tlb_page(ring->vma, uvaddr);
++              blktap_map_uaddr(NULL, kvaddr, mk_pte(page, PAGE_KERNEL));
++              flush_tlb_kernel_page(kvaddr);
++
++              /* Point the kernel address's pfn at the machine frame of @page. */
++              set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
++              request->handles[seg].kernel = INVALID_GRANT_HANDLE;
++      } else {
++              /* grant this page access to self domain and map it. */
++              domid_t domid = 0; /* XXX my domain id: grant table hypercall
++                                    doesn't understand DOMID_SELF */
++              int gref;
++              uint32_t flags;
++              struct gnttab_map_grant_ref map;
++              struct page *tap_page;
++
++              /*
++               * NOTE(review): the returned gref is used without a check
++               * for failure -- confirm it cannot be negative here.
++               */
++              gref = gnttab_grant_foreign_access(
++                      domid, page_to_pfn(page),
++                      (request->operation == BLKIF_OP_WRITE)?
++                      GTF_readonly: 0);
++
++              flags  = GNTMAP_host_map |
++                      (request->operation == BLKIF_OP_WRITE ?
++                       GNTMAP_readonly : 0);
++
++              gnttab_set_map_op(&map, kvaddr, flags, gref, domid);
++
++              /* enable chained tap devices */
++              tap_page = request_to_page(request, seg);
++              set_page_private(tap_page, page_private(page));
++              SetPageBlkback(tap_page);
++
++              gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &map);
++
++              /* We are not expecting the grant op to fail */
++              BUG_ON(map.status != GNTST_okay);
++
++              err = vm_insert_page(ring->vma, uvaddr, tap_page);
++              if (err) {
++                      /*
++                       * NOTE(review): gref is passed here, and stored as the
++                       * kernel handle below, while map.handle is never read.
++                       * gnttab_set_unmap_op() presumably expects the handle
++                       * returned by the map op -- confirm this is intended.
++                       */
++                      struct gnttab_unmap_grant_ref unmap;
++                      gnttab_set_unmap_op(&unmap, kvaddr,
++                                          GNTMAP_host_map, gref);
++                      VOID(HYPERVISOR_grant_table_op(
++                              GNTTABOP_unmap_grant_ref, &unmap, 1));
++              } else
++                      request->handles[seg].kernel = gref;
++      }
++      /* Neither path creates a user-side grant mapping. */
++      request->handles[seg].user = INVALID_GRANT_HANDLE;
++
++      BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
++            "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
++            uvaddr);
++
++      return err;
++}
++
++/*
++ * Translate block request @req into a blkif request, map its segments for
++ * the tap process and publish it on the user ring.
++ *
++ * @request is the pool descriptor that tracks @req while it is in flight.
++ * Foreign (blkback) pages are mapped through the grant table in one batch;
++ * all other pages are mapped individually by blktap_map().
++ *
++ * Returns 0 on success.  On failure returns -1 after undoing any mappings
++ * with blktap_device_fast_flush(); completing @req and freeing @request is
++ * left to the caller.
++ */
++static int
++blktap_device_process_request(struct blktap *tap,
++                            struct blktap_request *request,
++                            struct request *req)
++{
++      struct page *page;
++      int i, usr_idx, err;
++      struct blktap_ring *ring;
++      struct scatterlist *sg;
++      struct blktap_grant_table table;
++      unsigned int fsect, lsect, nr_sects;
++      unsigned long offset, uvaddr;
++      struct blkif_request blkif_req, *target;
++
++      err = -1;
++      memset(&table, 0, sizeof(table));
++      /*
++       * The whole structure is copied into the user ring below, so clear
++       * it first: unused segment slots and padding must not carry stale
++       * kernel stack contents.
++       */
++      memset(&blkif_req, 0, sizeof(blkif_req));
++
++      if (!blktap_active(tap))
++              goto out;
++
++      ring    = &tap->ring;
++      usr_idx = request->usr_idx;
++      blkif_req.id = usr_idx;
++      blkif_req.sector_number = (blkif_sector_t)blk_rq_pos(req);
++      blkif_req.handle = 0;
++      blkif_req.operation = rq_data_dir(req) ?
++              BLKIF_OP_WRITE : BLKIF_OP_READ;
++      if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
++              blkif_req.operation = BLKIF_OP_PACKET;
++
++      /* The struct request pointer is recovered from id on completion. */
++      request->id        = (unsigned long)req;
++      request->operation = blkif_req.operation;
++      request->status    = BLKTAP_REQUEST_PENDING;
++      do_gettimeofday(&request->time);
++
++      nr_sects = 0;
++      request->nr_pages = 0;
++      blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg);
++      BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
++      for_each_sg(tap->sg, sg, blkif_req.nr_segments, i) {
++              /* Sector range of this segment within its page. */
++              fsect = sg->offset >> 9;
++              lsect = fsect + (sg->length >> 9) - 1;
++              nr_sects += sg->length >> 9;
++
++              blkif_req.seg[i] =
++                      (struct blkif_request_segment) {
++                      .gref       = 0,
++                      .first_sect = fsect,
++                      .last_sect  = lsect };
++
++              if (PageBlkback(sg_page(sg))) {
++                      /* foreign page -- use xen */
++                      if (blktap_prep_foreign(tap,
++                                              request,
++                                              &blkif_req,
++                                              i,
++                                              sg_page(sg),
++                                              &table))
++                              goto out;
++              } else {
++                      /* do it the old fashioned way */
++                      if (blktap_map(tap,
++                                     request,
++                                     i,
++                                     sg_page(sg)))
++                              goto out;
++              }
++
++              /* Record the backing page at the segment's vma page offset. */
++              uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++              offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
++              page   = request_to_page(request, i);
++              ring->foreign_map.map[offset] = page;
++              SetPageReserved(page);
++
++              BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
++                    uvaddr, page, page_to_pfn(page));
++              BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
++                    "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n",
++                    offset, request, i,
++                    page, pfn_to_kaddr(page_to_pfn(page)), uvaddr);
++
++              request->nr_pages++;
++      }
++
++      /* Issue the grant ops queued by blktap_prep_foreign() in one batch. */
++      if (blktap_map_foreign(tap, request, &blkif_req, &table))
++              goto out;
++
++      /* Finally, write the request message to the user ring. */
++      target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
++      memcpy(target, &blkif_req, sizeof(blkif_req));
++      target->id = request->usr_idx;
++      wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
++      ring->ring.req_prod_pvt++;
++
++      if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
++              tap->stats.st_pk_req++;
++      else if (rq_data_dir(req)) {
++              tap->stats.st_wr_sect += nr_sects;
++              tap->stats.st_wr_req++;
++      } else {
++              tap->stats.st_rd_sect += nr_sects;
++              tap->stats.st_rd_req++;
++      }
++
++      err = 0;
++
++out:
++      if (err)
++              blktap_device_fast_flush(tap, request);
++      return err;
++}
++
++#ifdef ENABLE_PASSTHROUGH
++/*
++ * Walk every bio chained on @_req.  The next pointer is saved in @_tmp
++ * before the loop body runs, so the body may re-link or hand off @_bio.
++ * A request without bios yields zero iterations.
++ *
++ * This is a single 'for' statement on purpose: a leading unbraced 'if'
++ * would capture the 'else' of a surrounding if/else at the call site.
++ */
++#define rq_for_each_bio_safe(_bio, _tmp, _req)                                \
++      for (_bio = (_req)->bio;                                        \
++           _bio && ((_tmp = _bio->bi_next) || 1);                     \
++           _bio = _tmp)
++
++/*
++ * Passthrough path: retarget every bio of @req at the opened block device
++ * and resubmit it.
++ */
++static void
++blktap_device_forward_request(struct blktap *tap, struct request *req)
++{
++      struct blktap_device *dev = &tap->device;
++      struct bio *cur, *next;
++
++      rq_for_each_bio_safe(cur, next, req) {
++              cur->bi_bdev = dev->bdev;
++              submit_bio(cur->bi_rw, cur);
++      }
++}
++
++/*
++ * Release the passthrough block device, if one is open, and clear the
++ * BLKTAP_PASSTHROUGH flag.
++ */
++static void
++blktap_device_close_bdev(struct blktap *tap)
++{
++      struct blktap_device *dev = &tap->device;
++
++      if (dev->bdev) {
++              blkdev_put(dev->bdev, FMODE_WRITE|FMODE_EXCL);
++              dev->bdev = NULL;
++      }
++
++      clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++}
++
++/*
++ * Open block device @pdev exclusively for writing and switch @tap to
++ * passthrough mode on it.
++ *
++ * Returns 0 on success, the blkdev_get_by_dev() error, or -ENOENT when the
++ * opened device has no disk attached.
++ */
++static int
++blktap_device_open_bdev(struct blktap *tap, u32 pdev)
++{
++      struct blktap_device *dev = &tap->device;
++      struct block_device *target;
++
++      target = blkdev_get_by_dev(pdev, FMODE_WRITE|FMODE_EXCL, tap);
++      if (IS_ERR(target)) {
++              BTERR("opening device %x:%x failed: %ld\n",
++                    MAJOR(pdev), MINOR(pdev), PTR_ERR(target));
++              return PTR_ERR(target);
++      }
++
++      if (target->bd_disk == NULL) {
++              BTERR("device %x:%x doesn't exist\n",
++                    MAJOR(pdev), MINOR(pdev));
++              blkdev_put(target, FMODE_WRITE|FMODE_EXCL);
++              return -ENOENT;
++      }
++
++      dev->bdev = target;
++      set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
++
++      /* TODO: readjust queue parameters */
++
++      BTINFO("set device %d to passthrough on %x:%x\n",
++             tap->minor, MAJOR(pdev), MINOR(pdev));
++
++      return 0;
++}
++
++/*
++ * Toggle passthrough mode; only allowed while the tap is paused.
++ *
++ * With no device open, opens @major:@minor.  With a device already open,
++ * a zero device number closes it and any other number is rejected.
++ *
++ * Returns 0 on success, -EINVAL on a bad state or argument, or the error
++ * from blktap_device_open_bdev().
++ */
++int
++blktap_device_enable_passthrough(struct blktap *tap,
++                               unsigned major, unsigned minor)
++{
++      struct blktap_device *dev = &tap->device;
++      u32 pdev = MKDEV(major, minor);
++
++      if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++              return -EINVAL;
++
++      if (!dev->bdev)
++              return blktap_device_open_bdev(tap, pdev);
++
++      if (pdev)
++              return -EINVAL;
++
++      blktap_device_close_bdev(tap);
++      return 0;
++}
++#endif
++
++/*
++ * Drain the block queue of @tap into the tap ring.
++ *
++ * Non-FS requests are failed with -EIO and FLUSH/FUA requests with
++ * -EOPNOTSUPP.  When the ring is full or no request descriptor can be
++ * allocated, the queue is stopped, the tap is deferred and the loop ends.
++ * If at least one request was queued, user space is kicked afterwards.
++ *
++ * dev->lock held on entry; it is dropped and retaken around each call to
++ * blktap_device_process_request(), which runs with tap->tap_sem held for
++ * writing.
++ */
++static void
++blktap_device_run_queue(struct blktap *tap)
++{
++      int queued, err;
++      struct request_queue *rq;
++      struct request *req;
++      struct blktap_ring *ring;
++      struct blktap_device *dev;
++      struct blktap_request *request;
++
++      queued = 0;
++      ring   = &tap->ring;
++      dev    = &tap->device;
++      rq     = dev->gd->queue;
++
++      BTDBG("running queue for %d\n", tap->minor);
++
++      while ((req = blk_peek_request(rq)) != NULL) {
++              /* Only filesystem requests are passed on to the tap. */
++              if (req->cmd_type != REQ_TYPE_FS) {
++                      blk_start_request(req);
++                      __blk_end_request_all(req, -EIO);
++                      continue;
++              }
++
++              if (req->cmd_flags & (REQ_FLUSH|REQ_FUA)) {
++                      blk_start_request(req);
++                      __blk_end_request_all(req, -EOPNOTSUPP);
++                      continue;
++              }
++
++#ifdef ENABLE_PASSTHROUGH
++              if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
++                      blk_start_request(req);
++                      blktap_device_forward_request(tap, req);
++                      continue;
++              }
++#endif
++
++              /*
++               * The request has only been peeked at this point, so it
++               * stays on the queue when we back off.
++               */
++              if (RING_FULL(&ring->ring)) {
++              wait:
++                      /* Avoid pointless unplugs. */
++                      blk_stop_queue(rq);
++                      blktap_defer(tap);
++                      break;
++              }
++
++              request = blktap_request_allocate(tap);
++              if (!request) {
++                      /* Count the allocation failure, then back off as above. */
++                      tap->stats.st_oo_req++;
++                      goto wait;
++              }
++
++              BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) "
++                    "buffer:%p [%s], pending: %p\n", req, tap->minor,
++                    req->cmd, (unsigned long long)blk_rq_pos(req),
++                    blk_rq_cur_sectors(req), blk_rq_sectors(req), req->buffer,
++                    rq_data_dir(req) ? "write" : "read", request);
++
++              blk_start_request(req);
++
++              /* Swap the queue spinlock for tap_sem while mapping. */
++              spin_unlock_irq(&dev->lock);
++              down_write(&tap->tap_sem);
++
++              err = blktap_device_process_request(tap, request, req);
++              if (!err)
++                      queued++;
++              else {
++                      /* Not queued: complete it with the error and recycle. */
++                      blk_end_request_all(req, err);
++                      blktap_request_free(tap, request);
++              }
++
++              up_write(&tap->tap_sem);
++              spin_lock_irq(&dev->lock);
++      }
++
++      if (queued)
++              blktap_ring_kick_user(tap);
++}
++
++/*
++ * Request function of the tap block queue.
++ *
++ * Runs the queue when an active tap is attached, defers while the tap is
++ * paused or a pause is pending, and otherwise fails every queued request
++ * with -EIO.
++ *
++ * dev->lock held on entry
++ */
++static void
++blktap_device_do_request(struct request_queue *rq)
++{
++      struct blktap_device *dev = rq->queuedata;
++      struct blktap *tap;
++      struct request *failed;
++
++      if (dev) {
++              tap = dev_to_blktap(dev);
++              if (blktap_active(tap)) {
++                      if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
++                          test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
++                              blktap_defer(tap);
++                      else
++                              blktap_device_run_queue(tap);
++                      return;
++              }
++      }
++
++      /* No usable tap behind this queue: fail whatever is pending. */
++      while ((failed = blk_fetch_request(rq)) != NULL) {
++              BTERR("device closed: failing secs %llu - %llu\n",
++                    (unsigned long long)blk_rq_pos(failed),
++                    (unsigned long long)blk_rq_pos(failed)
++                    + blk_rq_cur_sectors(failed));
++              __blk_end_request_all(failed, -EIO);
++      }
++}
++
++/*
++ * Restart request processing on @tap.
++ *
++ * The restart is deferred while the ring of an active tap is full or while
++ * the tap is paused or about to pause.  Otherwise a stopped queue is
++ * started again and the request function is run right away.
++ */
++void
++blktap_device_restart(struct blktap *tap)
++{
++      struct blktap_device *dev = &tap->device;
++      struct request_queue *queue;
++
++      if ((blktap_active(tap) && RING_FULL(&tap->ring.ring)) ||
++          test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
++          test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++              blktap_defer(tap);
++              return;
++      }
++
++      spin_lock_irq(&dev->lock);
++
++      if (dev->gd) {
++              queue = dev->gd->queue;
++
++              /* Re-enable calldowns. */
++              if (blk_queue_stopped(queue))
++                      blk_start_queue(queue);
++
++              /* Kick things off immediately. */
++              blktap_device_do_request(queue);
++      }
++
++      spin_unlock_irq(&dev->lock);
++}
++
++/*
++ * Apply tap->params and the fixed queue limits to the disk and its request
++ * queue.  Does nothing unless the device exists and has a gendisk.
++ */
++static void
++blktap_device_configure(struct blktap *tap)
++{
++      struct blktap_device *dev = &tap->device;
++      struct request_queue *queue;
++
++      if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++              return;
++      if (!dev->gd)
++              return;
++
++      queue = dev->gd->queue;
++
++      spin_lock_irq(&dev->lock);
++
++      set_capacity(dev->gd, tap->params.capacity);
++
++      /* Hard sector size and max sectors impersonate the equiv. hardware. */
++      blk_queue_logical_block_size(queue, tap->params.sector_size);
++      blk_queue_max_hw_sectors(queue, 512);
++
++      /* Each segment in a request is up to an aligned page in size. */
++      blk_queue_segment_boundary(queue, PAGE_SIZE - 1);
++      blk_queue_max_segment_size(queue, PAGE_SIZE);
++
++      /* Ensure a merged request will fit in a single I/O ring slot. */
++      blk_queue_max_segments(queue, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++      /* Make sure buffer addresses are sector-aligned. */
++      blk_queue_dma_alignment(queue, 511);
++
++      spin_unlock_irq(&dev->lock);
++}
++
++/*
++ * Resume a paused tap device.
++ *
++ * Returns -ENODEV if the device does not exist or the tap is not active,
++ * 0 if it was not paused or was resumed, or the blktap_ring_resume() error.
++ */
++int
++blktap_device_resume(struct blktap *tap)
++{
++      int err;
++
++      if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++              return -ENODEV;
++      if (!blktap_active(tap))
++              return -ENODEV;
++
++      if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++              return 0;
++
++      err = blktap_ring_resume(tap);
++      if (!err) {
++              /* device size may have changed */
++              blktap_device_configure(tap);
++
++              BTDBG("restarting device\n");
++              blktap_device_restart(tap);
++      }
++
++      return err;
++}
++
++/*
++ * Request a pause of the tap device: stop its queue, flag the pause as
++ * requested and hand over to blktap_ring_pause().
++ *
++ * Returns -ENODEV if the device does not exist or the tap is not active,
++ * 0 if it is already paused, otherwise the blktap_ring_pause() result.
++ */
++int
++blktap_device_pause(struct blktap *tap)
++{
++      struct blktap_device *dev = &tap->device;
++      unsigned long irqflags;
++
++      if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++              return -ENODEV;
++      if (!blktap_active(tap))
++              return -ENODEV;
++
++      if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++              return 0;
++
++      spin_lock_irqsave(&dev->lock, irqflags);
++      blk_stop_queue(dev->gd->queue);
++      set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++      spin_unlock_irqrestore(&dev->lock, irqflags);
++
++      return blktap_ring_pause(tap);
++}
++
++/*
++ * Tear down the block device of @tap.
++ *
++ * Returns 0 if there is no device or it was destroyed, -EBUSY while the
++ * device still has users.
++ */
++int
++blktap_device_destroy(struct blktap *tap)
++{
++      struct blktap_device *dev = &tap->device;
++      struct gendisk *disk = dev->gd;
++
++      if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++              return 0;
++
++      BTINFO("destroy device %d users %d\n", tap->minor, dev->users);
++
++      if (dev->users)
++              return -EBUSY;
++
++      /* No more blktap_device_do_request(). */
++      spin_lock_irq(&dev->lock);
++      blk_stop_queue(disk->queue);
++      clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++      dev->gd = NULL;
++      spin_unlock_irq(&dev->lock);
++
++#ifdef ENABLE_PASSTHROUGH
++      if (dev->bdev)
++              blktap_device_close_bdev(tap);
++#endif
++
++      /* Unregister the disk, then release its queue and the disk itself. */
++      del_gendisk(disk);
++      blk_cleanup_queue(disk->queue);
++      put_disk(disk);
++
++      wake_up(&tap->wq);
++
++      return 0;
++}
++
++/*
++ * gendisk devnode callback: build the device node path from the disk's
++ * first minor.  @mode is not modified.  Returns a kasprintf() allocation,
++ * or NULL if that fails.
++ */
++static char *blktap_devnode(struct gendisk *gd, mode_t *mode)
++{
++      char *path;
++
++      path = kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "tapdev%u",
++                       gd->first_minor);
++      return path;
++}
++
++/*
++ * Create and register the block device for @tap.
++ *
++ * Returns 0 on success, -EEXIST if the device already exists, -EINVAL if
++ * tap->params fail validation, or -ENODEV if the gendisk or its request
++ * queue cannot be allocated.
++ */
++int
++blktap_device_create(struct blktap *tap)
++{
++      int minor, err;
++      struct gendisk *gd;
++      struct request_queue *rq;
++      struct blktap_device *dev;
++
++      gd    = NULL;
++      rq    = NULL;
++      dev   = &tap->device;
++      minor = tap->minor;
++
++      if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++              return -EEXIST;
++
++      if (blktap_validate_params(tap, &tap->params))
++              return -EINVAL;
++
++      BTINFO("minor %d sectors %Lu sector-size %lu\n",
++             minor, tap->params.capacity, tap->params.sector_size);
++
++      err = -ENODEV;
++
++      gd = alloc_disk(1);
++      if (!gd)
++              goto error;
++
++      /* tapdeva .. tapdevz, then two-letter suffixes from tapdevaa on. */
++      if (minor < 26)
++              snprintf(gd->disk_name, sizeof(gd->disk_name),
++                       "tapdev%c", 'a' + minor);
++      else
++              snprintf(gd->disk_name, sizeof(gd->disk_name),
++                       "tapdev%c%c",
++                       'a' + ((minor / 26) - 1), 'a' + (minor % 26));
++
++      gd->major = blktap_device_major;
++      gd->first_minor = minor;
++      gd->devnode = blktap_devnode;
++      gd->fops = &blktap_device_file_operations;
++      gd->private_data = dev;
++
++      spin_lock_init(&dev->lock);
++      rq = blk_init_queue(blktap_device_do_request, &dev->lock);
++      if (!rq)
++              goto error;
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
++      elevator_init(rq, "noop");
++#else
++      elevator_init(rq, &elevator_noop);
++#endif
++
++      gd->queue     = rq;
++      rq->queuedata = dev;
++      dev->gd       = gd;
++
++      set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++      blktap_device_configure(tap);
++
++      add_disk(gd);
++
++      err = 0;
++      goto out;
++
++ error:
++      if (rq)
++              blk_cleanup_queue(rq);
++      /*
++       * The disk has not been through add_disk() on this path, so there
++       * is nothing for del_gendisk() to unregister; just drop the
++       * reference taken by alloc_disk().
++       */
++      if (gd)
++              put_disk(gd);
++
++ out:
++      BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
++      return err;
++}
++
++/*
++ * Register the "tapdev" block major.
++ *
++ * On success the dynamically allocated major is stored in *@maj and in
++ * blktap_device_major, and 0 is returned.  On failure the negative error
++ * code from register_blkdev() is returned unchanged.
++ */
++int __init
++blktap_device_init(int *maj)
++{
++      int major;
++
++      /* Dynamically allocate a major for this device */
++      major = register_blkdev(0, "tapdev");
++      if (major < 0) {
++              BTERR("Couldn't register blktap device\n");
++              /* Report the real cause instead of a fixed -ENOMEM. */
++              return major;
++      }
++
++      blktap_device_major = *maj = major;
++      BTINFO("blktap device major %d\n", major);
++
++      return 0;
++}
++
++/* Unregister the "tapdev" block major, if one was registered. */
++void
++blktap_device_free(void)
++{
++      if (!blktap_device_major)
++              return;
++
++      unregister_blkdev(blktap_device_major, "tapdev");
++}
diff --cc drivers/xen/blktap2/request.c

index 0000000,0000000..a27cf8a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2/request.c
@@@ -1,0 -1,0 +1,296 @@@
++#include <linux/spinlock.h>
++#include <xen/balloon.h>
++
++#include "blktap.h"
++
++/* Number of bucket slots in the pool. */
++#define MAX_BUCKETS                      8
++/* Number of request handles carried by each bucket. */
++#define BUCKET_SIZE                      MAX_PENDING_REQS
++
++/* pool.status once blktap_request_pool_free() has started; stops allocation. */
++#define BLKTAP_POOL_CLOSING              1
++
++struct blktap_request_bucket;
++
++/* Pool bookkeeping wrapped around one struct blktap_request. */
++struct blktap_request_handle {
++      /* Index of this handle in bucket->handles; selects its pages. */
++      int                              slot;
++      /* Set to 1 on allocation, back to 0 on free. */
++      uint8_t                          inuse;
++      /* Embedded request; container_of() on it recovers the handle. */
++      struct blktap_request            request;
++      /* Bucket this handle lives in. */
++      struct blktap_request_bucket    *bucket;
++};
++
++/* A block of BUCKET_SIZE requests together with their page vector. */
++struct blktap_request_bucket {
++      /* Number of handles of this bucket currently allocated. */
++      atomic_t                         reqs_in_use;
++      struct blktap_request_handle     handles[BUCKET_SIZE];
++      /* MMAP_PAGES pages from alloc_empty_pages_and_pagevec(). */
++      struct page                    **foreign_pages;
++};
++
++/* The request pool shared by all taps. */
++struct blktap_request_pool {
++      /* Taken around updates of free_list, buckets[] and status. */
++      spinlock_t                       lock;
++      /* 0, or BLKTAP_POOL_CLOSING. */
++      uint8_t                          status;
++      /* Requests available for allocation, linked by request.free_list. */
++      struct list_head                 free_list;
++      /* Number of requests currently allocated across all buckets. */
++      atomic_t                         reqs_in_use;
++      /* Woken when reqs_in_use drops to zero. */
++      wait_queue_head_t                wait_queue;
++      /* Allocated buckets; unused slots are NULL. */
++      struct blktap_request_bucket    *buckets[MAX_BUCKETS];
++};
++
++/* The single pool instance of this file. */
++static struct blktap_request_pool pool;
++
++/* Recover the pool handle that embeds @req. */
++static inline struct blktap_request_handle *
++blktap_request_to_handle(struct blktap_request *req)
++{
++      struct blktap_request_handle *handle;
++
++      handle = container_of(req, struct blktap_request_handle, request);
++      return handle;
++}
++
++/*
++ * Put @request into its pristine free state: no ring slot, no pages, an
++ * empty list link and every grant handle invalid.
++ */
++static void
++blktap_request_pool_init_request(struct blktap_request *request)
++{
++      int seg;
++
++      request->status   = BLKTAP_REQUEST_FREE;
++      request->usr_idx  = -1;
++      request->nr_pages = 0;
++      INIT_LIST_HEAD(&request->free_list);
++
++      for (seg = 0; seg < ARRAY_SIZE(request->handles); seg++) {
++              request->handles[seg].kernel = INVALID_GRANT_HANDLE;
++              request->handles[seg].user   = INVALID_GRANT_HANDLE;
++      }
++}
++
++/*
++ * Allocate one bucket, install it in the first empty pool slot and put all
++ * of its requests on the free list.
++ *
++ * Returns 0 on success, -ENOMEM if an allocation fails or every bucket
++ * slot is already taken.
++ */
++static int
++blktap_request_pool_allocate_bucket(void)
++{
++      struct blktap_request_bucket *bucket;
++      struct blktap_request_handle *handle;
++      unsigned long flags;
++      int slot, n;
++
++      bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL);
++      if (!bucket)
++              goto fail;
++
++      bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES);
++      if (!bucket->foreign_pages)
++              goto fail;
++
++      spin_lock_irqsave(&pool.lock, flags);
++
++      /* Look for the first unused bucket slot. */
++      for (slot = 0; slot < MAX_BUCKETS; slot++)
++              if (!pool.buckets[slot])
++                      break;
++
++      if (slot == MAX_BUCKETS) {
++              spin_unlock_irqrestore(&pool.lock, flags);
++              goto fail;
++      }
++
++      pool.buckets[slot] = bucket;
++
++      for (n = 0; n < BUCKET_SIZE; n++) {
++              handle = &bucket->handles[n];
++
++              handle->bucket = bucket;
++              handle->slot   = n;
++              handle->inuse  = 0;
++
++              blktap_request_pool_init_request(&handle->request);
++              list_add_tail(&handle->request.free_list, &pool.free_list);
++      }
++
++      spin_unlock_irqrestore(&pool.lock, flags);
++
++      return 0;
++
++fail:
++      /* kfree(NULL) is fine; the page vector needs an explicit check. */
++      if (bucket && bucket->foreign_pages)
++              free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
++      kfree(bucket);
++      return -ENOMEM;
++}
++
++/* Release @bucket and its page vector; a NULL @bucket is ignored. */
++static void
++blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket)
++{
++      if (bucket) {
++              BTDBG("freeing bucket %p\n", bucket);
++
++              free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
++              kfree(bucket);
++      }
++}
++
++/*
++ * Return the page backing segment @seg of @req.  Each handle owns
++ * BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive entries of its bucket's page
++ * vector, starting at slot * BLKIF_MAX_SEGMENTS_PER_REQUEST.
++ */
++struct page *
++request_to_page(struct blktap_request *req, int seg)
++{
++      struct blktap_request_handle *handle;
++      int first;
++
++      handle = blktap_request_to_handle(req);
++      first  = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST;
++
++      return handle->bucket->foreign_pages[first + seg];
++}
++
++/*
++ * Release one idle bucket, never the one in slot 0.
++ *
++ * Returns 0 if a bucket was freed, -EAGAIN if every other bucket is
++ * either absent or still has requests in use.
++ */
++int
++blktap_request_pool_shrink(void)
++{
++      int i, j;
++      unsigned long flags;
++      struct blktap_request_bucket *bucket, *victim;
++
++      victim = NULL;
++
++      spin_lock_irqsave(&pool.lock, flags);
++
++      /* always keep at least one bucket */
++      for (i = 1; i < MAX_BUCKETS; i++) {
++              bucket = pool.buckets[i];
++              if (!bucket)
++                      continue;
++
++              if (atomic_read(&bucket->reqs_in_use))
++                      continue;
++
++              /*
++               * With no request in use, every request of this bucket sits
++               * on pool.free_list.  Unlink them before the bucket goes
++               * away, otherwise the list keeps pointers into freed memory
++               * and a later allocation would hand one of them out.
++               */
++              for (j = 0; j < BUCKET_SIZE; j++)
++                      list_del(&bucket->handles[j].request.free_list);
++
++              pool.buckets[i] = NULL;
++              victim = bucket;
++              break;
++      }
++
++      spin_unlock_irqrestore(&pool.lock, flags);
++
++      if (!victim)
++              return -EAGAIN;
++
++      /* Nothing can reach the bucket any more; free it outside the lock. */
++      blktap_request_pool_free_bucket(victim);
++
++      return 0;
++}
++
++/*
++ * Grow the pool by one bucket.  Returns the result of
++ * blktap_request_pool_allocate_bucket().
++ */
++int
++blktap_request_pool_grow(void)
++{
++      int err;
++
++      err = blktap_request_pool_allocate_bucket();
++      return err;
++}
++
++/*
++ * Take a request from the pool and bind it to the first free slot of
++ * tap->pending_requests.
++ *
++ * Returns the request, or NULL if the pool is closing, @tap has no free
++ * slot, or the free list is empty.
++ */
++struct blktap_request *
++blktap_request_allocate(struct blktap *tap)
++{
++      struct blktap_request_handle *handle;
++      struct blktap_request *request = NULL;
++      unsigned long flags;
++      uint16_t usr_idx = -1;
++      int slot;
++
++      spin_lock_irqsave(&pool.lock, flags);
++
++      if (pool.status == BLKTAP_POOL_CLOSING)
++              goto out;
++
++      /* Find the first unused pending slot of this tap. */
++      for (slot = 0; slot < ARRAY_SIZE(tap->pending_requests); slot++) {
++              if (tap->pending_requests[slot])
++                      continue;
++              usr_idx = slot;
++              break;
++      }
++
++      if (usr_idx == (uint16_t)-1)
++              goto out;
++
++      if (list_empty(&pool.free_list))
++              goto out;
++
++      request = list_entry(pool.free_list.next,
++                           struct blktap_request, free_list);
++      list_del(&request->free_list);
++
++      /* Account for the request in the pool, its bucket and the tap. */
++      atomic_inc(&pool.reqs_in_use);
++
++      handle = blktap_request_to_handle(request);
++      atomic_inc(&handle->bucket->reqs_in_use);
++      handle->inuse = 1;
++
++      request->usr_idx = usr_idx;
++
++      tap->pending_requests[usr_idx] = request;
++      tap->pending_cnt++;
++
++out:
++      spin_unlock_irqrestore(&pool.lock, flags);
++      return request;
++}
++
++/*
++ * Return @request to the pool and release its slot in
++ * tap->pending_requests.
++ *
++ * Wakes tap->wq when @tap has no pending requests left, and
++ * pool.wait_queue when the pool as a whole has none in use.
++ */
++void
++blktap_request_free(struct blktap *tap, struct blktap_request *request)
++{
++      int free, idle;
++      unsigned long flags;
++      struct blktap_request_handle *handle;
++
++      BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests));
++      handle = blktap_request_to_handle(request);
++
++      spin_lock_irqsave(&pool.lock, flags);
++
++      handle->inuse = 0;
++      tap->pending_requests[request->usr_idx] = NULL;
++      blktap_request_pool_init_request(request);
++      list_add(&request->free_list, &pool.free_list);
++      atomic_dec(&handle->bucket->reqs_in_use);
++      free = atomic_dec_and_test(&pool.reqs_in_use);
++
++      /*
++       * pending_cnt is incremented under pool.lock when a request is
++       * allocated, so decrement it under the same lock; doing it after
++       * the unlock lets the two updates race.
++       */
++      idle = (--tap->pending_cnt == 0);
++
++      spin_unlock_irqrestore(&pool.lock, flags);
++
++      if (idle)
++              wake_up_interruptible(&tap->wq);
++
++      if (free)
++              wake_up(&pool.wait_queue);
++}
++
++/*
++ * Shut the pool down: mark it closing, wait until no request is in use,
++ * then free every bucket.
++ */
++void
++blktap_request_pool_free(void)
++{
++      unsigned long flags;
++      int slot;
++
++      spin_lock_irqsave(&pool.lock, flags);
++
++      pool.status = BLKTAP_POOL_CLOSING;
++
++      /* The lock is dropped while sleeping, then the count is rechecked. */
++      for (;;) {
++              if (!atomic_read(&pool.reqs_in_use))
++                      break;
++
++              spin_unlock_irqrestore(&pool.lock, flags);
++              wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use));
++              spin_lock_irqsave(&pool.lock, flags);
++      }
++
++      for (slot = 0; slot < MAX_BUCKETS; slot++) {
++              blktap_request_pool_free_bucket(pool.buckets[slot]);
++              pool.buckets[slot] = NULL;
++      }
++
++      spin_unlock_irqrestore(&pool.lock, flags);
++}
++
++/*
++ * Initialise the pool and allocate two buckets up front.
++ *
++ * Returns 0 on success; if a bucket cannot be allocated the pool is torn
++ * down again and that error is returned.
++ */
++int __init
++blktap_request_pool_init(void)
++{
++      int n, err;
++
++      memset(&pool, 0, sizeof(pool));
++
++      spin_lock_init(&pool.lock);
++      INIT_LIST_HEAD(&pool.free_list);
++      atomic_set(&pool.reqs_in_use, 0);
++      init_waitqueue_head(&pool.wait_queue);
++
++      for (n = 0; n < 2; n++) {
++              err = blktap_request_pool_allocate_bucket();
++              if (err) {
++                      blktap_request_pool_free();
++                      return err;
++              }
++      }
++
++      return 0;
++}
diff --cc drivers/xen/blktap2/ring.c

index 0000000,0000000..28de657

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2/ring.c
@@@ -1,0 -1,0 +1,610 @@@
++#include <linux/module.h>
++#include <linux/signal.h>
++
++#include "blktap.h"
++
++static int blktap_ring_major;
++
++/*
++ * Map a ring VMA back to its owning blktap: vm_private_data points at
++ * the foreign map embedded in the blktap_ring, which in turn is
++ * embedded in the blktap.
++ */
++static inline struct blktap *
++vma_to_blktap(struct vm_area_struct *vma)
++{
++      struct vm_foreign_map *fmap;
++      struct blktap_ring *owner;
++
++      fmap  = vma->vm_private_data;
++      owner = container_of(fmap, struct blktap_ring, foreign_map);
++
++      return container_of(owner, struct blktap, ring);
++}
++
++ /* 
++  * BLKTAP - immediately before the mmap area,
++  * we have a bunch of pages reserved for shared memory rings.
++  */
++#define RING_PAGES 1
++
++/*
++ * Consume every response currently published on the shared ring and
++ * complete the matching pending request.
++ *
++ * Runs under tap->tap_sem held for reading.  Returns 0 in all cases,
++ * including when no ring VMA is mapped.  Responses carrying an id that
++ * does not name a live pending request are logged and skipped.
++ */
++static int
++blktap_read_ring(struct blktap *tap)
++{
++      /* This is called to read responses from the ring. */
++      int usr_idx;
++      RING_IDX rc, rp;
++      blkif_response_t res;
++      struct blktap_ring *ring;
++      struct blktap_request *request;
++
++      down_read(&tap->tap_sem);
++
++      ring = &tap->ring;
++      if (!ring->vma) {
++              up_read(&tap->tap_sem);
++              return 0;
++      }
++
++      /* for each outstanding message on the ring  */
++      rp = ring->ring.sring->rsp_prod;
++      rmb();
++
++      for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
++              /* Work on a private copy; the ring page is shared with user space. */
++              memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
++              mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
++              ++ring->ring.rsp_cons;
++
++              /*
++               * res.id comes from the ring and is narrowed to int, so it
++               * may be negative.  Reject that explicitly rather than rely
++               * on the signedness of MAX_PENDING_REQS in the comparison.
++               */
++              usr_idx = (int)res.id;
++              if (usr_idx < 0 ||
++                  usr_idx >= MAX_PENDING_REQS ||
++                  !tap->pending_requests[usr_idx]) {
++                      BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n",
++                             rc, rp, usr_idx, tap->pid, ring->vma);
++                      continue;
++              }
++
++              request = tap->pending_requests[usr_idx];
++              BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
++              blktap_device_finish_request(tap, &res, request);
++      }
++
++      up_read(&tap->tap_sem);
++
++      blktap_run_deferred();
++
++      return 0;
++}
++
++/*
++ * Fault handler for the ring VMA.  Nothing is faulted in on demand:
++ * a page the driver has not mapped always yields VM_FAULT_SIGBUS.
++ */
++static int
++blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++      return VM_FAULT_SIGBUS;
++}
++
++/*
++ * zap_pte hook for the ring VMA: clear one user PTE and return its
++ * previous contents.
++ *
++ * Addresses below ring->user_vstart are simply cleared.  For addresses
++ * at or above it, the foreign-map slot is dropped and the kernel and
++ * user grant handles of the corresponding request segment are unmapped
++ * via a grant-table hypercall.
++ */
++static pte_t
++blktap_ring_clear_pte(struct vm_area_struct *vma,
++                    unsigned long uvaddr,
++                    pte_t *ptep, int is_fullmm)
++{
++      pte_t copy;
++      struct blktap *tap;
++      unsigned long kvaddr;
++      struct page **map, *page;
++      struct blktap_ring *ring;
++      struct blktap_request *request;
++      struct grant_handle_pair *khandle;
++      struct gnttab_unmap_grant_ref unmap[2];
++      int offset, seg, usr_idx, count = 0;
++
++      tap  = vma_to_blktap(vma);
++      ring = &tap->ring;
++      map  = ring->foreign_map.map;
++      BUG_ON(!map);   /* TODO Should this be changed to if statement? */
++
++      /*
++       * Zap entry if the address is before the start of the grant
++       * mapped region.
++       */
++      if (uvaddr < ring->user_vstart)
++              return xen_ptep_get_and_clear_full(vma, uvaddr,
++                                                 ptep, is_fullmm);
++
++      /* Page offset from user_vstart selects the request slot and segment. */
++      offset  = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
++      usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
++      seg     = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
++
++      /* Page offset from vm_start indexes the foreign map. */
++      offset  = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
++      page    = map[offset];
++      if (page && PageBlkback(page)) {
++              ClearPageBlkback(page);
++              set_page_private(page, 0);
++      }
++      map[offset] = NULL;
++
++      /*
++       * NOTE(review): request is dereferenced below without a NULL
++       * check and usr_idx is not range-checked here -- presumably every
++       * mapped page in this range belongs to a live pending request;
++       * confirm against the code that installs these mappings.
++       */
++      request = tap->pending_requests[usr_idx];
++      kvaddr  = request_to_kaddr(request, seg);
++      khandle = request->handles + seg;
++
++      /* Queue an unmap of the kernel-side grant mapping, if any. */
++      if (khandle->kernel != INVALID_GRANT_HANDLE) {
++              gnttab_set_unmap_op(&unmap[count], kvaddr, 
++                                  GNTMAP_host_map, khandle->kernel);
++              count++;
++
++              set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, 
++                                  INVALID_P2M_ENTRY);
++      }
++
++
++      /*
++       * With a user grant handle the PTE is handed to the unmap
++       * operation (GNTMAP_contains_pte); otherwise it is cleared here.
++       */
++      if (khandle->user != INVALID_GRANT_HANDLE) {
++              BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++
++              copy = *ptep;
++              gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep), 
++                                  GNTMAP_host_map 
++                                  | GNTMAP_application_map 
++                                  | GNTMAP_contains_pte,
++                                  khandle->user);
++              count++;
++      } else
++              copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
++                                                 is_fullmm);
++
++      /* Issue all queued unmaps in one hypercall; failure is fatal. */
++      if (count)
++              if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++                                            unmap, count))
++                      BUG();
++
++      khandle->kernel = INVALID_GRANT_HANDLE;
++      khandle->user   = INVALID_GRANT_HANDLE;
++
++      return copy;
++}
++
++/*
++ * unmap hook for the ring VMA: under tap_sem held for writing, clear
++ * the RING_VMA, PAUSED and PAUSE_REQUESTED bits of the owning tap.
++ */
++static void
++blktap_ring_vm_unmap(struct vm_area_struct *vma)
++{
++      struct blktap *tap = vma_to_blktap(vma);
++
++      down_write(&tap->tap_sem);
++      clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
++      clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++      clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++      up_write(&tap->tap_sem);
++}
++
++/*
++ * close hook for the ring VMA: fail outstanding I/O, zap the whole
++ * mapping, free the foreign map and the shared ring page, and wake
++ * waiters on tap->wq.
++ */
++static void
++blktap_ring_vm_close(struct vm_area_struct *vma)
++{
++      struct blktap *tap = vma_to_blktap(vma);
++      struct blktap_ring *ring = &tap->ring;
++
++      blktap_ring_vm_unmap(vma);                 /* fail future requests */
++      blktap_device_fail_pending_requests(tap);  /* fail pending requests */
++      blktap_device_restart(tap);                /* fail deferred requests */
++
++      down_write(&tap->tap_sem);
++
++      /* The foreign map is still in place while the PTEs are zapped. */
++      zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
++
++      kfree(ring->foreign_map.map);
++      ring->foreign_map.map = NULL;
++
++      /* Free the ring page. */
++      ClearPageReserved(virt_to_page(ring->ring.sring));
++      free_page((unsigned long)ring->ring.sring);
++
++      BTINFO("unmapping ring %d\n", tap->minor);
++      /* Readers holding tap_sem test these pointers before using the ring. */
++      ring->ring.sring = NULL;
++      ring->vma = NULL;
++
++      up_write(&tap->tap_sem);
++
++      wake_up(&tap->wq);
++}
++
++/*
++ * VMA callbacks installed by blktap_ring_mmap().
++ * NOTE(review): .unmap and .zap_pte are presumably hooks added by the
++ * Xen kernel patches rather than mainline vm_operations members --
++ * confirm against the tree's linux/mm.h.
++ */
++static struct vm_operations_struct blktap_ring_vm_operations = {
++      .close    = blktap_ring_vm_close,
++      .unmap    = blktap_ring_vm_unmap,
++      .fault    = blktap_ring_fault,
++      .zap_pte  = blktap_ring_clear_pte,
++};
++
++/*
++ * open() for the ring character device.
++ *
++ * The minor number selects the blktap instance.  Returns -ENODEV if the
++ * minor is out of range, has no instance, or the instance lacks the
++ * BLKTAP_CONTROL bit; -EBUSY if the ring fd is already held; otherwise
++ * 0 with filp->private_data pointing at the instance.
++ */
++static int
++blktap_ring_open(struct inode *inode, struct file *filp)
++{
++      struct blktap *tap;
++      int devidx = iminor(inode);
++
++      if (devidx < 0 ||
++          devidx >= CONFIG_XEN_NR_TAP2_DEVICES ||
++          !blktaps[devidx]) {
++              BTERR("unable to open device blktap%d\n", devidx);
++              return -ENODEV;
++      }
++
++      tap = blktaps[devidx];
++
++      BTINFO("opening device blktap%d\n", devidx);
++
++      if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
++              return -ENODEV;
++
++      /* The ring fd is exclusive: only one opener at a time. */
++      if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
++              return -EBUSY;
++
++      filp->private_data = tap;
++      BTINFO("opened device %d\n", tap->minor);
++
++      return 0;
++}
++
++/*
++ * release() for the ring character device: give up the exclusive ring
++ * fd (BLKTAP_RING_FD), detach the tap from the file and wake anyone
++ * sleeping on tap->wq.  Always returns 0.
++ */
++static int
++blktap_ring_release(struct inode *inode, struct file *filp)
++{
++      struct blktap *tap = filp->private_data;
++
++      BTINFO("freeing device %d\n", tap->minor);
++      clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
++      filp->private_data = NULL;
++      wake_up(&tap->wq);      
++      return 0;
++}
++
++/* Note on mmap:
++ * We need to map pages to user space in a way that will allow the block
++ * subsystem set up direct IO to them.  This couldn't be done before, because
++ * there isn't really a sane way to translate a user virtual address down to a 
++ * physical address when the page belongs to another domain.
++ *
++ * My first approach was to map the page in to kernel memory, add an entry
++ * for it in the physical frame list (using alloc_lomem_region as in blkback)
++ * and then attempt to map that page up to user space.  This is disallowed
++ * by xen though, which realizes that we don't really own the machine frame
++ * underlying the physical page.
++ *
++ * The new approach is to provide explicit support for this in xen linux.
++ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
++ * mapped from other vms.  vma->vm_private_data is set up as a mapping 
++ * from pages to actual page structs.  There is a new clause in get_user_pages
++ * that does the right thing for this sort of mapping.
++ */
++static int
++blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
++{
++      int size, err;
++      struct page **map;
++      struct blktap *tap;
++      blkif_sring_t *sring;
++      struct blktap_ring *ring;
++
++      tap   = filp->private_data;
++      ring  = &tap->ring;
++      map   = NULL;
++      sring = NULL;
++
++      if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
++              return -ENOMEM;
++
++      size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++      if (size != (MMAP_PAGES + RING_PAGES)) {
++              BTERR("you _must_ map exactly %lu pages!\n",
++                    MMAP_PAGES + RING_PAGES);
++              return -EAGAIN;
++      }
++
++      /* Allocate the fe ring. */
++      sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
++      if (!sring) {
++              BTERR("Couldn't alloc sring.\n");
++              goto fail_mem;
++      }
++
++      map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
++      if (!map) {
++              BTERR("Couldn't alloc VM_FOREIGN map.\n");
++              goto fail_mem;
++      }
++
++      SetPageReserved(virt_to_page(sring));
++    
++      SHARED_RING_INIT(sring);
++      FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
++
++      ring->ring_vstart = vma->vm_start;
++      ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);
++
++      /* Map the ring pages to the start of the region and reserve it. */
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              err = vm_insert_page(vma, vma->vm_start,
++                                   virt_to_page(ring->ring.sring));
++      else
++              err = remap_pfn_range(vma, vma->vm_start,
++                                    __pa(ring->ring.sring) >> PAGE_SHIFT,
++                                    PAGE_SIZE, vma->vm_page_prot);
++      if (err) {
++              BTERR("Mapping user ring failed: %d\n", err);
++              goto fail;
++      }
++
++      /* Mark this VM as containing foreign pages, and set up mappings. */
++      ring->foreign_map.map = map;
++      vma->vm_private_data = &ring->foreign_map;
++      vma->vm_flags |= VM_FOREIGN;
++      vma->vm_flags |= VM_DONTCOPY;
++      vma->vm_flags |= VM_RESERVED;
++      vma->vm_ops = &blktap_ring_vm_operations;
++
++#ifdef CONFIG_X86
++      vma->vm_mm->context.has_foreign_mappings = 1;
++#endif
++
++      tap->pid = current->pid;
++      BTINFO("blktap: mapping pid is %d\n", tap->pid);
++
++      ring->vma = vma;
++      return 0;
++
++ fail:
++      /* Clear any active mappings. */
++      zap_page_range(vma, vma->vm_start, 
++                     vma->vm_end - vma->vm_start, NULL);
++      ClearPageReserved(virt_to_page(sring));
++ fail_mem:
++      free_page((unsigned long)sring);
++      kfree(map);
++
++      return -ENOMEM;
++}
++
++/*
++ * Store msg in the user-visible message slot of the shared ring, if a
++ * shared ring currently exists.  tap_sem is held for reading around
++ * the check and the store.
++ */
++static inline void
++blktap_ring_set_message(struct blktap *tap, int msg)
++{
++      down_read(&tap->tap_sem);
++
++      if (tap->ring.ring.sring)
++              tap->ring.ring.sring->private.tapif_user.msg = msg;
++
++      up_read(&tap->tap_sem);
++}
++
++/*
++ * ioctl() for the ring character device.
++ *
++ * Handles the BLKTAP2_IOCTL_* commands below.  Unknown commands return
++ * -ENOIOCTLCMD.  Commands that take a parameter block copy a
++ * struct blktap_params from user space and validate it before storing
++ * it in tap->params.
++ */
++static long
++blktap_ring_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
++{
++      struct blktap_params params;
++      struct blktap *tap = filp->private_data;
++
++      BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
++
++      switch(cmd) {
++      case BLKTAP2_IOCTL_KICK_FE:
++              /* There are fe messages to process. */
++              return blktap_read_ring(tap);
++
++      case BLKTAP2_IOCTL_CREATE_DEVICE:
++              /* Install validated parameters, then create the device. */
++              if (!arg)
++                      return -EINVAL;
++
++              if (copy_from_user(&params, (struct blktap_params __user *)arg,
++                                 sizeof(params))) {
++                      BTERR("failed to get params\n");
++                      return -EFAULT;
++              }
++
++              if (blktap_validate_params(tap, &params)) {
++                      BTERR("invalid params\n");
++                      return -EINVAL;
++              }
++
++              tap->params = params;
++              return blktap_device_create(tap);
++
++      case BLKTAP2_IOCTL_SET_PARAMS:
++              /* Parameters may only be replaced while the tap is paused. */
++              if (!arg)
++                      return -EINVAL;
++
++              if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++                      return -EINVAL;
++
++              if (copy_from_user(&params, (struct blktap_params __user *)arg,
++                                 sizeof(params))) {
++                      BTERR("failed to get params\n");
++                      return -EFAULT;
++              }
++
++              if (blktap_validate_params(tap, &params)) {
++                      BTERR("invalid params\n");
++                      return -EINVAL;
++              }
++
++              tap->params = params;
++              return 0;
++
++      case BLKTAP2_IOCTL_PAUSE:
++              /* Only valid after a pause request: turn it into PAUSED. */
++              if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
++                      return -EINVAL;
++
++              set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++              clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
++
++              blktap_ring_set_message(tap, 0);
++              wake_up_interruptible(&tap->wq);
++
++              return 0;
++
++
++      case BLKTAP2_IOCTL_REOPEN:
++              /* Copy the current name, including its NUL, to user space. */
++              if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++                      return -EINVAL;
++
++              if (!arg)
++                      return -EINVAL;
++
++              if (copy_to_user((char __user *)arg,
++                               tap->params.name,
++                               strlen(tap->params.name) + 1))
++                      return -EFAULT;
++
++              blktap_ring_set_message(tap, 0);
++              wake_up_interruptible(&tap->wq);
++
++              return 0;
++
++      case BLKTAP2_IOCTL_RESUME:
++              /* arg is the resume result: zero leaves the paused state. */
++              if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++                      return -EINVAL;
++
++              tap->ring.response = (int)arg;
++              if (!tap->ring.response)
++                      clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
++
++              blktap_ring_set_message(tap, 0);
++              wake_up_interruptible(&tap->wq);
++
++              return 0;
++      }
++
++      return -ENOIOCTLCMD;
++}
++
++/*
++ * poll() for the ring character device.
++ *
++ * Reports POLLIN | POLLRDNORM when a message is posted in the shared
++ * ring or privately produced requests have not been published yet; in
++ * the latter case the requests are pushed to the shared ring first.
++ * Returns 0 when there is nothing to read or no ring is mapped.
++ */
++static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
++{
++      struct blktap *tap = filp->private_data;
++      struct blktap_ring *ring = &tap->ring;
++
++      poll_wait(filp, &ring->poll_wait, wait);
++
++      /*
++       * The shared ring only exists between a successful mmap() and the
++       * VMA's close; polling an unmapped fd must not dereference it.
++       */
++      if (!ring->ring.sring)
++              return 0;
++
++      if (ring->ring.sring->private.tapif_user.msg ||
++          ring->ring.req_prod_pvt != ring->ring.sring->req_prod) {
++              RING_PUSH_REQUESTS(&ring->ring);
++              return POLLIN | POLLRDNORM;
++      }
++
++      return 0;
++}
++
++/* File operations registered for the ring character device. */
++static const struct file_operations blktap_ring_file_operations = {
++      .owner    = THIS_MODULE,
++      .open     = blktap_ring_open,
++      .release  = blktap_ring_release,
++      .unlocked_ioctl = blktap_ring_ioctl,
++      .mmap     = blktap_ring_mmap,
++      .poll     = blktap_ring_poll,
++};
++
++/* Wake any task sleeping in poll() on this tap's ring device. */
++void
++blktap_ring_kick_user(struct blktap *tap)
++{
++      struct blktap_ring *ring = &tap->ring;
++
++      wake_up_interruptible(&ring->poll_wait);
++}
++
++/*
++ * Post a RESUME message on the ring and wait for the answer.
++ *
++ * Returns -ENODEV if the tap is not active, -EINVAL if it is not
++ * paused, the non-zero response reported through the ring if there is
++ * one, -EAGAIN if the tap is still paused after the wait, and 0 once
++ * it has left the paused state.
++ */
++int
++blktap_ring_resume(struct blktap *tap)
++{
++      struct blktap_ring *ring = &tap->ring;
++      int response;
++
++      if (!blktap_active(tap))
++              return -ENODEV;
++
++      if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++              return -EINVAL;
++
++      /* Start from a clean response before posting the message. */
++      ring->response = 0;
++
++      blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
++      blktap_ring_kick_user(tap);
++
++      wait_event_interruptible(tap->wq, ring->response ||
++                               !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
++
++      /* Consume the response so it cannot be seen twice. */
++      response = ring->response;
++      ring->response = 0;
++
++      BTDBG("err: %d\n", response);
++
++      if (response)
++              return response;
++
++      return test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ? -EAGAIN : 0;
++}
++
++/*
++ * Drain outstanding requests, post a PAUSE message on the ring and
++ * wait until the tap is marked paused.
++ *
++ * Returns -ENODEV if the tap is not active, -EINVAL if no pause was
++ * requested, -EAGAIN if either wait ends without its condition being
++ * met, and 0 once BLKTAP_PAUSED is set.
++ */
++int
++blktap_ring_pause(struct blktap *tap)
++{
++      if (!blktap_active(tap))
++              return -ENODEV;
++
++      if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
++              return -EINVAL;
++
++      BTDBG("draining queue\n");
++      wait_event_interruptible(tap->wq, !tap->pending_cnt);
++      /* The wait is interruptible, so re-check the condition itself. */
++      if (tap->pending_cnt)
++              return -EAGAIN;
++
++      blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
++      blktap_ring_kick_user(tap);
++
++      BTDBG("waiting for tapdisk response\n");
++      wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
++      if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
++              return -EAGAIN;
++
++      return 0;
++}
++
++/*
++ * Ask user space to let go of the ring.
++ *
++ * Returns 0 when neither the ring fd nor the ring VMA is in use.
++ * Otherwise a CLOSE message is posted, pollers are woken and -EAGAIN
++ * is returned.
++ */
++int
++blktap_ring_destroy(struct blktap *tap)
++{
++      if (test_bit(BLKTAP_RING_FD, &tap->dev_inuse) ||
++          test_bit(BLKTAP_RING_VMA, &tap->dev_inuse)) {
++              BTDBG("sending tapdisk close message\n");
++              blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
++              blktap_ring_kick_user(tap);
++
++              return -EAGAIN;
++      }
++
++      return 0;
++}
++
++/*
++ * Zero the ring state, initialise its poll wait queue and record the
++ * device number built from the ring major and the given minor.
++ */
++static void
++blktap_ring_initialize(struct blktap_ring *ring, int minor)
++{
++      memset(ring, 0, sizeof(*ring));
++      init_waitqueue_head(&ring->poll_wait);
++      ring->devno = MKDEV(blktap_ring_major, minor);
++}
++
++/*
++ * Initialise the tap's ring state and create its sysfs device.
++ * Returns the result of blktap_sysfs_create().
++ */
++int
++blktap_ring_create(struct blktap *tap)
++{
++      blktap_ring_initialize(&tap->ring, tap->minor);
++
++      return blktap_sysfs_create(tap);
++}
++
++/*
++ * Register the "blktap2" character device range with a dynamically
++ * assigned major.  On success the major is stored both in *major and
++ * in blktap_ring_major and 0 is returned; on failure the negative
++ * error from registration is returned.
++ */
++int __init
++blktap_ring_init(int *major)
++{
++      int major_nr;
++
++      major_nr = __register_chrdev(0, 0, CONFIG_XEN_NR_TAP2_DEVICES,
++                                   "blktap2", &blktap_ring_file_operations);
++      if (major_nr < 0) {
++              BTERR("error registering blktap ring device: %d\n", major_nr);
++              return major_nr;
++      }
++
++      *major = major_nr;
++      blktap_ring_major = major_nr;
++      BTINFO("blktap ring major: %d\n", blktap_ring_major);
++
++      return 0;
++}
++
++/*
++ * Unregister the "blktap2" character device range if a major was ever
++ * assigned.  Always returns 0.
++ */
++int
++blktap_ring_free(void)
++{
++      if (!blktap_ring_major)
++              return 0;
++
++      __unregister_chrdev(blktap_ring_major, 0,
++                          CONFIG_XEN_NR_TAP2_DEVICES, "blktap2");
++
++      return 0;
++}
diff --cc drivers/xen/blktap2/sysfs.c

index 0000000,0000000..5f47753

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2/sysfs.c
@@@ -1,0 -1,0 +1,475 @@@
++#include <linux/types.h>
++#include <linux/device.h>
++#include <linux/module.h>
++
++#include "blktap.h"
++
++/* Debug verbosity; exposed through the class "verbosity" attribute. */
++int blktap_debug_level = 1;
++
++/* Device class; stays NULL until blktap_sysfs_init() has succeeded. */
++static struct class *class;
++/* Woken whenever a tap's sysfs reference count drops to zero. */
++static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq);
++
++/* Take a reference on the tap's sysfs state. */
++static inline void
++blktap_sysfs_get(struct blktap *tap)
++{
++      atomic_inc(&tap->ring.sysfs_refcnt);
++}
++
++/* Drop a sysfs reference; the last put wakes waiters on sysfs_wq. */
++static inline void
++blktap_sysfs_put(struct blktap *tap)
++{
++      if (atomic_dec_and_test(&tap->ring.sysfs_refcnt))
++              wake_up(&sysfs_wq);
++}
++
++/* Begin a sysfs operation: take a reference, then the per-tap mutex. */
++static inline void
++blktap_sysfs_enter(struct blktap *tap)
++{
++      blktap_sysfs_get(tap);               /* pin sysfs device */
++      mutex_lock(&tap->ring.sysfs_mutex);  /* serialize sysfs operations */
++}
++
++/* End a sysfs operation: release the mutex, then drop the reference. */
++static inline void
++blktap_sysfs_exit(struct blktap *tap)
++{
++      mutex_unlock(&tap->ring.sysfs_mutex);
++      blktap_sysfs_put(tap);
++}
++
++/*
++ * The pause and resume store handlers each create and remove the
++ * other's attribute, so both attributes are declared up front.
++ * Both are write-only for the owner.
++ */
++static ssize_t blktap_sysfs_pause_device(struct device *,
++                                       struct device_attribute *,
++                                       const char *, size_t);
++static DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
++static ssize_t blktap_sysfs_resume_device(struct device *,
++                                        struct device_attribute *,
++                                        const char *, size_t);
++static DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
++
++/*
++ * Store handler for the "name" attribute.
++ *
++ * Returns size on success, -ENODEV if the sysfs device is gone or a
++ * shutdown was requested, -EPERM if the tap is not paused,
++ * -ENAMETOOLONG if more than BLKTAP2_MAX_MESSAGE_LEN bytes were
++ * written, and -EINVAL if no NUL terminator is found within
++ * BLKTAP2_MAX_MESSAGE_LEN bytes of the buffer.
++ */
++static ssize_t
++blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr,
++                    const char *buf, size_t size)
++{
++      struct blktap *tap = dev_get_drvdata(dev);
++      int err;
++
++      blktap_sysfs_enter(tap);
++
++      if (!tap->ring.dev ||
++          test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++              err = -ENODEV;
++      } else if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++              err = -EPERM;
++      } else if (size > BLKTAP2_MAX_MESSAGE_LEN) {
++              err = -ENAMETOOLONG;
++      } else if (strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >=
++                 BLKTAP2_MAX_MESSAGE_LEN) {
++              err = -EINVAL;
++      } else {
++              snprintf(tap->params.name, sizeof(tap->params.name) - 1,
++                       "%s", buf);
++              err = size;
++      }
++
++      blktap_sysfs_exit(tap);
++      return err;
++}
++
++/*
++ * Show handler for the "name" attribute: prints the configured name,
++ * or the minor number when no name is set.  Returns -ENODEV if the
++ * sysfs device is gone.
++ */
++static ssize_t
++blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr,
++                    char *buf)
++{
++      ssize_t size;
++      struct blktap *tap = dev_get_drvdata(dev);
++
++      blktap_sysfs_enter(tap);
++
++      if (!tap->ring.dev)
++              size = -ENODEV;
++      else if (tap->params.name[0])
++              size = sprintf(buf, "%s\n", tap->params.name);
++      else
++              size = sprintf(buf, "%d\n", tap->minor);
++
++      blktap_sysfs_exit(tap);
++
++      return size;
++}
++/* "name" is readable and writable by the owner only. */
++static DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
++                 blktap_sysfs_get_name, blktap_sysfs_set_name);
++
++/*
++ * Store handler for the "remove" attribute.
++ *
++ * A write to a device whose sysfs state is already gone is accepted
++ * silently.  Otherwise a shutdown is flagged (-EBUSY if one is already
++ * in flight) and the device is destroyed; a destroy error is returned
++ * in place of size.
++ */
++static ssize_t
++blktap_sysfs_remove_device(struct device *dev, struct device_attribute *attr,
++                         const char *buf, size_t size)
++{
++      struct blktap *tap = dev_get_drvdata(dev);
++      int err;
++
++      if (!tap->ring.dev)
++              return size;
++
++      if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++              return -EBUSY;
++
++      err = blktap_control_destroy_device(tap);
++      if (err)
++              return err;
++
++      return size;
++}
++static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
++
++/*
++ * Store handler for the "pause" attribute.
++ *
++ * Returns -ENODEV if the sysfs device is gone or a shutdown was
++ * requested, -EBUSY if a pause is already in progress, and size if the
++ * tap is already paused.  Otherwise the device is paused and, on
++ * success, the "pause" attribute is replaced by "resume".
++ */
++static ssize_t
++blktap_sysfs_pause_device(struct device *dev, struct device_attribute *attr,
++                        const char *buf, size_t size)
++{
++      int err;
++      struct blktap *tap = dev_get_drvdata(dev);
++
++      blktap_sysfs_enter(tap);
++
++      BTDBG("pausing %u:%u: dev_inuse: %lu\n",
++            MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse);
++
++      if (!tap->ring.dev ||
++          test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++              err = -ENODEV;
++              goto out;
++      }
++
++      if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
++              err = -EBUSY;
++              goto out;
++      }
++
++      /* Already paused: nothing to do, the write is reported as done. */
++      if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++              err = 0;
++              goto out;
++      }
++
++      err = blktap_device_pause(tap);
++      if (!err) {
++              /* Swap the attribute so only "resume" is offered now. */
++              device_remove_file(dev, &dev_attr_pause);
++              err = device_create_file(dev, &dev_attr_resume);
++      }
++
++out:
++      blktap_sysfs_exit(tap);
++
++      return (err ? err : size);
++}
++
++/*
++ * Store handler for the "resume" attribute.
++ *
++ * Returns -ENODEV if the sysfs device is gone or a shutdown was
++ * requested and -EINVAL if the tap is not paused.  Otherwise the
++ * device is resumed and, on success, the "resume" attribute is
++ * replaced by "pause".  Returns size when everything succeeded.
++ */
++static ssize_t
++blktap_sysfs_resume_device(struct device *dev, struct device_attribute *attr,
++                         const char *buf, size_t size)
++{
++      struct blktap *tap = dev_get_drvdata(dev);
++      int err;
++
++      blktap_sysfs_enter(tap);
++
++      if (!tap->ring.dev ||
++          test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++              err = -ENODEV;
++      } else if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++              err = -EINVAL;
++      } else {
++              err = blktap_device_resume(tap);
++              if (!err) {
++                      /* Swap the attribute so only "pause" is offered now. */
++                      device_remove_file(dev, &dev_attr_resume);
++                      err = device_create_file(dev, &dev_attr_pause);
++              }
++      }
++
++      blktap_sysfs_exit(tap);
++
++      BTDBG("returning %zd\n", (err ? err : size));
++      return (err ? err : size);
++}
++
++#ifdef ENABLE_PASSTHROUGH
++/*
++ * Store handler that switches a paused tap to passthrough mode.
++ *
++ * The buffer must hold "<major>:<minor>" in hexadecimal.  Returns
++ * -ENODEV if the sysfs device is gone or a shutdown was requested,
++ * -EINVAL if the tap is not paused, is already in passthrough mode, or
++ * the input does not parse; otherwise the result of enabling
++ * passthrough, with size reported on success.
++ */
++static ssize_t
++blktap_sysfs_enable_passthrough(struct device *dev,
++                              struct device_attribute *attr,
++                              const char *buf, size_t size)
++{
++      int err;
++      unsigned major, minor;
++      struct blktap *tap = dev_get_drvdata(dev);
++
++      BTINFO("passthrough request enabled\n");
++
++      blktap_sysfs_enter(tap);
++
++      if (!tap->ring.dev ||
++          test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
++              err = -ENODEV;
++              goto out;
++      }
++
++      if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
++              err = -EINVAL;
++              goto out;
++      }
++
++      if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
++              err = -EINVAL;
++              goto out;
++      }
++
++      /* sscanf reports the number of fields converted; both are required. */
++      err = sscanf(buf, "%x:%x", &major, &minor);
++      if (err != 2) {
++              err = -EINVAL;
++              goto out;
++      }
++
++      err = blktap_device_enable_passthrough(tap, major, minor);
++
++out:
++      blktap_sysfs_exit(tap);
++      /* The conditional has size_t width, so %d would be a mismatch. */
++      BTDBG("returning %zd\n", (err ? err : size));
++      return (err ? err : size);
++}
++#endif
++
++/*
++ * Show handler for the "debug" attribute: dumps the tap's identity,
++ * parameters and every pending request.
++ *
++ * Output is bounded to the PAGE_SIZE buffer that sysfs hands to show
++ * handlers; anything beyond that is truncated.  Returns the number of
++ * bytes placed in buf, not counting the terminating NUL.
++ */
++static ssize_t
++blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr,
++                        char *buf)
++{
++      char *tmp, *end;
++      int i, ret;
++      struct blktap *tap = dev_get_drvdata(dev);
++
++      tmp = buf;
++      end = buf + PAGE_SIZE;
++      blktap_sysfs_get(tap);
++
++      if (!tap->ring.dev) {
++              ret = sprintf(tmp, "no device\n");
++              goto out;
++      }
++
++      /*
++       * scnprintf returns what was actually stored, so tmp can never
++       * move past end even if the dump is truncated.
++       */
++      tmp += scnprintf(tmp, end - tmp,
++                       "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n",
++                       tap->params.name, MAJOR(tap->ring.devno),
++                       MINOR(tap->ring.devno), atomic_read(&tap->refcnt),
++                       tap->dev_inuse);
++      tmp += scnprintf(tmp, end - tmp,
++                       "capacity: 0x%llx, sector size: 0x%lx, "
++                       "device users: %d\n", tap->params.capacity,
++                       tap->params.sector_size, tap->device.users);
++
++      /* Keep the pending-request table stable while it is walked. */
++      down_read(&tap->tap_sem);
++
++      tmp += scnprintf(tmp, end - tmp,
++                       "pending requests: %d\n", tap->pending_cnt);
++      for (i = 0; i < MAX_PENDING_REQS; i++) {
++              struct blktap_request *req = tap->pending_requests[i];
++              if (!req)
++                      continue;
++
++              tmp += scnprintf(tmp, end - tmp,
++                               "req %d: id: %llu, usr_idx: %d, "
++                               "status: 0x%02x, pendcnt: %d, "
++                               "nr_pages: %u, op: %d, time: %lu:%lu\n",
++                               i, (unsigned long long)req->id, req->usr_idx,
++                               req->status, atomic_read(&req->pendcnt),
++                               req->nr_pages, req->operation,
++                               req->time.tv_sec, req->time.tv_usec);
++      }
++
++      up_read(&tap->tap_sem);
++      /* Bytes written only; the NUL terminator is not part of the data. */
++      ret = tmp - buf;
++
++out:
++      blktap_sysfs_put(tap);
++      BTDBG("%s\n", buf);
++
++      return ret;
++}
++static DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
++
++/*
++ * Create the sysfs device for a tap together with its "name",
++ * "remove", "pause" and "debug" attributes.
++ *
++ * Returns -ENODEV if the class is not set up, the error from
++ * device_create() if that fails, or the error of the first attribute
++ * that could not be created (attributes created before it are removed
++ * again).  Returns 0 on success.
++ */
++int
++blktap_sysfs_create(struct blktap *tap)
++{
++      struct blktap_ring *ring;
++      struct device *dev;
++      int err, state = 0;
++
++      if (!class)
++              return -ENODEV;
++
++      ring = &tap->ring;
++
++      dev = device_create(class, NULL, ring->devno, tap,
++                          "blktap%d", tap->minor);
++      if (IS_ERR(dev))
++              return PTR_ERR(dev);
++
++      ring->dev = dev;
++
++      mutex_init(&ring->sysfs_mutex);
++      atomic_set(&ring->sysfs_refcnt, 0);
++      set_bit(BLKTAP_SYSFS, &tap->dev_inuse);
++
++      /* state counts how many attributes were created successfully. */
++      err = device_create_file(dev, &dev_attr_name);
++      if (!err) {
++              ++state;
++              err = device_create_file(dev, &dev_attr_remove);
++      }
++      if (!err) {
++              ++state;
++              err = device_create_file(dev, &dev_attr_pause);
++      }
++      if (!err) {
++              ++state;
++              err = device_create_file(dev, &dev_attr_debug);
++      }
++
++      /*
++       * On success the selector is 0 and no case runs.  On failure it
++       * equals state, and the cases fall through on purpose to remove
++       * the created attributes in reverse order.
++       * NOTE(review): the device itself, ring->dev and the BLKTAP_SYSFS
++       * bit are not undone here -- presumably left to the caller's
++       * teardown path; confirm.
++       */
++      switch (state * !!err) {
++      case 3: device_remove_file(dev, &dev_attr_pause);
++      case 2: device_remove_file(dev, &dev_attr_remove);
++      case 1: device_remove_file(dev, &dev_attr_name);
++      }
++
++      return err;
++}
++
++/*
++ * Callback scheduled by blktap_sysfs_destroy(): remove all attributes,
++ * unregister the device, clear BLKTAP_SYSFS and let the control layer
++ * finish destroying the tap.
++ */
++static void
++_blktap_sysfs_destroy(struct device *dev)
++{
++      struct blktap *tap = dev_get_drvdata(dev);
++
++      /* Both pause and resume are removed; only one exists at a time. */
++      device_remove_file(dev, &dev_attr_name);
++      device_remove_file(dev, &dev_attr_remove);
++      device_remove_file(dev, &dev_attr_pause);
++      device_remove_file(dev, &dev_attr_resume);
++      device_remove_file(dev, &dev_attr_debug);
++
++      device_unregister(dev);
++
++      clear_bit(BLKTAP_SYSFS, &tap->dev_inuse);
++
++      blktap_control_finish_destroy(tap);
++}
++
++/*
++ * Start tearing down a tap's sysfs device.
++ *
++ * Returns 0 if there is no class or no device.  Otherwise ring->dev is
++ * cleared, the function waits for all sysfs references to be dropped
++ * (-EAGAIN if that wait is interrupted) and the actual removal is
++ * handed to device_schedule_callback(), whose result is returned.
++ */
++int
++blktap_sysfs_destroy(struct blktap *tap)
++{
++      struct blktap_ring *ring;
++      struct device *dev;
++
++      ring = &tap->ring;
++      dev  = ring->dev;
++      if (!class || !dev)
++              return 0;
++
++      /* Attribute handlers treat a NULL ring->dev as "device gone". */
++      ring->dev = NULL;
++      if (wait_event_interruptible(sysfs_wq,
++                                   !atomic_read(&tap->ring.sysfs_refcnt)))
++              return -EAGAIN;
++
++      return device_schedule_callback(dev, _blktap_sysfs_destroy);
++}
++
++/* Show handler for the class "verbosity" attribute: prints blktap_debug_level. */
++static ssize_t
++blktap_sysfs_show_verbosity(struct class *class, struct class_attribute *attr,
++                          char *buf)
++{
++      return sprintf(buf, "%d\n", blktap_debug_level);
++}
++
++/*
++ * Store handler for the class "verbosity" attribute: parses a decimal
++ * integer into blktap_debug_level.  Returns size on success and
++ * -EINVAL if no integer could be parsed.
++ */
++static ssize_t
++blktap_sysfs_set_verbosity(struct class *class, struct class_attribute *attr,
++                         const char *buf, size_t size)
++{
++      int parsed;
++
++      if (sscanf(buf, "%d", &parsed) != 1)
++              return -EINVAL;
++
++      blktap_debug_level = parsed;
++      return size;
++}
++static CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR,
++                blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
++
++/*
++ * Show handler for the class "devices" attribute: prints one
++ * "<minor> <name>" line for every tap that has BLKTAP_DEVICE set.
++ * Returns the number of bytes written to buf.
++ */
++static ssize_t
++blktap_sysfs_show_devices(struct class *class, struct class_attribute *attr,
++                        char *buf)
++{
++      int i, ret;
++      struct blktap *tap;
++
++      ret = 0;
++      for (i = 0; i < CONFIG_XEN_NR_TAP2_DEVICES; i++) {
++              tap = blktaps[i];
++              if (!tap)
++                      continue;
++
++              if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++                      continue;
++
++              ret += sprintf(buf + ret, "%d ", tap->minor);
++              /*
++               * The name can be set from user space, so it must be
++               * printed as data and never used as the format string.
++               * scnprintf returns the bytes actually stored, which keeps
++               * ret exact when the name is truncated.
++               */
++              ret += scnprintf(buf + ret, sizeof(tap->params.name) - 1,
++                               "%s", tap->params.name);
++              ret += sprintf(buf + ret, "\n");
++      }
++
++      return ret;
++}
++static CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL);
++
++/*
++ * Remove the class attributes and destroy the device class, if one
++ * was created.
++ */
++void
++blktap_sysfs_free(void)
++{
++      if (class) {
++              class_remove_file(class, &class_attr_verbosity);
++              class_remove_file(class, &class_attr_devices);
++
++              class_destroy(class);
++      }
++}
++
++/*
++ * devnode callback of the class: builds the device node name from
++ * BLKTAP2_DEV_DIR and the device's minor number.  Returns a string
++ * allocated by kasprintf(), or NULL if that allocation fails.
++ */
++static char *blktap_devnode(struct device *dev, mode_t *mode)
++{
++      unsigned int devminor = MINOR(dev->devt);
++
++      return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "blktap%u", devminor);
++}
++
++/*
++ * Create the "blktap2" device class and its "verbosity" and "devices"
++ * attributes.
++ *
++ * Returns -EEXIST if the class already exists, the error from
++ * class_create() or class_create_file() on failure (with everything
++ * created so far undone), and 0 on success, at which point the class
++ * is published in the file-scope pointer.
++ */
++int __init
++blktap_sysfs_init(void)
++{
++      struct class *cls;
++      int err;
++
++      if (class)
++              return -EEXIST;
++
++      cls = class_create(THIS_MODULE, "blktap2");
++      if (IS_ERR(cls))
++              return PTR_ERR(cls);
++
++      cls->devnode = blktap_devnode;
++
++      err = class_create_file(cls, &class_attr_verbosity);
++      if (err)
++              goto fail_class;
++
++      err = class_create_file(cls, &class_attr_devices);
++      if (err)
++              goto fail_verbosity;
++
++      /* Publish the class only once it is fully set up. */
++      class = cls;
++      return err;
++
++fail_verbosity:
++      class_remove_file(cls, &class_attr_verbosity);
++fail_class:
++      class_destroy(cls);
++      return err;
++}
diff --cc drivers/xen/blktap2/wait_queue.c

index 0000000,0000000..f8995aa

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/blktap2/wait_queue.c
@@@ -1,0 -1,0 +1,40 @@@
++#include <linux/list.h>
++#include <linux/spinlock.h>
++
++#include "blktap.h"
++
++static LIST_HEAD(deferred_work_queue);
++static DEFINE_SPINLOCK(deferred_work_lock);
++
++/*
++ * Restart every tap that was queued with blktap_defer().
++ *
++ * The global queue is emptied under deferred_work_lock; the restarts
++ * themselves run after the lock has been dropped.
++ */
++void
++blktap_run_deferred(void)
++{
++      LIST_HEAD(queue);
++      struct blktap *tap;
++      unsigned long flags;
++
++      spin_lock_irqsave(&deferred_work_lock, flags);
++      /* Take over the whole pending list, leaving the global one empty. */
++      list_splice_init(&deferred_work_queue, &queue);
++      /* Clear the marker so these taps can be deferred again later on. */
++      list_for_each_entry(tap, &queue, deferred_queue)
++              clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
++      spin_unlock_irqrestore(&deferred_work_lock, flags);
++
++      /* Drain the private list, unlinking each tap before restarting it. */
++      while (!list_empty(&queue)) {
++              tap = list_entry(queue.next, struct blktap, deferred_queue);
++              list_del_init(&tap->deferred_queue);
++              blktap_device_restart(tap);
++      }
++}
++
++/*
++ * Queue a tap on the deferred work list. A tap that already carries the
++ * BLKTAP_DEFERRED bit is left alone, so it is never queued twice.
++ */
++void
++blktap_defer(struct blktap *tap)
++{
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&deferred_work_lock, irqflags);
++      if (test_bit(BLKTAP_DEFERRED, &tap->dev_inuse))
++              goto out;
++
++      set_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
++      list_add_tail(&tap->deferred_queue, &deferred_work_queue);
++out:
++      spin_unlock_irqrestore(&deferred_work_lock, irqflags);
++}
diff --cc drivers/xen/char/Makefile

index 0000000,0000000..13604ad

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/char/Makefile
@@@ -1,0 -1,0 +1,1 @@@
++obj-$(CONFIG_XEN_DEVMEM)      := mem.o
diff --cc drivers/xen/char/mem.c

index 0000000,0000000..fd3b7b5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/char/mem.c
@@@ -1,0 -1,0 +1,230 @@@
++/*
++ *  Originally from linux/drivers/char/mem.c
++ *
++ *  Copyright (C) 1991, 1992  Linus Torvalds
++ *
++ *  Added devfs support.
++ *    Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
++ *  Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
++ */
++
++#include <linux/mm.h>
++#include <linux/miscdevice.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/mman.h>
++#include <linux/random.h>
++#include <linux/init.h>
++#include <linux/raw.h>
++#include <linux/tty.h>
++#include <linux/capability.h>
++#include <linux/ptrace.h>
++#include <linux/device.h>
++#include <asm/pgalloc.h>
++#include <asm/uaccess.h>
++#include <asm/io.h>
++#include <asm/hypervisor.h>
++
++/*
++ * Return how many of the size bytes starting at start lie before the next
++ * page boundary (at most size).
++ */
++static inline unsigned long size_inside_page(unsigned long start,
++                                           unsigned long size)
++{
++      unsigned long room = PAGE_SIZE - (start & (PAGE_SIZE - 1));
++
++      return size < room ? size : room;
++}
++
++/*
++ * Report whether the mapping should be made uncached: only when the file
++ * was opened with O_DSYNC.
++ */
++static inline int uncached_access(struct file *file)
++{
++      /* Xen sets correct MTRR type on non-RAM for us. */
++      return (file->f_flags & O_DSYNC) ? 1 : 0;
++}
++
++/*
++ * Check every page of the size bytes starting at page frame pfn against
++ * devmem_is_allowed(). Returns 1 when the whole range may be accessed and
++ * 0 (after logging the offending program) otherwise. Without
++ * CONFIG_STRICT_DEVMEM every range is allowed.
++ */
++static inline int range_is_allowed(unsigned long pfn, unsigned long size)
++{
++#ifdef CONFIG_STRICT_DEVMEM
++      u64 from = ((u64)pfn) << PAGE_SHIFT;
++      u64 to = from + size;
++      u64 cursor;
++
++      for (cursor = from; cursor < to; cursor += PAGE_SIZE, pfn++) {
++              if (devmem_is_allowed(pfn))
++                      continue;
++
++              printk(KERN_INFO
++                     "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
++                     current->comm, from, to);
++              return 0;
++      }
++#endif
++      return 1;
++}
++
++/*
++ * This function reads the *physical* memory. The f_pos points directly to the
++ * memory location.
++ *
++ * The request is served in chunks that never cross a page boundary; each
++ * chunk is mapped with ioremap(), copied to user space and unmapped again.
++ *
++ * Returns the number of bytes read, -EPERM when range_is_allowed() rejects
++ * the range, or -EFAULT when copying to user space fails. On the error
++ * returns *ppos is left unchanged, even if earlier chunks were copied.
++ */
++static ssize_t read_mem(struct file *file, char __user *buf,
++                      size_t count, loff_t *ppos)
++{
++      unsigned long p = *ppos;
++      ssize_t read = 0, sz;
++      void __iomem *v;
++
++      while (count > 0) {
++              unsigned long remaining;
++
++              /* Largest chunk that stays within the current page. */
++              sz = size_inside_page(p, count);
++
++              /* Note: the check covers all remaining bytes, not just sz. */
++              if (!range_is_allowed(p >> PAGE_SHIFT, count))
++                      return -EPERM;
++
++              v = ioremap(p, sz);
++              if (IS_ERR(v) || v == NULL) {
++                      /*
++                       * Some programs (e.g., dmidecode) groove off into
++                       * weird RAM areas where no tables can possibly exist
++                       * (because Xen will have stomped on them!). These
++                       * programs get rather upset if we let them know that
++                       * Xen failed their access, so we fake out a read of
++                       * all zeroes.
++                       */
++                      /* Zero-fills the whole remainder and ends the read. */
++                      if (clear_user(buf, count))
++                              return -EFAULT;
++                      read += count;
++                      break;
++              }
++
++              remaining = copy_to_user(buf, v, sz);
++              iounmap(v);
++              if (remaining)
++                      return -EFAULT;
++
++              buf += sz;
++              p += sz;
++              count -= sz;
++              read += sz;
++      }
++
++      *ppos += read;
++      return read;
++}
++
++/*
++ * Write user data to *physical* memory starting at *ppos.
++ *
++ * Like read_mem(), the request is processed in chunks that never cross a
++ * page boundary, each mapped with ioremap() for the duration of the copy.
++ *
++ * Returns the number of bytes written (possibly short, and 0 when the very
++ * first ioremap() returns NULL), -EPERM when range_is_allowed() rejects a
++ * chunk, the ioremap() error when the first mapping fails with an error
++ * pointer, or -EFAULT when nothing at all could be copied from user space.
++ */
++static ssize_t write_mem(struct file *file, const char __user *buf,
++                       size_t count, loff_t *ppos)
++{
++      unsigned long p = *ppos, ignored;
++      ssize_t written = 0, sz;
++      void __iomem *v;
++
++      while (count > 0) {
++              sz = size_inside_page(p, count);
++
++              if (!range_is_allowed(p >> PAGE_SHIFT, sz))
++                      return -EPERM;
++
++              v = ioremap(p, sz);
++              /* A NULL mapping ends the write with whatever was written. */
++              if (v == NULL)
++                      break;
++              if (IS_ERR(v)) {
++                      /* Only report the error if no progress was made. */
++                      if (written == 0)
++                              return PTR_ERR(v);
++                      break;
++              }
++
++              /* copy_from_user() returns the number of bytes NOT copied. */
++              ignored = copy_from_user(v, buf, sz);
++              iounmap(v);
++              if (ignored) {
++                      written += sz - ignored;
++                      if (written)
++                              break;
++                      return -EFAULT;
++              }
++              buf += sz;
++              p += sz;
++              count -= sz;
++              written += sz;
++      }
++
++      *ppos += written;
++      return written;
++}
++
++#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
++/*
++ * VMA operations installed by xen_mmap_mem(). The only hook is .access,
++ * and only when CONFIG_HAVE_IOREMAP_PROT is set; otherwise the table is
++ * empty.
++ */
++static struct vm_operations_struct mmap_mem_ops = {
++#ifdef CONFIG_HAVE_IOREMAP_PROT
++      .access = generic_access_phys
++#endif
++};
++
++/*
++ * mmap handler: map the physical range selected by vma->vm_pgoff into the
++ * caller's address space via direct_remap_pfn_range() with DOMID_IO.
++ *
++ * Returns -EPERM when range_is_allowed() rejects the range, -EINVAL when
++ * phys_mem_access_prot_allowed() does, and otherwise whatever
++ * direct_remap_pfn_range() returns.
++ */
++static int xen_mmap_mem(struct file *file, struct vm_area_struct *vma)
++{
++      size_t size = vma->vm_end - vma->vm_start;
++
++      /* O_DSYNC opens get a non-cached mapping. */
++      if (uncached_access(file))
++              vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
++
++      if (!range_is_allowed(vma->vm_pgoff, size))
++              return -EPERM;
++
++      if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
++                                              &vma->vm_page_prot))
++              return -EINVAL;
++
++      vma->vm_ops = &mmap_mem_ops;
++
++      /* We want to return the real error code, not EAGAIN. */
++      return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
++                                    size, vma->vm_page_prot, DOMID_IO);
++}
++#endif
++
++/*
++ * The memory devices use the full 32/64 bits of the offset, and so we cannot
++ * check against negative addresses: they are ok. The return value is weird,
++ * though, in that case (0).
++ *
++ * also note that seeking relative to the "end of file" isn't supported:
++ * it has no meaning, so it returns -EINVAL.
++ *
++ * Resulting offsets in the topmost 4096 values of the unsigned range are
++ * refused with -EOVERFLOW (see the check below). The file position is
++ * updated under the inode's i_mutex.
++ */
++static loff_t memory_lseek(struct file *file, loff_t offset, int orig)
++{
++      loff_t ret;
++
++      mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
++      switch (orig) {
++      case SEEK_CUR:
++              offset += file->f_pos;
++              /* fall through: treat the sum as an absolute offset */
++      case SEEK_SET:
++              /* to avoid userland mistaking f_pos=-9 as -EBADF=-9 */
++              if ((unsigned long long)offset >= ~0xFFFULL) {
++                      ret = -EOVERFLOW;
++                      break;
++              }
++              file->f_pos = offset;
++              ret = file->f_pos;
++              force_successful_syscall_return();
++              break;
++      default:
++              /* Any other origin, including SEEK_END, is rejected. */
++              ret = -EINVAL;
++      }
++      mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
++      return ret;
++}
++
++/* Opening the device is only permitted for callers with CAP_SYS_RAWIO. */
++static int open_mem(struct inode * inode, struct file * filp)
++{
++      if (!capable(CAP_SYS_RAWIO))
++              return -EPERM;
++
++      return 0;
++}
++
++/*
++ * File operations of the Xen memory device. Not static, so the table is
++ * visible to other translation units.
++ */
++const struct file_operations mem_fops = {
++      .llseek         = memory_lseek,
++      .read           = read_mem,
++      .write          = write_mem,
++      .mmap           = xen_mmap_mem,
++      .open           = open_mem,
++};
diff --cc drivers/xen/console/Makefile

index 0000000,0000000..35de3e9

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/console/Makefile
@@@ -1,0 -1,0 +1,2 @@@
++
++obj-y := console.o xencons_ring.o
diff --cc drivers/xen/console/console.c

index 0000000,0000000..6871fd9

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/console/console.c
@@@ -1,0 -1,0 +1,746 @@@
++/******************************************************************************
++ * console.c
++ * 
++ * Virtual console driver.
++ * 
++ * Copyright (c) 2002-2004, K A Fraser.
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/errno.h>
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/interrupt.h>
++#include <linux/tty.h>
++#include <linux/tty_flip.h>
++#include <linux/serial.h>
++#include <linux/major.h>
++#include <linux/ptrace.h>
++#include <linux/ioport.h>
++#include <linux/mm.h>
++#include <linux/slab.h>
++#include <linux/init.h>
++#include <linux/console.h>
++#include <linux/sysrq.h>
++#include <linux/vt.h>
++#include <asm/io.h>
++#include <asm/irq.h>
++#include <asm/uaccess.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/event_channel.h>
++#include <asm/hypervisor.h>
++#include <xen/evtchn.h>
++#include <xen/xenbus.h>
++#include <xen/xencons.h>
++
++/*
++ * Modes:
++ *  'xencons=off'  [XC_OFF]:     Console is disabled.
++ *  'xencons=tty'  [XC_TTY]:     Console attached to '/dev/tty[0-9]+'.
++ *  'xencons=ttyS' [XC_SERIAL]:  Console attached to '/dev/ttyS[0-9]+'.
++ *  'xencons=xvc'  [XC_XVC]:     Console attached to '/dev/xvc0'.
++ *  'xencons=hvc'  [XC_HVC]:     Console attached to '/dev/hvc0'.
++ *  default:                     XC_XVC
++ * 
++ * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
++ * warnings from standard distro startup scripts.
++ */
++static enum {
++      XC_OFF, XC_TTY, XC_SERIAL, XC_XVC, XC_HVC
++} xc_mode = XC_XVC;
++static int xc_num = -1;
++
++/* /dev/xvc0 device number allocated by lanana.org. */
++#define XEN_XVC_MAJOR 204
++#define XEN_XVC_MINOR 191
++
++/* /dev/hvc0 device number */
++#define XEN_HVC_MAJOR 229
++#define XEN_HVC_MINOR 0
++
++/*
++ * Parse the "xencons=" boot parameter: a mode name (ttyS, tty, xvc, hvc or
++ * off) optionally followed by a decimal console number, e.g. "ttyS1".
++ * An unrecognised name leaves xc_mode unchanged; a missing number leaves
++ * xc_num unchanged. console_use_vt is set to 1, except in "tty" mode where
++ * it is set to 0. Always returns 1.
++ */
++static int __init xencons_setup(char *str)
++{
++      char *q;
++      int n;
++
++      console_use_vt = 1;
++      /* "ttyS" must be tested before "tty", which is a prefix of it. */
++      if (!strncmp(str, "ttyS", 4)) {
++              xc_mode = XC_SERIAL;
++              str += 4;
++      } else if (!strncmp(str, "tty", 3)) {
++              xc_mode = XC_TTY;
++              str += 3;
++              console_use_vt = 0;
++      } else if (!strncmp(str, "xvc", 3)) {
++              xc_mode = XC_XVC;
++              str += 3;
++      } else if (!strncmp(str, "hvc", 3)) {
++              xc_mode = XC_HVC;
++              str += 3;
++      } else if (!strncmp(str, "off", 3)) {
++              xc_mode = XC_OFF;
++              str += 3;
++      }
++
++      /* Accept the number only if at least one digit was consumed. */
++      n = simple_strtol(str, &q, 10);
++      if (q != str)
++              xc_num = n;
++
++      return 1;
++}
++__setup("xencons=", xencons_setup);
++
++/*
++ * The kernel and user-land drivers share a common transmit buffer.
++ *
++ * wbuf is used as a circular buffer: wp and wc are free-running counters
++ * that are only reduced to an index through WBUF_MASK(). The mask is
++ * wbuf_size - 1, so wbuf_size has to stay a power of two (the default is
++ * 4096 and xencons_bufsz_setup() rounds its value up accordingly).
++ */
++static unsigned int wbuf_size = 4096;
++#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
++static char *wbuf;
++static unsigned int wc, wp; /* write_cons, write_prod */
++
++/*
++ * Parse the "xencons_bufsz=" boot parameter. A non-zero value is rounded up
++ * to a power of two and may only enlarge the transmit buffer, never shrink
++ * it. Always returns 1.
++ */
++static int __init xencons_bufsz_setup(char *str)
++{
++      unsigned int goal = simple_strtoul(str, NULL, 0);
++
++      if (!goal)
++              return 1;
++
++      goal = roundup_pow_of_two(goal);
++      if (goal > wbuf_size)
++              wbuf_size = goal;
++
++      return 1;
++}
++__setup("xencons_bufsz=", xencons_bufsz_setup);
++
++/* This lock protects accesses to the common transmit buffer. */
++static DEFINE_SPINLOCK(xencons_lock);
++
++/* Common transmit-kick routine. */
++static void __xencons_tx_flush(void);
++
++static struct tty_driver *xencons_driver;
++
++/******************** Kernel console driver ********************************/
++
++/*
++ * Kernel console write routine used when not running as the initial
++ * domain: copy the message into the shared transmit buffer and flush it.
++ * Every '\n' is followed by an additional '\r'.
++ *
++ * Runs with xencons_lock held and interrupts disabled, and keeps looping
++ * (flushing in between) until the whole message has been queued.
++ */
++static void kcons_write(struct console *c, const char *s, unsigned int count)
++{
++      int           i = 0;
++      unsigned long flags;
++
++      spin_lock_irqsave(&xencons_lock, flags);
++
++      while (i < count) {
++              for (; i < count; i++) {
++                      /* Stop filling unless two slots are free: "\n\r". */
++                      if ((wp - wc) >= (wbuf_size - 1))
++                              break;
++                      if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
++                              wbuf[WBUF_MASK(wp++)] = '\r';
++              }
++
++              __xencons_tx_flush();
++      }
++
++      spin_unlock_irqrestore(&xencons_lock, flags);
++}
++
++/*
++ * Kernel console write routine for the initial domain: hand the message
++ * straight to the hypervisor with CONSOLEIO_write, retrying with the
++ * remainder after a partial write. Gives up silently as soon as the
++ * hypercall returns zero or a negative value.
++ */
++static void kcons_write_dom0(struct console *c, const char *s, unsigned int count)
++{
++      int rc;
++
++      for (; count > 0; count -= rc, s += rc) {
++              rc = HYPERVISOR_console_io( CONSOLEIO_write, count, (char *)s);
++              if (rc <= 0)
++                      break;
++      }
++}
++
++/*
++ * Console .device callback: always reports index 0 of the Xen console tty
++ * driver (xencons_driver, which is NULL until xencons_init() has set it).
++ */
++static struct tty_driver *kcons_device(struct console *c, int *index)
++{
++      *index = 0;
++      return xencons_driver;
++}
++
++/*
++ * Kernel console descriptor. The .write handler and .name are not set
++ * here; xen_console_init() fills them in before registering the console.
++ */
++static struct console kcons_info = {
++      .device = kcons_device,
++      .flags  = CON_PRINTBUFFER | CON_ENABLED,
++      .index  = -1,
++};
++
++/*
++ * Console initcall: select the write routine and console name matching the
++ * domain type and the configured xencons mode, allocate the shared transmit
++ * buffer and register the kernel console.
++ *
++ * Nothing is registered when not running on Xen, when an unprivileged
++ * domain has no console event channel, or when the mode is XC_OFF; these
++ * cases return 0. Returns -ENOMEM, also without registering, when the
++ * transmit buffer cannot be allocated.
++ */
++static int __init xen_console_init(void)
++{
++      if (!is_running_on_xen())
++              goto out;
++
++      if (is_initial_xendomain()) {
++              kcons_info.write = kcons_write_dom0;
++      } else {
++              if (!xen_start_info->console.domU.evtchn)
++                      goto out;
++              kcons_info.write = kcons_write;
++      }
++
++      /* Pick the console name; an unset console number gets a default. */
++      switch (xc_mode) {
++      case XC_XVC:
++              strcpy(kcons_info.name, "xvc");
++              if (xc_num == -1)
++                      xc_num = 0;
++              break;
++
++      case XC_HVC:
++              strcpy(kcons_info.name, "hvc");
++              if (xc_num == -1)
++                      xc_num = 0;
++              if (!is_initial_xendomain())
++                      add_preferred_console(kcons_info.name, xc_num, NULL);
++              break;
++
++      case XC_SERIAL:
++              strcpy(kcons_info.name, "ttyS");
++              if (xc_num == -1)
++                      xc_num = 0;
++              break;
++
++      case XC_TTY:
++              strcpy(kcons_info.name, "tty");
++              if (xc_num == -1)
++                      xc_num = 1;
++              break;
++
++      default:
++              goto out;
++      }
++
++      /*
++       * kcons_write() stores into wbuf unconditionally, so registering the
++       * console without a buffer would crash on the first message.
++       */
++      wbuf = kmalloc(wbuf_size, GFP_KERNEL);
++      if (!wbuf)
++              return -ENOMEM;
++
++      register_console(&kcons_info);
++
++ out:
++      return 0;
++}
++console_initcall(xen_console_init);
++
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++/*** Useful function for console debugging -- goes straight to Xen. ***/
++/*
++ * Format the message into a static 1024-byte buffer and pass it directly to
++ * the hypervisor console. Longer messages are truncated. The buffer is
++ * shared by all callers and is not protected by any lock here. Always
++ * returns 0.
++ */
++asmlinkage int xprintk(const char *fmt, ...)
++{
++      va_list args;
++      int printk_len;
++      static char printk_buf[1024];
++
++      /* Emit the output into the temporary buffer */
++      va_start(args, fmt);
++      printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
++      va_end(args);
++
++      /*
++       * vsnprintf() returns the length the full message would have had,
++       * which can exceed the buffer. Never send more than was stored.
++       */
++      if (printk_len >= (int)sizeof(printk_buf))
++              printk_len = sizeof(printk_buf) - 1;
++
++      /* Send the processed output directly to Xen. */
++      kcons_write_dom0(NULL, printk_buf, printk_len);
++
++      return 0;
++}
++#endif
++
++/*** Forcibly flush console data before dying. ***/
++/*
++ * Busy-wait until everything in the transmit buffer has been handed to the
++ * console ring. Does nothing when not on Xen, in the initial domain, or when
++ * there is no console event channel. Does not take xencons_lock.
++ */
++void xencons_force_flush(void)
++{
++      int sz;
++
++      /* Emergency console is synchronous, so there's nothing to flush. */
++      if (!is_running_on_xen() ||
++          is_initial_xendomain() ||
++          !xen_start_info->console.domU.evtchn)
++              return;
++
++      /* Spin until console data is flushed through to the daemon. */
++      while (wc != wp) {
++              int sent = 0;
++
++              sz = wp - wc;
++              /*
++               * Pending data may wrap around the end of the circular
++               * buffer; only send the contiguous part in one go, exactly
++               * as __xencons_tx_flush() does.
++               */
++              if (sz > (wbuf_size - WBUF_MASK(wc)))
++                      sz = wbuf_size - WBUF_MASK(wc);
++              sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
++              if (sent > 0)
++                      wc += sent;
++      }
++}
++
++
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++#include <linux/screen_info.h>
++
++/*
++ * Fill the global screen_info from the VGA console description passed in
++ * info. size is the number of valid bytes in *info and is checked before
++ * any video-type specific field is read.
++ *
++ * 80x25 text-mode defaults are installed first; they stay in effect when
++ * the video type is not handled or the structure is too short.
++ */
++void __init dom0_init_screen_info(const struct dom0_vga_console_info *info, size_t size)
++{
++      /* This is drawn from a dump from vgacon:startup in
++       * standard Linux. */
++      screen_info.orig_video_mode = 3;
++      screen_info.orig_video_isVGA = 1;
++      screen_info.orig_video_lines = 25;
++      screen_info.orig_video_cols = 80;
++      screen_info.orig_video_ega_bx = 3;
++      screen_info.orig_video_points = 16;
++      screen_info.orig_y = screen_info.orig_video_lines - 1;
++
++      switch (info->video_type) {
++      case XEN_VGATYPE_TEXT_MODE_3:
++              /* Require the complete text_mode_3 member to be present. */
++              if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
++                         + sizeof(info->u.text_mode_3))
++                      break;
++              screen_info.orig_video_lines = info->u.text_mode_3.rows;
++              screen_info.orig_video_cols = info->u.text_mode_3.columns;
++              screen_info.orig_x = info->u.text_mode_3.cursor_x;
++              screen_info.orig_y = info->u.text_mode_3.cursor_y;
++              screen_info.orig_video_points =
++                      info->u.text_mode_3.font_height;
++              break;
++
++      case XEN_VGATYPE_VESA_LFB:
++              /* Everything up to (not including) gbl_caps is mandatory. */
++              if (size < offsetof(struct dom0_vga_console_info,
++                                  u.vesa_lfb.gbl_caps))
++                      break;
++              screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
++              screen_info.lfb_width = info->u.vesa_lfb.width;
++              screen_info.lfb_height = info->u.vesa_lfb.height;
++              screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel;
++              screen_info.lfb_base = info->u.vesa_lfb.lfb_base;
++              screen_info.lfb_size = info->u.vesa_lfb.lfb_size;
++              screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line;
++              screen_info.red_size = info->u.vesa_lfb.red_size;
++              screen_info.red_pos = info->u.vesa_lfb.red_pos;
++              screen_info.green_size = info->u.vesa_lfb.green_size;
++              screen_info.green_pos = info->u.vesa_lfb.green_pos;
++              screen_info.blue_size = info->u.vesa_lfb.blue_size;
++              screen_info.blue_pos = info->u.vesa_lfb.blue_pos;
++              screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size;
++              screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos;
++              /* gbl_caps and mode_attrs are copied only when supplied. */
++              if (size >= offsetof(struct dom0_vga_console_info,
++                                   u.vesa_lfb.gbl_caps)
++                          + sizeof(info->u.vesa_lfb.gbl_caps))
++                      screen_info.capabilities = info->u.vesa_lfb.gbl_caps;
++              if (size >= offsetof(struct dom0_vga_console_info,
++                                   u.vesa_lfb.mode_attrs)
++                          + sizeof(info->u.vesa_lfb.mode_attrs))
++                      screen_info.vesa_attributes = info->u.vesa_lfb.mode_attrs;
++              break;
++      }
++}
++#endif
++
++
++/******************** User-space console driver (/dev/console) ************/
++
++#define DRV(_d)         (_d)
++#define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) &&               \
++                       ((_tty)->index != (xc_num - 1)))
++
++static struct ktermios *xencons_termios[MAX_NR_CONSOLES];
++static struct ktermios *xencons_termios_locked[MAX_NR_CONSOLES];
++static struct tty_struct *xencons_tty;
++static int xencons_priv_irq;
++static char x_char;
++
++/*
++ * Deliver len received console bytes from buf to the open console tty.
++ * The data is silently dropped when no tty is currently attached.
++ *
++ * With CONFIG_MAGIC_SYSRQ, ^O is an escape character: a character that
++ * follows it within two seconds is passed to handle_sysrq() instead of the
++ * tty, and ^O ^O delivers a single literal ^O.
++ */
++void xencons_rx(char *buf, unsigned len)
++{
++      int           i;
++      unsigned long flags;
++
++      spin_lock_irqsave(&xencons_lock, flags);
++      if (xencons_tty == NULL)
++              goto out;
++
++      for (i = 0; i < len; i++) {
++#ifdef CONFIG_MAGIC_SYSRQ
++              /* jiffies value at the pending ^O, or 0 when none pending. */
++              static unsigned long sysrq_requested;
++
++              if (buf[i] == '\x0f') { /* ^O */
++                      if (!sysrq_requested) {
++                              sysrq_requested = jiffies;
++                              continue; /* don't print sysrq key */
++                      }
++                      /* Second ^O in a row: pass it on as ordinary input. */
++                      sysrq_requested = 0;
++              } else if (sysrq_requested) {
++                      unsigned long sysrq_timeout = sysrq_requested + HZ*2;
++
++                      sysrq_requested = 0;
++                      if (time_before(jiffies, sysrq_timeout)) {
++                              /* The lock is dropped around the handler. */
++                              spin_unlock_irqrestore(&xencons_lock, flags);
++                              handle_sysrq(buf[i]);
++                              spin_lock_irqsave(&xencons_lock, flags);
++                              continue;
++                      }
++              }
++#endif
++              tty_insert_flip_char(xencons_tty, buf[i], 0);
++      }
++      tty_flip_buffer_push(xencons_tty);
++
++ out:
++      spin_unlock_irqrestore(&xencons_lock, flags);
++}
++
++/*
++ * Common transmit-kick routine: send a pending x_char first, then as much
++ * of the transmit buffer as the backend accepts, and wake up writers if
++ * anything was sent. The callers in this file hold xencons_lock.
++ */
++static void __xencons_tx_flush(void)
++{
++      int sent, sz, work_done = 0;
++
++      if (x_char) {
++              if (is_initial_xendomain())
++                      kcons_write_dom0(NULL, &x_char, 1);
++              else
++                      /* Retry until the ring has taken the character. */
++                      while (x_char)
++                              if (xencons_ring_send(&x_char, 1) == 1)
++                                      break;
++              x_char = 0;
++              work_done = 1;
++      }
++
++      while (wc != wp) {
++              sz = wp - wc;
++              /* Send at most the contiguous part up to the buffer's end. */
++              if (sz > (wbuf_size - WBUF_MASK(wc)))
++                      sz = wbuf_size - WBUF_MASK(wc);
++              if (is_initial_xendomain()) {
++                      kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
++                      wc += sz;
++              } else {
++                      sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
++                      /* Nothing accepted: leave the rest for a later kick. */
++                      if (sent == 0)
++                              break;
++                      wc += sent;
++              }
++              work_done = 1;
++      }
++
++      if (work_done && (xencons_tty != NULL)) {
++              wake_up_interruptible(&xencons_tty->write_wait);
++              tty_wakeup(xencons_tty);
++      }
++}
++
++/* Locked wrapper around __xencons_tx_flush() for external callers. */
++void xencons_tx(void)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&xencons_lock, flags);
++      __xencons_tx_flush();
++      spin_unlock_irqrestore(&xencons_lock, flags);
++}
++
++/*
++ * Privileged receive callback and transmit kicker: drain the hypervisor
++ * console input in chunks of up to sizeof(rbuf) bytes into xencons_rx(),
++ * then kick the transmit side.
++ */
++static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id)
++{
++      static char rbuf[16];
++      int         nread;
++
++      for (;;) {
++              nread = HYPERVISOR_console_io(CONSOLEIO_read, sizeof(rbuf), rbuf);
++              if (nread <= 0)
++                      break;
++              xencons_rx(rbuf, nread);
++      }
++
++      xencons_tx();
++
++      return IRQ_HANDLED;
++}
++
++/* Number of bytes that can still be queued in the transmit buffer. */
++static int xencons_write_room(struct tty_struct *tty)
++{
++      unsigned int used = wp - wc;
++
++      return wbuf_size - used;
++}
++
++/* Number of bytes queued in the transmit buffer but not yet sent. */
++static int xencons_chars_in_buffer(struct tty_struct *tty)
++{
++      return wp - wc;
++}
++
++/*
++ * Send a single high-priority character: it is stored in x_char, which
++ * __xencons_tx_flush() transmits ahead of the buffered data. Ignored on
++ * dummy ttys.
++ */
++static void xencons_send_xchar(struct tty_struct *tty, char ch)
++{
++      unsigned long flags;
++
++      if (DUMMY_TTY(tty))
++              return;
++
++      spin_lock_irqsave(&xencons_lock, flags);
++      x_char = ch;
++      __xencons_tx_flush();
++      spin_unlock_irqrestore(&xencons_lock, flags);
++}
++
++/* Throttle input: send the STOP character when IXOFF is set on a real tty. */
++static void xencons_throttle(struct tty_struct *tty)
++{
++      if (!DUMMY_TTY(tty) && I_IXOFF(tty))
++              xencons_send_xchar(tty, STOP_CHAR(tty));
++}
++
++/*
++ * Unthrottle input on a real tty with IXOFF set: cancel a priority
++ * character that is still pending, otherwise send the START character.
++ */
++static void xencons_unthrottle(struct tty_struct *tty)
++{
++      if (DUMMY_TTY(tty) || !I_IXOFF(tty))
++              return;
++
++      if (x_char != 0) {
++              x_char = 0;
++              return;
++      }
++
++      xencons_send_xchar(tty, START_CHAR(tty));
++}
++
++/*
++ * Discard everything queued in the transmit buffer by resetting both ring
++ * counters. Ignored on dummy ttys.
++ */
++static void xencons_flush_buffer(struct tty_struct *tty)
++{
++      unsigned long flags;
++
++      if (DUMMY_TTY(tty))
++              return;
++
++      spin_lock_irqsave(&xencons_lock, flags);
++      wc = wp = 0;
++      spin_unlock_irqrestore(&xencons_lock, flags);
++}
++
++/*
++ * Append one character to the transmit buffer. Returns 1 when it was
++ * stored and 0 when the buffer is full. The callers in this file hold
++ * xencons_lock.
++ */
++static inline int __xencons_put_char(int ch)
++{
++      if ((wp - wc) == wbuf_size)
++              return 0;
++
++      wbuf[WBUF_MASK(wp++)] = (char)ch;
++      return 1;
++}
++
++/*
++ * tty write handler: queue as many of the count bytes as fit into the
++ * transmit buffer and kick the transmitter when at least one was queued.
++ * Returns the number of bytes accepted; a dummy tty pretends to accept
++ * everything.
++ */
++static int xencons_write(
++      struct tty_struct *tty,
++      const unsigned char *buf,
++      int count)
++{
++      int queued = 0;
++      unsigned long irqflags;
++
++      if (DUMMY_TTY(tty))
++              return count;
++
++      spin_lock_irqsave(&xencons_lock, irqflags);
++
++      while (queued < count && __xencons_put_char(buf[queued]))
++              queued++;
++
++      if (queued != 0)
++              __xencons_tx_flush();
++
++      spin_unlock_irqrestore(&xencons_lock, irqflags);
++
++      return queued;
++}
++
++/*
++ * tty put_char handler: queue a single character without kicking the
++ * transmitter. Returns 1 when the character was queued, 0 when the buffer
++ * is full or the tty is a dummy.
++ */
++static int xencons_put_char(struct tty_struct *tty, u_char ch)
++{
++      unsigned long flags;
++      int ret;
++
++      if (DUMMY_TTY(tty))
++              return 0;
++
++      spin_lock_irqsave(&xencons_lock, flags);
++      ret = __xencons_put_char(ch);
++      spin_unlock_irqrestore(&xencons_lock, flags);
++      return ret;
++}
++
++/*
++ * tty flush_chars handler: push out whatever has been queued in the
++ * transmit buffer. Ignored on dummy ttys.
++ */
++static void xencons_flush_chars(struct tty_struct *tty)
++{
++      unsigned long flags;
++
++      if (DUMMY_TTY(tty))
++              return;
++
++      spin_lock_irqsave(&xencons_lock, flags);
++      __xencons_tx_flush();
++      spin_unlock_irqrestore(&xencons_lock, flags);
++}
++
++/*
++ * Sleep in one-tick steps until the tty has no characters left in its
++ * buffer, a signal is pending, or (for a non-zero timeout) more than
++ * timeout jiffies have passed. A timeout of 0 disables the time limit.
++ */
++static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
++{
++      unsigned long orig_jiffies = jiffies;
++
++      if (DUMMY_TTY(tty))
++              return;
++
++      while (tty_chars_in_buffer(tty)) {
++              set_current_state(TASK_INTERRUPTIBLE);
++              schedule_timeout(1);
++              if (signal_pending(current))
++                      break;
++              if (timeout && time_after(jiffies, orig_jiffies + timeout))
++                      break;
++      }
++
++      /* Make sure we leave in the running state on every exit path. */
++      set_current_state(TASK_RUNNING);
++}
++
++/*
++ * tty open handler. If no console tty is attached yet, this tty becomes
++ * the one that xencons_rx() delivers input to; output already buffered is
++ * then flushed. Opening a dummy tty succeeds without doing anything.
++ * Always returns 0.
++ */
++static int xencons_open(struct tty_struct *tty, struct file *filp)
++{
++      unsigned long flags;
++
++      if (DUMMY_TTY(tty))
++              return 0;
++
++      spin_lock_irqsave(&xencons_lock, flags);
++      tty->driver_data = NULL;
++      if (xencons_tty == NULL)
++              xencons_tty = tty;
++      __xencons_tx_flush();
++      spin_unlock_irqrestore(&xencons_lock, flags);
++
++      return 0;
++}
++
++/*
++ * tty close handler. Only acts when tty->count is 1: waits for pending
++ * output, flushes the driver and line-discipline buffers and detaches
++ * the tty from the console. Ignored on dummy ttys.
++ */
++static void xencons_close(struct tty_struct *tty, struct file *filp)
++{
++      unsigned long flags;
++
++      if (DUMMY_TTY(tty))
++              return;
++
++      mutex_lock(&tty_mutex);
++
++      /* Any other count: nothing to tear down here. */
++      if (tty->count != 1) {
++              mutex_unlock(&tty_mutex);
++              return;
++      }
++
++      /* Prevent other threads from re-opening this tty. */
++      set_bit(TTY_CLOSING, &tty->flags);
++      mutex_unlock(&tty_mutex);
++
++      tty->closing = 1;
++      tty_wait_until_sent(tty, 0);
++      tty_driver_flush_buffer(tty);
++      if (tty->ldisc->ops->flush_buffer)
++              tty->ldisc->ops->flush_buffer(tty);
++      tty->closing = 0;
++      /* From here on xencons_rx() drops incoming data. */
++      spin_lock_irqsave(&xencons_lock, flags);
++      xencons_tty = NULL;
++      spin_unlock_irqrestore(&xencons_lock, flags);
++}
++
++/* tty operations of the Xen console driver, installed by xencons_init(). */
++static const struct tty_operations xencons_ops = {
++      .open = xencons_open,
++      .close = xencons_close,
++      .write = xencons_write,
++      .write_room = xencons_write_room,
++      .put_char = xencons_put_char,
++      .flush_chars = xencons_flush_chars,
++      .chars_in_buffer = xencons_chars_in_buffer,
++      .send_xchar = xencons_send_xchar,
++      .flush_buffer = xencons_flush_buffer,
++      .throttle = xencons_throttle,
++      .unthrottle = xencons_unthrottle,
++      .wait_until_sent = xencons_wait_until_sent,
++};
++
++/*
++ * Module init: allocate and register the Xen console tty driver under the
++ * name and device numbers that match xc_mode, and in the initial domain
++ * bind VIRQ_CONSOLE to xencons_priv_interrupt().
++ *
++ * Returns 0 on success or when the console is switched off, -ENODEV when
++ * not running on Xen, -ENOMEM when the driver cannot be allocated, or the
++ * error from xencons_ring_init() / tty_register_driver().
++ */
++static int __init xencons_init(void)
++{
++      int rc;
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      if (xc_mode == XC_OFF)
++              return 0;
++
++      /* Only unprivileged domains use the console ring. */
++      if (!is_initial_xendomain()) {
++              rc = xencons_ring_init();
++              if (rc)
++                      return rc;
++      }
++
++      /* tty mode provides MAX_NR_CONSOLES lines, other modes just one. */
++      xencons_driver = alloc_tty_driver((xc_mode == XC_TTY) ?
++                                        MAX_NR_CONSOLES : 1);
++      if (xencons_driver == NULL)
++              return -ENOMEM;
++
++      DRV(xencons_driver)->name            = "xencons";
++      DRV(xencons_driver)->major           = TTY_MAJOR;
++      DRV(xencons_driver)->type            = TTY_DRIVER_TYPE_SERIAL;
++      DRV(xencons_driver)->subtype         = SERIAL_TYPE_NORMAL;
++      DRV(xencons_driver)->init_termios    = tty_std_termios;
++      DRV(xencons_driver)->flags           =
++              TTY_DRIVER_REAL_RAW |
++              TTY_DRIVER_RESET_TERMIOS;
++      DRV(xencons_driver)->termios         = xencons_termios;
++      DRV(xencons_driver)->termios_locked  = xencons_termios_locked;
++
++      /* Override name and device numbers according to the selected mode. */
++      switch (xc_mode) {
++      case XC_XVC:
++              DRV(xencons_driver)->name        = "xvc";
++              DRV(xencons_driver)->major       = XEN_XVC_MAJOR;
++              DRV(xencons_driver)->minor_start = XEN_XVC_MINOR;
++              DRV(xencons_driver)->name_base   = xc_num;
++              break;
++      case XC_HVC:
++              DRV(xencons_driver)->name        = "hvc";
++              DRV(xencons_driver)->major       = XEN_HVC_MAJOR;
++              DRV(xencons_driver)->minor_start = XEN_HVC_MINOR;
++              DRV(xencons_driver)->name_base   = xc_num;
++              break;
++      case XC_SERIAL:
++              DRV(xencons_driver)->name        = "ttyS";
++              DRV(xencons_driver)->minor_start = 64 + xc_num;
++              DRV(xencons_driver)->name_base   = xc_num;
++              break;
++      default:
++              DRV(xencons_driver)->name        = "tty";
++              DRV(xencons_driver)->minor_start = 1;
++              DRV(xencons_driver)->name_base   = 1;
++              break;
++      }
++
++      tty_set_operations(xencons_driver, &xencons_ops);
++
++      if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) {
++              pr_warning("WARNING: Failed to register Xen virtual "
++                         "console driver as '%s%d'\n",
++                         DRV(xencons_driver)->name,
++                         DRV(xencons_driver)->name_base);
++              /* Release the driver and reset the global pointer. */
++              put_tty_driver(xencons_driver);
++              xencons_driver = NULL;
++              return rc;
++      }
++
++      if (is_initial_xendomain()) {
++              xencons_priv_irq = bind_virq_to_irqhandler(
++                      VIRQ_CONSOLE,
++                      0,
++                      xencons_priv_interrupt,
++                      0,
++                      "console",
++                      NULL);
++              /* Failing to bind the console VIRQ is treated as fatal. */
++              BUG_ON(xencons_priv_irq < 0);
++      }
++
++      pr_info("Xen virtual console successfully installed as %s%d\n",
++              DRV(xencons_driver)->name, xc_num);
++
++      return 0;
++}
++
++module_init(xencons_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/console/xencons_ring.c

index 0000000,0000000..be5c9d0

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/console/xencons_ring.c
@@@ -1,0 -1,0 +1,143 @@@
++/* 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/errno.h>
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/interrupt.h>
++#include <linux/tty.h>
++#include <linux/tty_flip.h>
++#include <linux/serial.h>
++#include <linux/major.h>
++#include <linux/ptrace.h>
++#include <linux/ioport.h>
++#include <linux/mm.h>
++#include <linux/slab.h>
++
++#include <asm/hypervisor.h>
++#include <xen/evtchn.h>
++#include <xen/xencons.h>
++#include <linux/wait.h>
++#include <linux/interrupt.h>
++#include <linux/sched.h>
++#include <linux/err.h>
++#include <xen/interface/io/console.h>
++
++static int xencons_irq;
++
++/*
++ * Translate the console page's machine frame number, taken from the
++ * start info, into a kernel virtual address.
++ */
++static inline struct xencons_interface *xencons_interface(void)
++{
++      struct xencons_interface *ring;
++
++      ring = mfn_to_virt(xen_start_info->console.domU.mfn);
++      return ring;
++}
++
++/* Signal the console event channel recorded in the start info. */
++static inline void notify_daemon(void)
++{
++      /* Use evtchn: this is called early, before irq is set up. */
++      notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
++}
++
++/*
++ * Copy bytes from @data into the console output ring.
++ *
++ * Copies until @len bytes are queued or the ring is full, then publishes
++ * the new producer index and signals the event channel.  Returns the
++ * number of bytes actually queued, which may be less than @len.
++ */
++int xencons_ring_send(const char *data, unsigned len)
++{
++      int sent = 0;
++      struct xencons_interface *intf = xencons_interface();
++      XENCONS_RING_IDX cons, prod;
++
++      cons = intf->out_cons;
++      prod = intf->out_prod;
++      mb();
++      /* The two indices must never be more than one ring size apart. */
++      BUG_ON((prod - cons) > sizeof(intf->out));
++
++      while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
++              intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
++
++      /* Barrier between the data stores and the producer index store. */
++      wmb();
++      intf->out_prod = prod;
++
++      /* Signalled unconditionally, even when nothing was copied. */
++      notify_daemon();
++
++      return sent;
++}
++
++/*
++ * Interrupt handler bound to the console event channel.
++ *
++ * Hands every pending byte of the input ring to xencons_rx(), one byte
++ * per call, publishes the new consumer index, signals the event channel
++ * and finally calls xencons_tx().  Always returns IRQ_HANDLED.
++ */
++static irqreturn_t handle_input(int irq, void *unused)
++{
++      struct xencons_interface *intf = xencons_interface();
++      XENCONS_RING_IDX cons, prod;
++
++      cons = intf->in_cons;
++      prod = intf->in_prod;
++      mb();
++      /* The two indices must never be more than one ring size apart. */
++      BUG_ON((prod - cons) > sizeof(intf->in));
++
++      while (cons != prod) {
++              xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1);
++              cons++;
++      }
++
++      /* Barrier between the ring reads and the consumer index store. */
++      mb();
++      intf->in_cons = cons;
++
++      notify_daemon();
++
++      xencons_tx();
++
++      return IRQ_HANDLED;
++}
++
++/*
++ * (Re)bind the console event channel to handle_input().
++ *
++ * Any previously bound irq is released first.  Returns 0 on success,
++ * -ENODEV when not running on Xen, when running as the initial domain or
++ * when the start info carries no console event channel, and the negative
++ * value from the bind call when binding fails.
++ */
++int xencons_ring_init(void)
++{
++      int rc;
++
++      if (xencons_irq) {
++              unbind_from_irqhandler(xencons_irq, NULL);
++              xencons_irq = 0;
++      }
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++      if (is_initial_xendomain())
++              return -ENODEV;
++      if (!xen_start_info->console.domU.evtchn)
++              return -ENODEV;
++
++      rc = bind_caller_port_to_irqhandler(xen_start_info->console.domU.evtchn,
++                                          handle_input, 0, "xencons", NULL);
++      if (rc >= 0) {
++              xencons_irq = rc;
++
++              /* In case we have in-flight data after save/restore... */
++              notify_daemon();
++
++              return 0;
++      }
++
++      pr_err("XEN console request irq failed %i\n", rc);
++      return rc;
++}
++
++/* Re-run xencons_ring_init(); its return value is deliberately ignored. */
++void xencons_resume(void)
++{
++      (void)xencons_ring_init();
++}
diff --cc drivers/xen/core/Makefile

index 0000000,0000000..406183d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/Makefile
@@@ -1,0 -1,0 +1,19 @@@
++#
++# Makefile for the linux kernel.
++#
++
++# Objects that are always built.
++obj-y := evtchn.o gnttab.o reboot.o machine_reboot.o
++
++# priv-y collects objects that are only added for privileged guests
++# (see the CONFIG_XEN_PRIVILEGED_GUEST line below).
++priv-$(CONFIG_PCI)            += pci.o
++obj-$(CONFIG_XEN_PRIVILEGED_GUEST) += firmware.o pcpu.o $(priv-y)
++obj-$(CONFIG_PROC_FS)         += xen_proc.o
++obj-$(CONFIG_SYS_HYPERVISOR)  += hypervisor_sysfs.o
++obj-$(CONFIG_HOTPLUG_CPU)     += cpu_hotplug.o
++obj-$(CONFIG_XEN_SYSFS)               += xen_sysfs.o
++obj-$(CONFIG_XEN_SMPBOOT)     += smpboot.o
++obj-$(CONFIG_SMP)             += spinlock.o
++obj-$(CONFIG_KEXEC)           += machine_kexec.o
++obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
++obj-$(CONFIG_XEN_DOMCTL)      += domctl.o
++# Per-object flags for domctl.o.  NOTE(review): the -D options appear to
++# pre-define the include guards of xen.h and grant_table.h while the two
++# interface headers are read via -imacros; domctl.c #undefs the same
++# names again before its own includes -- confirm against domctl.c.
++CFLAGS_domctl.o                       := -D__XEN_PUBLIC_XEN_H__ -D__XEN_PUBLIC_GRANT_TABLE_H__
++CFLAGS_domctl.o                       += -D__XEN_TOOLS__ -imacros xen/interface/domctl.h -imacros xen/interface/sysctl.h
diff --cc drivers/xen/core/acpi_memhotplug.c

index 0000000,0000000..2c20cf7

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/acpi_memhotplug.c
@@@ -1,0 -1,0 +1,192 @@@
++/*
++ *  xen_acpi_memhotplug.c - interface to notify Xen on memory device hotadd
++ *
++ *  Copyright (C) 2008, Intel corporation
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License as published by
++ *  the Free Software Foundation; either version 2 of the License, or (at
++ *  your option) any later version.
++ *
++ *  This program is distributed in the hope that it will be useful, but
++ *  WITHOUT ANY WARRANTY; without even the implied warranty of
++ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ *  General Public License for more details.
++ *
++ *  You should have received a copy of the GNU General Public License along
++ *  with this program; if not, write to the Free Software Foundation, Inc.,
++ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
++ *
++ */
++
++#include <xen/interface/platform.h>
++#include <asm/hypervisor.h>
++
++/* One queued hot-added memory range waiting to be reported to Xen. */
++struct xen_hotmem_entry {
++      struct list_head hotmem_list;   /* link in xen_hotmem.list */
++      uint64_t start;                 /* byte address; shifted to spfn */
++      uint64_t end;                   /* start + length; shifted to epfn */
++      uint32_t flags;                 /* passed through as mem_add.flags */
++      uint32_t pxm;                   /* passed through as mem_add.pxm */
++};
++
++/* Queue head plus a count of the entries currently on the queue. */
++struct xen_hotmem_list {
++      struct list_head list;
++      unsigned int entry_nr;
++};
++
++/* The single queue of pending ranges; entry_nr starts out as zero. */
++static struct xen_hotmem_list xen_hotmem = {
++      .list = LIST_HEAD_INIT(xen_hotmem.list)
++};
++/* Taken around every update of xen_hotmem.list and xen_hotmem.entry_nr. */
++static DEFINE_SPINLOCK(xen_hotmem_lock);
++
++/*
++ * Issue the XENPF_mem_hotadd platform operation for @entry.
++ *
++ * The byte addresses stored in the entry are converted to page frame
++ * numbers by shifting with PAGE_SHIFT.  Returns whatever the hypercall
++ * returns.
++ */
++static int xen_hyper_addmem(struct xen_hotmem_entry *entry)
++{
++      xen_platform_op_t request = {
++              .cmd                    = XENPF_mem_hotadd,
++              .interface_version      = XENPF_INTERFACE_VERSION,
++              .u.mem_add = {
++                      .spfn   = entry->start >> PAGE_SHIFT,
++                      .epfn   = entry->end >> PAGE_SHIFT,
++                      .flags  = entry->flags,
++                      .pxm    = entry->pxm,
++              },
++      };
++
++      return HYPERVISOR_platform_op(&request);
++}
++
++/*
++ * Queue one hot-added memory range for later reporting to the hypervisor.
++ *
++ * @pxm:    proximity domain of the range; negative values are rejected
++ * @start:  first byte address of the range
++ * @length: size of the range in bytes; zero is rejected
++ * @flags:  stored in the entry and later passed through as mem_add.flags
++ *
++ * Returns 0 on success, -EINVAL for a negative @pxm or zero @length and
++ * -ENOMEM when the entry cannot be allocated.  The entry is released by
++ * the work function once it has been handed to the hypervisor.
++ */
++static int add_hotmem_entry(int pxm, uint64_t start,
++                      uint64_t length, uint32_t flags)
++{
++      struct xen_hotmem_entry *entry;
++      unsigned long irqflags;
++
++      if (pxm < 0 || !length)
++              return -EINVAL;
++
++      entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
++      if (!entry)
++              return -ENOMEM;
++
++      INIT_LIST_HEAD(&entry->hotmem_list);
++      entry->start = start;
++      entry->end = start + length;
++      entry->flags = flags;
++      entry->pxm = pxm;
++
++      /*
++       * The consumer takes this lock with interrupts disabled; take it
++       * the same way here so both sides of the queue agree on the
++       * locking context.
++       */
++      spin_lock_irqsave(&xen_hotmem_lock, irqflags);
++
++      list_add_tail(&entry->hotmem_list, &xen_hotmem.list);
++      xen_hotmem.entry_nr++;
++
++      spin_unlock_irqrestore(&xen_hotmem_lock, irqflags);
++
++      return 0;
++}
++
++/*
++ * Unlink @entry from whatever list it is on and free it.  Always returns 0.
++ * No locking is done here; the caller is responsible for it.
++ */
++static int free_hotmem_entry(struct xen_hotmem_entry *entry)
++{
++      list_del(&entry->hotmem_list);
++      kfree(entry);
++
++      return 0;
++}
++
++/*
++ * Work function: report every queued memory range to the hypervisor.
++ *
++ * The whole queue is detached under xen_hotmem_lock and then processed
++ * without the lock, so the hot-add hypercalls no longer run with the
++ * spinlock held and interrupts disabled.  Ranges queued while this runs
++ * stay on xen_hotmem.list for the next invocation.  A failing hypercall
++ * is logged and the entry is dropped regardless.
++ */
++static void xen_hotadd_mem_dpc(struct work_struct *work)
++{
++      struct xen_hotmem_entry *entry, *next;
++      unsigned long flags;
++      LIST_HEAD(pending);
++      int ret;
++
++      spin_lock_irqsave(&xen_hotmem_lock, flags);
++      list_splice_init(&xen_hotmem.list, &pending);
++      /* Everything that was counted has just been moved to 'pending'. */
++      xen_hotmem.entry_nr = 0;
++      spin_unlock_irqrestore(&xen_hotmem_lock, flags);
++
++      list_for_each_entry_safe(entry, next, &pending, hotmem_list) {
++              ret = xen_hyper_addmem(entry);
++              if (ret)
++                      pr_warn("xen addmem failed with %x\n", ret);
++              /* 'pending' is private to this call, no lock needed. */
++              free_hotmem_entry(entry);
++      }
++}
++
++static DECLARE_WORK(xen_hotadd_mem_work, xen_hotadd_mem_dpc);
++
++/*
++ * Find the proximity domain for ACPI object @h.
++ *
++ * Evaluates "_PXM" on @h and then on each successive parent until one
++ * evaluation succeeds.  Returns that value, or -1 when no parent can be
++ * obtained any more.
++ */
++static int xen_acpi_get_pxm(acpi_handle h)
++{
++      unsigned long long pxm;
++      acpi_handle node = h;
++      acpi_handle parent;
++
++      for (;;) {
++              if (ACPI_SUCCESS(acpi_evaluate_integer(node, "_PXM",
++                                                     NULL, &pxm)))
++                      return pxm;
++
++              if (!ACPI_SUCCESS(acpi_get_parent(node, &parent)))
++                      return -1;
++
++              node = parent;
++      }
++}
++
++/*
++ * Queue the resource ranges of @mem_device for hot-add and schedule the
++ * work item that reports them to the hypervisor.
++ *
++ * Returns 0 when at least one range is, or already was, enabled.  Returns
++ * -EINVAL when @mem_device is NULL, when no proximity domain is found, or
++ * when no range ends up enabled.
++ */
++static int xen_hotadd_memory(struct acpi_memory_device *mem_device)
++{
++      struct acpi_memory_info *range;
++      int enabled_cnt = 0;
++      int node;
++
++      if (!mem_device)
++              return -EINVAL;
++
++      node = xen_acpi_get_pxm(mem_device->device->handle);
++      if (node < 0)
++              return -EINVAL;
++
++      /*
++       * Ranges are only queued here; the hypervisor is notified later
++       * from the work item, because it starts using the memory as part
++       * of the hot-add hypercall.
++       */
++      list_for_each_entry(range, &mem_device->res_list, list) {
++              if (range->enabled) {
++                      /* Sanity check: already enabled, only count it. */
++                      enabled_cnt++;
++              } else if (range->length &&
++                         !add_hotmem_entry(node, range->start_addr,
++                                           range->length, 0)) {
++                      /* Empty ranges and failed queueing are skipped. */
++                      range->enabled = 1;
++                      enabled_cnt++;
++              }
++      }
++
++      if (enabled_cnt == 0)
++              return -EINVAL;
++
++      schedule_work(&xen_hotadd_mem_work);
++
++      return 0;
++}
++
++/* Returns 0 in the initial domain and -ENODEV in any other domain. */
++static int xen_hotadd_mem_init(void)
++{
++      return is_initial_xendomain() ? 0 : -ENODEV;
++}
++
++/*
++ * Flush the work scheduled via schedule_work() before going away.
++ * NOTE(review): this flushes everything scheduled that way, not only
++ * xen_hotadd_mem_work -- confirm that is intended.
++ */
++static void xen_hotadd_mem_exit(void)
++{
++      flush_scheduled_work();
++}
diff --cc drivers/xen/core/clockevents.c

index 0000000,0000000..7e538b2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/clockevents.c
@@@ -1,0 -1,0 +1,298 @@@
++/*
++ *    Xen clockevent functions
++ *
++ *    See arch/x86/xen/time.c for copyright and credits for derived
++ *    portions of this file.
++ *
++ * Xen clockevent implementation
++ *
++ * Xen has two clockevent implementations:
++ *
++ * The old timer_op one works with all released versions of Xen prior
++ * to version 3.0.4.  This version of the hypervisor provides a
++ * single-shot timer with nanosecond resolution.  However, sharing the
++ * same event channel is a 100Hz tick which is delivered while the
++ * vcpu is running.  We don't care about or use this tick, but it will
++ * cause the core time code to think the timer fired too soon, and
++ * will end up resetting it each time.  It could be filtered, but
++ * doing so has complications when the ktime clocksource is not yet
++ * the xen clocksource (ie, at boot time).
++ *
++ * The new vcpu_op-based timer interface allows the tick timer period
++ * to be changed or turned off.  The tick timer is not useful as a
++ * periodic timer because events are only delivered to running vcpus.
++ * The one-shot timer can report when a timeout is in the past, so
++ * set_next_event is capable of returning -ETIME when appropriate.
++ * This interface is used when available.
++ */
++#include <linux/clockchips.h>
++#include <linux/interrupt.h>
++#include <linux/kernel.h>
++#include <linux/kernel_stat.h>
++#include <linux/math64.h>
++#include <asm/hypervisor.h>
++#include <xen/clock.h>
++#include <xen/evtchn.h>
++#include <xen/interface/vcpu.h>
++
++#define XEN_SHIFT 22
++
++/* Xen may fire a timer up to this many ns early */
++#define TIMER_SLOP    100000
++#define NS_PER_TICK   (1000000000LL / HZ)
++
++/*
++ * Get a hypervisor absolute time.  In theory we could maintain an
++ * offset between the kernel's time and the hypervisor's time, and
++ * apply that to a kernel's absolute timeout.  Unfortunately the
++ * hypervisor and kernel times can drift even if the kernel is using
++ * the Xen clocksource, because ntp can warp the kernel's clocksource.
++ *
++ * @delta is added to the current xen_local_clock() reading and the sum
++ * is returned.
++ */
++static u64 get_abs_timeout(unsigned long delta)
++{
++      return xen_local_clock() + delta;
++}
++
++#if CONFIG_XEN_COMPAT <= 0x030004
++/*
++ * set_mode callback for the timer_op interface.
++ *
++ * Periodic mode is unsupported and only triggers a warning.  One-shot and
++ * resume need no action.  Unused and shutdown cancel the pending timeout
++ * and BUG() if the hypervisor refuses.  @evt is not used.
++ */
++static void timerop_set_mode(enum clock_event_mode mode,
++                           struct clock_event_device *evt)
++{
++      switch (mode) {
++      case CLOCK_EVT_MODE_PERIODIC:
++              WARN_ON(1); /* unsupported */
++              break;
++
++      case CLOCK_EVT_MODE_ONESHOT:
++      case CLOCK_EVT_MODE_RESUME:
++              break;
++
++      case CLOCK_EVT_MODE_UNUSED:
++      case CLOCK_EVT_MODE_SHUTDOWN:
++              if (HYPERVISOR_set_timer_op(0)) /* cancel timeout */
++                      BUG();
++              break;
++      }
++}
++
++/*
++ * set_next_event callback for the timer_op interface.
++ *
++ * Programs an absolute timeout @delta after the current hypervisor time.
++ * Warns if @evt is not in one-shot mode, BUG()s when the hypercall
++ * returns a negative value, and otherwise always returns 0.
++ */
++static int timerop_set_next_event(unsigned long delta,
++                                struct clock_event_device *evt)
++{
++      WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
++
++      if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
++              BUG();
++
++      /*
++       * We may have missed the deadline, but there's no real way of
++       * knowing for sure.  If the event was in the past, then we'll
++       * get an immediate interrupt.
++       */
++
++      return 0;
++}
++#endif
++
++/*
++ * set_mode callback for the vcpu_op interface.
++ *
++ * Periodic mode is unsupported and only triggers a warning.  Unused and
++ * shutdown stop the single-shot timer and then, via the fall-through,
++ * also the periodic timer.  One-shot stops only the periodic timer.
++ * Resume needs no action.  All hypercalls target the current CPU and any
++ * failure is a BUG().  @evt is not used.
++ */
++static void vcpuop_set_mode(enum clock_event_mode mode,
++                          struct clock_event_device *evt)
++{
++      switch (mode) {
++      case CLOCK_EVT_MODE_PERIODIC:
++              WARN_ON(1); /* unsupported */
++              break;
++
++      case CLOCK_EVT_MODE_UNUSED:
++      case CLOCK_EVT_MODE_SHUTDOWN:
++              if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer,
++                                     smp_processor_id(), NULL))
++                      BUG();
++              /* fall through */
++      case CLOCK_EVT_MODE_ONESHOT:
++              if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
++                                     smp_processor_id(), NULL))
++                      BUG();
++              break;
++
++      case CLOCK_EVT_MODE_RESUME:
++              break;
++      }
++}
++
++/*
++ * set_next_event callback for the vcpu_op interface.
++ *
++ * Arms the current CPU's single-shot timer for @delta after the current
++ * hypervisor time, with the VCPU_SSHOTTMR_future flag set.  Returns the
++ * hypercall result, which is either 0 or -ETIME; any other value is a
++ * BUG.  Warns if @evt is not in one-shot mode.
++ */
++static int vcpuop_set_next_event(unsigned long delta,
++                               struct clock_event_device *evt)
++{
++      struct vcpu_set_singleshot_timer single;
++      int ret;
++
++      WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
++
++      single.timeout_abs_ns = get_abs_timeout(delta);
++      single.flags = VCPU_SSHOTTMR_future;
++
++      ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer,
++                               smp_processor_id(), &single);
++
++      BUG_ON(ret != 0 && ret != -ETIME);
++
++      return ret;
++}
++
++/*
++ * Per-CPU one-shot clock event device template.  The set_mode and
++ * set_next_event callbacks are not set here; they are filled in by
++ * xen_clockevents_init() and local_setup_timer().
++ */
++static DEFINE_PER_CPU(struct clock_event_device, xen_clock_event) = {
++      .name           = "xen",
++      .features       = CLOCK_EVT_FEAT_ONESHOT,
++
++      .max_delta_ns   = 0xffffffff,
++      .min_delta_ns   = TIMER_SLOP,
++
++      /* the set_next_event callbacks use their delta directly as ns */
++      .mult           = 1,
++      .shift          = 0,
++      .rating         = 500,
++
++      /* no irq bound yet; replaced once VIRQ_TIMER is bound */
++      .irq            = -1,
++};
++
++/* snapshots of runstate info */
++static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
++
++/* unused ns of stolen and blocked time */
++static DEFINE_PER_CPU(unsigned int, xen_residual_stolen);
++static DEFINE_PER_CPU(unsigned int, xen_residual_blocked);
++
++/*
++ * Reset the stolen/blocked time accounting state of @cpu.
++ *
++ * The runstate snapshot is seeded from what setup_runstate_area() returns
++ * and, when @cpu is the executing CPU, refreshed through
++ * get_runstate_snapshot().  Both residual counters are cleared.
++ */
++static void init_missing_ticks_accounting(unsigned int cpu)
++{
++      per_cpu(xen_runstate_snapshot, cpu) = *setup_runstate_area(cpu);
++      if (cpu == smp_processor_id())
++              get_runstate_snapshot(&__get_cpu_var(xen_runstate_snapshot));
++      per_cpu(xen_residual_stolen, cpu) = 0;
++      per_cpu(xen_residual_blocked, cpu) = 0;
++}
++
++/*
++ * VIRQ_TIMER handler.
++ *
++ * Runs the clock event handler of the current CPU if one is installed,
++ * checks for a wallclock update and then accounts stolen and blocked
++ * time since the previous runstate snapshot.  Whole ticks are passed to
++ * account_steal_ticks() / account_idle_ticks(); the sub-tick remainder is
++ * kept in the per-CPU residual counters for the next interrupt.
++ *
++ * Returns IRQ_HANDLED when an event handler was called, IRQ_NONE
++ * otherwise.
++ */
++static irqreturn_t timer_interrupt(int irq, void *dev_id)
++{
++      struct clock_event_device *evt = &__get_cpu_var(xen_clock_event);
++      struct vcpu_runstate_info state, *snap;
++      s64 blocked, stolen;
++      irqreturn_t ret = IRQ_NONE;
++
++      if (evt->event_handler) {
++              evt->event_handler(evt);
++              ret = IRQ_HANDLED;
++      }
++
++      xen_check_wallclock_update();
++
++      get_runstate_snapshot(&state);
++      snap = &__get_cpu_var(xen_runstate_snapshot);
++
++      /* stolen = growth of runnable + offline time, plus old residual */
++      stolen = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]
++              + state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]
++              + percpu_read(xen_residual_stolen);
++
++      /* div_u64_rem() stores the remainder straight into the residual */
++      if (stolen >= NS_PER_TICK)
++              account_steal_ticks(div_u64_rem(stolen, NS_PER_TICK,
++                                  &__get_cpu_var(xen_residual_stolen)));
++      else
++              percpu_write(xen_residual_stolen, stolen > 0 ? stolen : 0);
++
++      /* blocked = growth of blocked time, plus old residual */
++      blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]
++              + percpu_read(xen_residual_blocked);
++
++      if (blocked >= NS_PER_TICK)
++              account_idle_ticks(div_u64_rem(blocked, NS_PER_TICK,
++                                 &__get_cpu_var(xen_residual_blocked)));
++      else
++              percpu_write(xen_residual_blocked, blocked > 0 ? blocked : 0);
++
++      /* the current state becomes the baseline for the next interrupt */
++      *snap = state;
++
++      return ret;
++}
++
++/* irqaction used for every VIRQ_TIMER binding made in this file. */
++static struct irqaction timer_action = {
++      .handler = timer_interrupt,
++      .flags   = IRQF_DISABLED|IRQF_TIMER,
++      .name    = "timer"
++};
++
++/*
++ * Register the clock event device of the executing CPU, after resetting
++ * that CPU's stolen/blocked time accounting.
++ */
++void __cpuinit xen_setup_cpu_clockevents(void)
++{
++      unsigned int cpu = smp_processor_id();
++      struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
++
++      init_missing_ticks_accounting(cpu);
++
++      evt->cpumask = cpumask_of(cpu);
++      clockevents_register_device(evt);
++}
++
++#ifdef CONFIG_SMP
++/*
++ * Prepare the clock event device of another CPU, @cpu.
++ *
++ * Binds VIRQ_TIMER for @cpu with the shared timer_action and copies the
++ * set_mode / set_next_event callbacks from the executing CPU's device.
++ * Must not be called for the executing CPU.  The irq obtained has to be
++ * the same one CPU 0 uses.  Returns 0 on success or the negative value
++ * from the bind call.
++ */
++int __cpuinit local_setup_timer(unsigned int cpu)
++{
++      struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
++
++      BUG_ON(cpu == smp_processor_id());
++
++      evt->irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
++      if (evt->irq < 0)
++              return evt->irq;
++      BUG_ON(per_cpu(xen_clock_event.irq, 0) != evt->irq);
++
++      evt->set_mode = percpu_read(xen_clock_event.set_mode);
++      evt->set_next_event = percpu_read(xen_clock_event.set_next_event);
++
++      return 0;
++}
++
++/* Undo the VIRQ_TIMER binding of @cpu; must not be called for CPU 0. */
++void __cpuinit local_teardown_timer(unsigned int cpu)
++{
++      struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
++
++      BUG_ON(cpu == 0);
++      unbind_from_per_cpu_irq(evt->irq, cpu, &timer_action);
++}
++#endif
++
++/*
++ * Resume hook: when the vcpu_op interface is in use, reset the tick
++ * accounting of every online CPU and stop its periodic timer again.
++ * Does nothing when another set_mode callback is installed.
++ */
++void xen_clockevents_resume(void)
++{
++      unsigned int cpu;
++
++      if (percpu_read(xen_clock_event.set_mode) == vcpuop_set_mode) {
++              for_each_online_cpu(cpu) {
++                      init_missing_ticks_accounting(cpu);
++                      if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
++                                             cpu, NULL) != 0)
++                              BUG();
++              }
++      }
++}
++
++/*
++ * Boot-time setup of the clock event device on the executing CPU.
++ *
++ * Probes the vcpu_op interface by trying to stop the periodic timer and
++ * selects the matching callbacks, then binds VIRQ_TIMER and registers
++ * the device.  Any unexpected probe result or bind failure is a BUG.
++ */
++void __init xen_clockevents_init(void)
++{
++      unsigned int cpu = smp_processor_id();
++      struct clock_event_device *evt = &__get_cpu_var(xen_clock_event);
++
++      switch (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
++                                 cpu, NULL)) {
++      case 0:
++              /*
++               * Successfully turned off 100Hz tick, so we have the
++               * vcpuop-based timer interface
++               */
++              evt->set_mode = vcpuop_set_mode;
++              evt->set_next_event = vcpuop_set_next_event;
++              break;
++#if CONFIG_XEN_COMPAT <= 0x030004
++      case -ENOSYS:
++              /* hypervisor lacks the vcpu_op timer: use timer_op */
++              printk(KERN_DEBUG "Xen: using timerop interface\n");
++              evt->set_mode = timerop_set_mode;
++              evt->set_next_event = timerop_set_next_event;
++              break;
++#endif
++      default:
++              BUG();
++      }
++
++      evt->irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
++      BUG_ON(evt->irq < 0);
++
++      xen_setup_cpu_clockevents();
++}
diff --cc drivers/xen/core/cpu_hotplug.c

index 0000000,0000000..53c83e5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/cpu_hotplug.c
@@@ -1,0 -1,0 +1,182 @@@
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/kobject.h>
++#include <linux/notifier.h>
++#include <linux/cpu.h>
++#include <xen/cpu_hotplug.h>
++#include <xen/xenbus.h>
++
++/*
++ * Set of CPUs that remote admin software will allow us to bring online.
++ * Notified to us via xenbus.
++ */
++static cpumask_var_t xenbus_allowed_cpumask;
++
++/* Set of CPUs that local admin will allow us to bring online. */
++static cpumask_var_t local_allowed_cpumask;
++
++/*
++ * Tell whether the current hotplug request comes from the local admin.
++ * A request is assumed to be local when it is made via a userspace
++ * process, i.e. when the current task has a real mm_struct.
++ */
++static int local_cpu_hotplug_request(void)
++{
++      return current->mm ? 1 : 0;
++}
++
++/*
++ * Bring @cpu up or down according to its xenstore "availability" node.
++ *
++ * @cpu: CPU number; out-of-range or impossible CPUs are ignored
++ * @dev: sysdev of the CPU, or NULL when no uevent should be sent
++ *
++ * "online" marks the CPU as allowed by the remote admin and tries to
++ * bring it up; "offline" does the reverse.  A uevent is emitted on @dev
++ * only when the transition succeeded.  Read failures and unknown states
++ * are logged and otherwise ignored.
++ */
++static void __cpuinit vcpu_hotplug(unsigned int cpu, struct sys_device *dev)
++{
++      int err;
++      char dir[32], state[32];
++
++      if ((cpu >= NR_CPUS) || !cpu_possible(cpu))
++              return;
++
++      snprintf(dir, sizeof(dir), "cpu/%u", cpu);
++      /*
++       * The field width keeps an over-long xenstore value from running
++       * past state[] (31 characters plus the terminating NUL).
++       */
++      err = xenbus_scanf(XBT_NIL, dir, "availability", "%31s", state);
++      if (err != 1) {
++              pr_err("XENBUS: Unable to read cpu state\n");
++              return;
++      }
++
++      if (strcmp(state, "online") == 0) {
++              cpumask_set_cpu(cpu, xenbus_allowed_cpumask);
++              if (!cpu_up(cpu) && dev)
++                      kobject_uevent(&dev->kobj, KOBJ_ONLINE);
++      } else if (strcmp(state, "offline") == 0) {
++              cpumask_clear_cpu(cpu, xenbus_allowed_cpumask);
++              if (!cpu_down(cpu) && dev)
++                      kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
++      } else {
++              /* cpu is unsigned, so print it with %u */
++              pr_err("XENBUS: unknown state(%s) on CPU%u\n",
++                     state, cpu);
++      }
++}
++
++/*
++ * xenbus watch callback for the "cpu" node.
++ *
++ * Extracts the CPU number from the changed path ("...cpu/<n>...") and
++ * re-evaluates that CPU's availability.  Paths without "cpu/" followed
++ * by a number are ignored, so no uninitialised CPU number is ever used.
++ * @watch and @len are not used.
++ */
++static void __cpuinit handle_vcpu_hotplug_event(
++      struct xenbus_watch *watch, const char **vec, unsigned int len)
++{
++      unsigned int cpu;
++      char *cpustr;
++      const char *node = vec[XS_WATCH_PATH];
++
++      cpustr = strstr(node, "cpu/");
++      if (cpustr == NULL)
++              return;
++
++      /* Act only when a CPU number was actually parsed. */
++      if (sscanf(cpustr, "cpu/%u", &cpu) != 1)
++              return;
++
++      vcpu_hotplug(cpu, get_cpu_sysdev(cpu));
++}
++
++/*
++ * CPU notifier: when a CPU is about to go down on a local admin's
++ * request, remove it from local_allowed_cpumask.  Always returns
++ * NOTIFY_OK.  @hcpu carries the CPU number; @notifier is not used.
++ */
++static int smpboot_cpu_notify(struct notifier_block *notifier,
++                            unsigned long action, void *hcpu)
++{
++      unsigned int cpu = (long)hcpu;
++
++      /*
++       * We do this in a callback notifier rather than __cpu_disable()
++       * because local_cpu_hotplug_request() does not work in the latter
++       * as it's always executed from within a stopmachine kthread.
++       */
++      if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request())
++              cpumask_clear_cpu(cpu, local_allowed_cpumask);
++
++      return NOTIFY_OK;
++}
++
++/*
++ * xenstore notifier: install the watch on the "cpu" node.
++ *
++ * Outside the initial domain it additionally evaluates the availability
++ * of every possible CPU once and logs how many CPUs are online.  Always
++ * returns NOTIFY_DONE; none of the parameters are used.
++ */
++static int __cpuinit setup_cpu_watcher(struct notifier_block *notifier,
++                                     unsigned long event, void *data)
++{
++      unsigned int i;
++
++      static struct xenbus_watch __cpuinitdata cpu_watch = {
++              .node = "cpu",
++              .callback = handle_vcpu_hotplug_event,
++              .flags = XBWF_new_thread };
++      /* the registration result is deliberately ignored */
++      (void)register_xenbus_watch(&cpu_watch);
++
++      if (!is_initial_xendomain()) {
++              for_each_possible_cpu(i)
++                      vcpu_hotplug(i, get_cpu_sysdev(i));
++              pr_info("Brought up %ld CPUs\n", (long)num_online_cpus());
++      }
++
++      return NOTIFY_DONE;
++}
++
++/*
++ * Initcall: register the CPU notifier and the xenstore notifier that
++ * drive vcpu hotplug.  Returns -ENODEV when not running on Xen and 0
++ * otherwise.
++ */
++static int __init setup_vcpu_hotplug_event(void)
++{
++      static struct notifier_block hotplug_cpu = {
++              .notifier_call = smpboot_cpu_notify };
++      static struct notifier_block __cpuinitdata xsn_cpu = {
++              .notifier_call = setup_cpu_watcher };
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      register_cpu_notifier(&hotplug_cpu);
++      register_xenstore_notifier(&xsn_cpu);
++
++      return 0;
++}
++
++arch_initcall(setup_vcpu_hotplug_event);
++
++/*
++ * Take every online CPU except CPU 0 down.
++ *
++ * On the first failure the error is logged, the availability of every
++ * possible CPU is re-evaluated through vcpu_hotplug() and the error is
++ * returned.  Returns 0 when all CPUs went down.
++ */
++int __ref smp_suspend(void)
++{
++      unsigned int cpu;
++      int err;
++
++      for_each_online_cpu(cpu) {
++              if (cpu == 0)
++                      continue;
++              err = cpu_down(cpu);
++              if (err) {
++                      pr_crit("Failed to take all CPUs down: %d\n", err);
++                      /* reusing 'cpu' is fine: we return right after */
++                      for_each_possible_cpu(cpu)
++                              vcpu_hotplug(cpu, NULL);
++                      return err;
++              }
++      }
++
++      return 0;
++}
++
++/*
++ * Re-evaluate the xenstore availability of every possible CPU other
++ * than CPU 0, without sending uevents.
++ */
++void __ref smp_resume(void)
++{
++      unsigned int cpu;
++
++      for_each_possible_cpu(cpu) {
++              if (cpu != 0)
++                      vcpu_hotplug(cpu, NULL);
++      }
++}
++
++/*
++ * Decide whether @cpu may be brought online.
++ *
++ * A local admin request always marks @cpu as locally allowed and is then
++ * refused, with a warning, only if the remote admin disallows the CPU.
++ * Any other request needs the CPU to be allowed by both the local and
++ * the remote admin.  Returns 0 when allowed and -EBUSY otherwise.
++ */
++int cpu_up_check(unsigned int cpu)
++{
++      if (!local_cpu_hotplug_request()) {
++              if (cpumask_test_cpu(cpu, local_allowed_cpumask) &&
++                  cpumask_test_cpu(cpu, xenbus_allowed_cpumask))
++                      return 0;
++              return -EBUSY;
++      }
++
++      cpumask_set_cpu(cpu, local_allowed_cpumask);
++      if (cpumask_test_cpu(cpu, xenbus_allowed_cpumask))
++              return 0;
++
++      pr_warning("%s: attempt to bring up CPU %u disallowed "
++                 "by remote admin.\n", __FUNCTION__, cpu);
++      return -EBUSY;
++}
++
++/*
++ * Allocate both "allowed" CPU masks.  The xenbus mask starts out as a
++ * copy of cpu_present_mask, the local mask with every bit set.  An
++ * allocation failure is a BUG.
++ */
++void __init init_xenbus_allowed_cpumask(void)
++{
++      if (!alloc_cpumask_var(&xenbus_allowed_cpumask, GFP_KERNEL))
++              BUG();
++      cpumask_copy(xenbus_allowed_cpumask, cpu_present_mask);
++      if (!alloc_cpumask_var(&local_allowed_cpumask, GFP_KERNEL))
++              BUG();
++      cpumask_setall(local_allowed_cpumask);
++}
diff --cc drivers/xen/core/domctl.c

index 0000000,0000000..32770e3

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/domctl.c
@@@ -1,0 -1,0 +1,543 @@@
++/*
++ * !!!  dirty hack alert  !!!
++ *
++ * Problem: old guest kernels don't have a "protocol" node
++ *          in the frontend xenstore directory, so mixing
++ *          32 and 64bit domains doesn't work.
++ *
++ * Upstream plans to solve this in the tools, by letting them
++ * create a protocol node.  Which certainly makes sense.
++ * But it isn't trivial and isn't done yet.  Too bad.
++ *
++ * So for the time being we use the get_address_size domctl
++ * hypercall for a pretty good guess.  Not nice as the domctl
++ * hypercall isn't supposed to be used by the kernel.  Because
++ * we don't want to have dependencies between dom0 kernel and
++ * xen kernel versions.  Now we have one.  Ouch.
++ */
++#undef __XEN_PUBLIC_XEN_H__
++#undef __XEN_PUBLIC_GRANT_TABLE_H__
++#undef __XEN_TOOLS__
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/percpu.h>
++#include <asm/hypervisor.h>
++#include <xen/blkif.h>
++
++#include "domctl.h"
++
++/* stuff copied from xen/interface/domctl.h, which we can't
++ * include directly for the reasons outlined above .... */
++
++typedef struct xen_domctl_address_size { /* payload of XEN_DOMCTL_get_address_size */
++      uint32_t size; /* xen_guest_address_size() accepts only 32 or 64 here */
++} xen_domctl_address_size_t;
++
++typedef __attribute__((aligned(8))) uint64_t uint64_aligned_t; /* uint64_t with forced 8-byte alignment */
++
++struct xenctl_cpumap_v4 { /* CPU bitmap descriptor embedded in the v4 domctl layout */
++      XEN_GUEST_HANDLE(uint8) bitmap; /* handle to the bitmap; no alignment padding in v4 */
++      uint32_t nr_cpus; /* vcpuaffinity() stores the caller's bit count here */
++};
++
++struct xenctl_cpumap_v5 { /* CPU bitmap descriptor embedded in the v5..v7 domctl layout */
++      union {
++              XEN_GUEST_HANDLE(uint8) bitmap; /* handle to the bitmap */
++              uint64_aligned_t _align; /* gives the handle slot 8-byte alignment and at least 8 bytes */
++      };
++      uint32_t nr_cpus; /* vcpuaffinity() stores the caller's bit count here */
++};
++
++struct xen_domctl_vcpuaffinity_v4 { /* get/set vcpuaffinity payload, v4 layout */
++    uint32_t vcpu; /* vcpuaffinity() fills this with smp_processor_id() */
++    struct xenctl_cpumap_v4 cpumap; /* affinity bitmap descriptor */
++};
++
++struct xen_domctl_vcpuaffinity_v5 { /* get/set vcpuaffinity payload, v5..v7 layout */
++    uint32_t vcpu; /* vcpuaffinity() fills this with smp_processor_id() */
++    struct xenctl_cpumap_v5 cpumap; /* affinity bitmap descriptor (8-byte aligned handle) */
++};
++
++union xen_domctl { /* one buffer overlaying every domctl layout this file tries */
++      /* v4: sle10 sp1: xen 3.0.4 + 32-on-64 patches */
++      struct {
++              uint32_t cmd;
++              uint32_t interface_version;
++              domid_t  domain;
++              union {
++                      /* left out lots of other struct xen_domctl_foobar */
++                      struct xen_domctl_address_size       address_size;
++                      struct xen_domctl_vcpuaffinity_v4    vcpu_affinity;
++                      uint64_t                             dummy_align; /* plain uint64_t alignment in v4 */
++                      uint8_t                              dummy_pad[128]; /* sizes the payload union to at least 128 bytes */
++              };
++      } v4;
++
++      /*
++       * v5: upstream: xen 3.1
++       * v6: upstream: xen 4.0
++       * v7: sle11 sp1: xen 4.0 + cpupools patches
++       *
++       * All three share this one layout; unlike v4 the payload union is
++       * forced to 8-byte alignment through uint64_aligned_t.
++       */
++      struct {
++              uint32_t cmd;
++              uint32_t interface_version;
++              domid_t  domain;
++              union {
++                      struct xen_domctl_address_size       address_size;
++                      struct xen_domctl_vcpuaffinity_v5    vcpu_affinity;
++                      uint64_aligned_t                     dummy_align;
++                      uint8_t                              dummy_pad[128]; /* sizes the payload union to at least 128 bytes */
++              };
++      } v5, v6, v7;
++};
++
++struct xen_sysctl_physinfo_v6 { /* XEN_SYSCTL_physinfo payload, sysctl interface v6 */
++      uint32_t threads_per_core;
++      uint32_t cores_per_socket;
++      uint32_t nr_cpus;
++      uint32_t nr_nodes; /* the v7 layout has max_node_id in this slot */
++      uint32_t cpu_khz;
++      uint64_aligned_t total_pages;
++      uint64_aligned_t free_pages;
++      uint64_aligned_t scrub_pages;
++      uint32_t hw_cap[8];
++      uint32_t max_cpu_id; /* pm_op_cputopo() writes the queried cpu here and reads the value back */
++      union {
++              XEN_GUEST_HANDLE(uint32) cpu_to_node; /* pm_op_cputopo() points this at its node array */
++              uint64_aligned_t _ctn_align; /* gives the handle slot 8-byte alignment */
++      };
++      uint32_t capabilities;
++};
++
++struct xen_sysctl_physinfo_v7 { /* XEN_SYSCTL_physinfo payload, sysctl interface v7 */
++      uint32_t threads_per_core;
++      uint32_t cores_per_socket;
++      uint32_t nr_cpus;
++      uint32_t max_node_id; /* the v6 layout has nr_nodes in this slot */
++      uint32_t cpu_khz;
++      uint64_aligned_t total_pages;
++      uint64_aligned_t free_pages;
++      uint64_aligned_t scrub_pages;
++      uint32_t hw_cap[8];
++      uint32_t max_cpu_id; /* pm_op_cputopo() writes the queried cpu here and reads the value back */
++      union {
++              XEN_GUEST_HANDLE(uint32) cpu_to_node; /* pm_op_cputopo() points this at its node array */
++              uint64_aligned_t _ctn_align; /* gives the handle slot 8-byte alignment */
++      };
++      uint32_t capabilities;
++};
++
++#define XEN_SYSCTL_pm_op_get_cputopo 0x20 /* pm_op sub-command used by pm_op_cputopo() */
++struct xen_get_cputopo_v6 { /* payload of the get_cputopo pm_op sub-command */
++      uint32_t max_cpus; /* pm_op_cputopo() passes cpu + 1, the length of its arrays */
++      union {
++              XEN_GUEST_HANDLE(uint32) cpu_to_core; /* handle to the per-CPU core-ID array */
++              uint64_aligned_t _ctc_align; /* gives the handle slot 8-byte alignment */
++      };
++      union {
++              XEN_GUEST_HANDLE(uint32) cpu_to_socket; /* handle to the per-CPU socket-ID array */
++              uint64_aligned_t _cts_align; /* gives the handle slot 8-byte alignment */
++      };
++      uint32_t nr_cpus; /* not read by the code in this file */
++};
++
++struct xen_sysctl_pm_op_v6 { /* XEN_SYSCTL_pm_op payload, sysctl interface v6 */
++      uint32_t cmd; /* pm_op sub-command; only XEN_SYSCTL_pm_op_get_cputopo is used here */
++      uint32_t cpuid; /* pm_op_cputopo() always passes 0 */
++      union {
++              struct xen_get_cputopo_v6 get_topo;
++      };
++};
++#define xen_sysctl_pm_op_v7 xen_sysctl_pm_op_v6 /* v7 reuses the v6 pm_op layout unchanged */
++
++struct xen_sysctl_topologyinfo_v8 { /* XEN_SYSCTL_topologyinfo payload, sysctl interface v8 */
++      uint32_t max_cpu_index; /* topologyinfo() writes the queried cpu here and reads the value back */
++      union {
++              XEN_GUEST_HANDLE(uint32) cpu_to_core; /* handle to the per-CPU core-ID array */
++              uint64_aligned_t _ctc_align; /* gives the handle slot 8-byte alignment */
++      };
++      union {
++              XEN_GUEST_HANDLE(uint32) cpu_to_socket; /* handle to the per-CPU socket-ID array */
++              uint64_aligned_t _cts_align; /* gives the handle slot 8-byte alignment */
++      };
++      union {
++              XEN_GUEST_HANDLE(uint32) cpu_to_node; /* handle to the per-CPU node-ID array */
++              uint64_aligned_t _ctn_align; /* gives the handle slot 8-byte alignment */
++      };
++};
++
++union xen_sysctl { /* one buffer overlaying every sysctl layout this file tries */
++      /* v6: Xen 3.4.x (physinfo carries nr_nodes) */
++      struct {
++              uint32_t cmd;
++              uint32_t interface_version;
++              union {
++                      struct xen_sysctl_physinfo_v6 physinfo;
++                      struct xen_sysctl_pm_op_v6 pm_op;
++              };
++      } v6;
++      /* v7: Xen 4.0.x (physinfo carries max_node_id; pm_op is the v6 layout) */
++      struct {
++              uint32_t cmd;
++              uint32_t interface_version;
++              union {
++                      struct xen_sysctl_physinfo_v7 physinfo;
++                      struct xen_sysctl_pm_op_v7 pm_op;
++              };
++      } v7;
++      /* v8: Xen 4.1+ (only the topologyinfo payload is declared here) */
++      struct {
++              uint32_t cmd;
++              uint32_t interface_version;
++              union {
++                      struct xen_sysctl_topologyinfo_v8 topologyinfo;
++              };
++      } v8;
++};
++
++/* The actual code comes here */
++
++/*
++ * Issue the domctl hypercall with the caller-prepared request buffer at
++ * @domctl and hand back the hypercall's return value.
++ */
++static inline int hypervisor_domctl(void *domctl)
++{
++      int rc = _hypercall1(int, domctl, domctl);
++
++      return rc;
++}
++
++/*
++ * Issue the sysctl hypercall with the caller-prepared request buffer at
++ * @sysctl and hand back the hypercall's return value.
++ */
++static inline int hypervisor_sysctl(void *sysctl)
++{
++      int rc = _hypercall1(int, sysctl, sysctl);
++
++      return rc;
++}
++
++int xen_guest_address_size(int domid)
++{
++      union xen_domctl domctl;
++      int low, ret;
++
++#define guest_address_size(ver) do {                                  \
++      memset(&domctl, 0, sizeof(domctl));                             \
++      domctl.v##ver.cmd = XEN_DOMCTL_get_address_size;                \
++      domctl.v##ver.interface_version = low = ver;                    \
++      domctl.v##ver.domain = domid;                                   \
++      ret = hypervisor_domctl(&domctl) ?: domctl.v##ver.address_size.size; \
++      if (ret == 32 || ret == 64) {                                   \
++              pr_info("v" #ver " domctl worked ok: dom%d is %d-bit\n",\
++                      domid, ret);                                    \
++              return ret;                                             \
++      }                                                               \
++} while (0)
++
++      BUILD_BUG_ON(XEN_DOMCTL_INTERFACE_VERSION > 7);
++      guest_address_size(7);
++#if CONFIG_XEN_COMPAT < 0x040100
++      guest_address_size(6);
++#endif
++#if CONFIG_XEN_COMPAT < 0x040000
++      guest_address_size(5);
++#endif
++#if CONFIG_XEN_COMPAT < 0x030100
++      guest_address_size(4);
++#endif
++
++      ret = BITS_PER_LONG;
++      pr_warn("v%d...%d domctls failed, assuming dom%d is native: %d\n",
++              low, XEN_DOMCTL_INTERFACE_VERSION, domid, ret);
++
++      return ret;
++}
++EXPORT_SYMBOL_GPL(xen_guest_address_size);
++
++/*
++ * Pick the blkif protocol matching the address size of domain @domid:
++ * BLKIF_PROTOCOL_NATIVE when it equals BITS_PER_LONG (or is neither 32
++ * nor 64), otherwise BLKIF_PROTOCOL_X86_32 or BLKIF_PROTOCOL_X86_64.
++ */
++int xen_guest_blkif_protocol(int domid)
++{
++      int bits = xen_guest_address_size(domid);
++
++      if (bits != BITS_PER_LONG) {
++              switch (bits) {
++              case 32:
++                      return BLKIF_PROTOCOL_X86_32;
++              case 64:
++                      return BLKIF_PROTOCOL_X86_64;
++              }
++      }
++
++      return BLKIF_PROTOCOL_NATIVE;
++}
++EXPORT_SYMBOL_GPL(xen_guest_blkif_protocol);
++
++#ifdef CONFIG_X86
++
++#define vcpuaffinity(what, ver) ({                                    \
++      memset(&domctl, 0, sizeof(domctl));                             \
++      domctl.v##ver.cmd = XEN_DOMCTL_##what##vcpuaffinity;            \
++      domctl.v##ver.interface_version = ver;                          \
++      /* domctl.v##ver.domain = 0; -- already 0 from the memset(). The macro uses the caller's locals 'domctl', 'nr' and 'mask' and evaluates to the hypercall result. */ \
++      domctl.v##ver.vcpu_affinity.vcpu = smp_processor_id();          \
++      domctl.v##ver.vcpu_affinity.cpumap.nr_cpus = nr;                \
++      set_xen_guest_handle(domctl.v##ver.vcpu_affinity.cpumap.bitmap, \
++                           mask);                                     \
++      hypervisor_domctl(&domctl);                                     \
++})
++
++/*
++ * Fetch the affinity bitmap (@nr bits, stored at @mask) of domain 0's
++ * vCPU number smp_processor_id().  Interface version 7 is tried first;
++ * as far as CONFIG_XEN_COMPAT permits, versions 6, 5 and 4 follow until
++ * one succeeds.  Returns 0 on success, else the last hypercall's result.
++ */
++static inline int get_vcpuaffinity(unsigned int nr, void *mask)
++{
++      union xen_domctl domctl;
++      int rc;
++
++      BUILD_BUG_ON(XEN_DOMCTL_INTERFACE_VERSION > 7);
++
++      rc = vcpuaffinity(get, 7);
++#if CONFIG_XEN_COMPAT < 0x040100
++      if (!rc)
++              return 0;
++      rc = vcpuaffinity(get, 6);
++#endif
++#if CONFIG_XEN_COMPAT < 0x040000
++      if (!rc)
++              return 0;
++      rc = vcpuaffinity(get, 5);
++#endif
++#if CONFIG_XEN_COMPAT < 0x030100
++      if (!rc)
++              return 0;
++      rc = vcpuaffinity(get, 4);
++#endif
++
++      return rc;
++}
++
++/*
++ * Install the affinity bitmap (@nr bits, read from @mask) for domain 0's
++ * vCPU number smp_processor_id().  Interface version 7 is tried first;
++ * as far as CONFIG_XEN_COMPAT permits, versions 6, 5 and 4 follow until
++ * one succeeds.  Returns 0 on success, else the last hypercall's result.
++ */
++static inline int set_vcpuaffinity(unsigned int nr, void *mask)
++{
++      union xen_domctl domctl;
++      int rc;
++
++      BUILD_BUG_ON(XEN_DOMCTL_INTERFACE_VERSION > 7);
++
++      rc = vcpuaffinity(set, 7);
++#if CONFIG_XEN_COMPAT < 0x040100
++      if (!rc)
++              return 0;
++      rc = vcpuaffinity(set, 6);
++#endif
++#if CONFIG_XEN_COMPAT < 0x040000
++      if (!rc)
++              return 0;
++      rc = vcpuaffinity(set, 5);
++#endif
++#if CONFIG_XEN_COMPAT < 0x030100
++      if (!rc)
++              return 0;
++      rc = vcpuaffinity(set, 4);
++#endif
++
++      return rc;
++}
++
++static DEFINE_PER_CPU(void *, saved_pcpu_affinity); /* page holding the affinity saved by xen_set_physical_cpu_affinity(); NULL when nothing is saved */
++
++#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_LONG / sizeof(long)) /* number of bits in one page */
++
++/*
++ * Temporarily restrict the current vCPU to physical CPU @pcpu, or undo
++ * such a restriction.
++ *
++ * @pcpu >= 0: save the present affinity bitmap in a freshly allocated page
++ *             (remembered in the per-CPU saved_pcpu_affinity) and install
++ *             a bitmap in which only bit @pcpu is set.
++ * @pcpu <  0: reinstall the saved bitmap, free it and clear the per-CPU
++ *             pointer; does nothing when no bitmap is saved.
++ *
++ * Returns 0 on success, -EPERM outside the initial domain, -ERANGE when
++ * @pcpu exceeds BITS_PER_PAGE, -EBUSY when a bitmap is already saved,
++ * -ENOMEM on allocation failure, or the error of the affinity hypercall.
++ */
++int xen_set_physical_cpu_affinity(int pcpu)
++{
++      void *saved, *target;
++      int rc;
++
++      if (!is_initial_xendomain())
++              return -EPERM;
++
++      if (pcpu < 0) {
++              /* Restore path. */
++              if (!percpu_read(saved_pcpu_affinity))
++                      return 0;
++              rc = set_vcpuaffinity(BITS_PER_PAGE,
++                                    percpu_read(saved_pcpu_affinity));
++              /* The saved page is dropped even if the hypercall failed. */
++              free_page((unsigned long)percpu_read(saved_pcpu_affinity));
++              percpu_write(saved_pcpu_affinity, NULL);
++              return rc;
++      }
++
++      if (pcpu > BITS_PER_PAGE)
++              return -ERANGE;
++
++      /* Only one saved bitmap per CPU: nested use is refused. */
++      if (percpu_read(saved_pcpu_affinity))
++              return -EBUSY;
++
++      saved = (void *)get_zeroed_page(GFP_KERNEL);
++      if (!saved)
++              return -ENOMEM;
++
++      rc = get_vcpuaffinity(BITS_PER_PAGE, saved);
++      if (rc)
++              goto discard;
++
++      target = kzalloc(BITS_TO_LONGS(pcpu + 1) * sizeof(long), GFP_KERNEL);
++      if (!target) {
++              rc = -ENOMEM;
++              goto discard;
++      }
++
++      __set_bit(pcpu, target);
++      rc = set_vcpuaffinity(pcpu + 1, target);
++      kfree(target);
++      if (rc)
++              goto discard;
++
++      percpu_write(saved_pcpu_affinity, saved);
++      return 0;
++
++discard:
++      free_page((unsigned long)saved);
++      return rc;
++}
++EXPORT_SYMBOL_GPL(xen_set_physical_cpu_affinity);
++
++int xen_get_topology_info(unsigned int cpu, u32 *core, u32 *sock, u32 *node)
++{
++      union xen_sysctl sysctl;
++      uint32_t *cores = NULL, *socks = NULL, *nodes = NULL;
++      unsigned int nr;
++      int rc;
++
++      if (core)
++              cores = kmalloc((cpu + 1) * sizeof(*cores), GFP_KERNEL);
++      if (sock)
++              socks = kmalloc((cpu + 1) * sizeof(*socks), GFP_KERNEL);
++      if (node)
++              nodes = kmalloc((cpu + 1) * sizeof(*nodes), GFP_KERNEL);
++      if ((core && !cores) || (sock && !socks) || (node && !nodes)) {
++              kfree(cores);
++              kfree(socks);
++              kfree(nodes);
++              return -ENOMEM;
++      }
++
++#define topologyinfo(ver) do {                                                \
++      memset(&sysctl, 0, sizeof(sysctl));                             \
++      sysctl.v##ver.cmd = XEN_SYSCTL_topologyinfo;                    \
++      sysctl.v##ver.interface_version = ver;                          \
++      sysctl.v##ver.topologyinfo.max_cpu_index = cpu;                 \
++      set_xen_guest_handle(sysctl.v##ver.topologyinfo.cpu_to_core,    \
++                           cores);                                    \
++      set_xen_guest_handle(sysctl.v##ver.topologyinfo.cpu_to_socket,  \
++                           socks);                                    \
++      set_xen_guest_handle(sysctl.v##ver.topologyinfo.cpu_to_node,    \
++                           nodes);                                    \
++      rc = hypervisor_sysctl(&sysctl);                                \
++      nr = sysctl.v##ver.topologyinfo.max_cpu_index + 1;              \
++} while (0)
++
++      BUILD_BUG_ON(XEN_SYSCTL_INTERFACE_VERSION > 8);
++      topologyinfo(8);
++
++#if CONFIG_XEN_COMPAT < 0x040100
++#define pm_op_cputopo(ver) do {                                               \
++      memset(&sysctl, 0, sizeof(sysctl));                             \
++      sysctl.v##ver.cmd = XEN_SYSCTL_pm_op;                           \
++      sysctl.v##ver.interface_version = ver;                          \
++      sysctl.v##ver.pm_op.cmd = XEN_SYSCTL_pm_op_get_cputopo;         \
++      sysctl.v##ver.pm_op.cpuid = 0;                                  \
++      sysctl.v##ver.pm_op.get_topo.max_cpus = cpu + 1;                \
++      set_xen_guest_handle(sysctl.v##ver.pm_op.get_topo.cpu_to_core,  \
++                           cores);                                    \
++      set_xen_guest_handle(sysctl.v##ver.pm_op.get_topo.cpu_to_socket,\
++                           socks);                                    \
++      rc = hypervisor_sysctl(&sysctl);                                \
++      memset(&sysctl, 0, sizeof(sysctl));                             \
++      sysctl.v##ver.cmd = XEN_SYSCTL_physinfo;                        \
++      sysctl.v##ver.interface_version = ver;                          \
++      sysctl.v##ver.physinfo.max_cpu_id = cpu;                        \
++      set_xen_guest_handle(sysctl.v##ver.physinfo.cpu_to_node, nodes);\
++      rc = hypervisor_sysctl(&sysctl) ?: rc;                          \
++      nr = sysctl.v##ver.physinfo.max_cpu_id + 1;                     \
++} while (0)
++
++      if (rc)
++              pm_op_cputopo(7);
++#endif
++#if CONFIG_XEN_COMPAT < 0x040000
++      if (rc)
++              pm_op_cputopo(6);
++#endif
++
++      if (!rc && cpu >= nr)
++              rc = -EDOM;
++
++      if (!rc && core && (*core = cores[cpu]) == INVALID_TOPOLOGY_ID)
++              rc = -ENOENT;
++      kfree(cores);
++
++      if (!rc && sock && (*sock = socks[cpu]) == INVALID_TOPOLOGY_ID)
++              rc = -ENOENT;
++      kfree(socks);
++
++      if (!rc && node && (*node = nodes[cpu]) == INVALID_TOPOLOGY_ID)
++              rc = -ENOENT;
++      kfree(nodes);
++
++      return rc;
++}
++EXPORT_SYMBOL_GPL(xen_get_topology_info);
++
++#include <xen/pcpu.h>
++#include <asm/msr.h>
++
++/*
++ * Read MSR @msr_no on physical CPU @pcpu into *@l / *@h.
++ *
++ * If the affinity switch succeeds, the result of rdmsr_safe() is
++ * returned.  If it fails with -EINVAL, rdmsr_safe_on_cpu() is used
++ * instead and its error, or 1 when it succeeded, is returned.  Any other
++ * affinity error is returned unchanged.
++ */
++int rdmsr_safe_on_pcpu(unsigned int pcpu, u32 msr_no, u32 *l, u32 *h)
++{
++      int rc = xen_set_physical_cpu_affinity(pcpu);
++
++      if (rc == 0) {
++              rc = rdmsr_safe(msr_no, l, h);
++              WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
++      } else if (rc == -EINVAL) {
++              /* Possibly caused by dom0_vcpus_pinned: use the vCPU path. */
++              rc = rdmsr_safe_on_cpu(pcpu, msr_no, l, h);
++              if (!rc)
++                      rc = 1;
++      }
++
++      return rc;
++}
++EXPORT_SYMBOL_GPL(rdmsr_safe_on_pcpu);
++
++/*
++ * Write @l / @h to MSR @msr_no on physical CPU @pcpu.
++ *
++ * If the affinity switch succeeds, the result of wrmsr_safe() is
++ * returned.  If it fails with -EINVAL, wrmsr_safe_on_cpu() is used
++ * instead and its error, or 1 when it succeeded, is returned.  Any other
++ * affinity error is returned unchanged.
++ */
++int wrmsr_safe_on_pcpu(unsigned int pcpu, u32 msr_no, u32 l, u32 h)
++{
++      int rc = xen_set_physical_cpu_affinity(pcpu);
++
++      if (rc == 0) {
++              rc = wrmsr_safe(msr_no, l, h);
++              WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
++      } else if (rc == -EINVAL) {
++              /* Possibly caused by dom0_vcpus_pinned: use the vCPU path. */
++              rc = wrmsr_safe_on_cpu(pcpu, msr_no, l, h);
++              if (!rc)
++                      rc = 1;
++      }
++
++      return rc;
++}
++EXPORT_SYMBOL_GPL(wrmsr_safe_on_pcpu);
++
++/*
++ * Run rdmsr_safe_regs() with register block @regs on physical CPU @pcpu.
++ *
++ * If the affinity switch succeeds, the result of rdmsr_safe_regs() is
++ * returned.  If it fails with -EINVAL, rdmsr_safe_regs_on_cpu() is used
++ * instead and its error, or 1 when it succeeded, is returned.  Any other
++ * affinity error is returned unchanged.
++ */
++int rdmsr_safe_regs_on_pcpu(unsigned int pcpu, u32 *regs)
++{
++      int rc = xen_set_physical_cpu_affinity(pcpu);
++
++      if (rc == 0) {
++              rc = rdmsr_safe_regs(regs);
++              WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
++      } else if (rc == -EINVAL) {
++              /* Possibly caused by dom0_vcpus_pinned: use the vCPU path. */
++              rc = rdmsr_safe_regs_on_cpu(pcpu, regs);
++              if (!rc)
++                      rc = 1;
++      }
++
++      return rc;
++}
++EXPORT_SYMBOL_GPL(rdmsr_safe_regs_on_pcpu);
++
++/*
++ * Run wrmsr_safe_regs() with register block @regs on physical CPU @pcpu.
++ *
++ * If the affinity switch succeeds, the result of wrmsr_safe_regs() is
++ * returned.  If it fails with -EINVAL, wrmsr_safe_regs_on_cpu() is used
++ * instead and its error, or 1 when it succeeded, is returned.  Any other
++ * affinity error is returned unchanged.
++ */
++int wrmsr_safe_regs_on_pcpu(unsigned int pcpu, u32 *regs)
++{
++      int rc = xen_set_physical_cpu_affinity(pcpu);
++
++      if (rc == 0) {
++              rc = wrmsr_safe_regs(regs);
++              WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
++      } else if (rc == -EINVAL) {
++              /* Possibly caused by dom0_vcpus_pinned: use the vCPU path. */
++              rc = wrmsr_safe_regs_on_cpu(pcpu, regs);
++              if (!rc)
++                      rc = 1;
++      }
++
++      return rc;
++}
++EXPORT_SYMBOL_GPL(wrmsr_safe_regs_on_pcpu);
++
++#endif /* CONFIG_X86 */
++
++MODULE_LICENSE("GPL");
diff --cc drivers/xen/core/domctl.h

index 0000000,0000000..e8a26a2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/domctl.h
@@@ -1,0 -1,0 +1,4 @@@
++int xen_guest_address_size(int domid); /* 32 or 64; BITS_PER_LONG when no domctl version answers */
++int xen_guest_blkif_protocol(int domid); /* BLKIF_PROTOCOL_* value matching the guest's address size */
++int xen_set_physical_cpu_affinity(int pcpu); /* pcpu >= 0: restrict affinity to pcpu; pcpu < 0: restore the saved affinity */
++int xen_get_topology_info(unsigned int cpu, u32 *core, u32 *socket, u32 *node); /* each output pointer may be NULL */
diff --cc drivers/xen/core/evtchn.c

index 0000000,0000000..a8f5cae

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/evtchn.c
@@@ -1,0 -1,0 +1,1999 @@@
++/******************************************************************************
++ * evtchn.c
++ * 
++ * Communication via Xen event channels.
++ * 
++ * Copyright (c) 2002-2005, K A Fraser
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/irq.h>
++#include <linux/interrupt.h>
++#include <linux/sched.h>
++#include <linux/kernel_stat.h>
++#include <linux/ftrace.h>
++#include <linux/version.h>
++#include <asm/atomic.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>
++#include <asm/synch_bitops.h>
++#include <xen/evtchn.h>
++#include <xen/interface/event_channel.h>
++#include <xen/interface/physdev.h>
++#include <asm/hypervisor.h>
++#include <linux/mc146818rtc.h> /* RTC_IRQ */
++#include "../../../kernel/irq/internals.h" /* IRQS_AUTODETECT, IRQS_PENDING */
++
++/*
++ * This lock protects updates to the following mapping and reference-count
++ * arrays. The lock does not need to be acquired to read the mapping tables.
++ */
++static DEFINE_SPINLOCK(irq_mapping_update_lock);
++
++/*
++ * IRQ <-> event-channel mappings: evtchn_to_irq[port] is the IRQ recorded
++ * for that port; every entry is initialised to -1.
++ */
++static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
++      [0 ...  NR_EVENT_CHANNELS-1] = -1 };
++
++#if defined(CONFIG_SMP) && defined(CONFIG_X86) /* SMP x86: per-CPU VIRQ bookkeeping */
++static struct percpu_irqaction {
++      struct irqaction action; /* must be first */
++      struct percpu_irqaction *next;
++      cpumask_var_t cpus;
++} *virq_actions[NR_VIRQS];
++/*
++ * IRQ <-> VIRQ mapping: virq_per_cpu is the bitmap tested by
++ * BUG_IF_VIRQ_PER_CPU(); virq_to_evtchn is each CPU's VIRQ -> port table.
++ */
++static DECLARE_BITMAP(virq_per_cpu, NR_VIRQS) __read_mostly;
++static DEFINE_PER_CPU_READ_MOSTLY(int[NR_VIRQS], virq_to_evtchn);
++#define BUG_IF_VIRQ_PER_CPU(irq_cfg) \
++      BUG_ON(type_from_irq_cfg(irq_cfg) == IRQT_VIRQ \
++             && test_bit(index_from_irq_cfg(irq_cfg), virq_per_cpu))
++#else /* otherwise the check is a no-op and PER_CPU_VIRQ_IRQ selects the alternative code paths */
++#define BUG_IF_VIRQ_PER_CPU(irq_cfg) ((void)0)
++#define PER_CPU_VIRQ_IRQ
++#endif
++
++/*
++ * IRQ <-> IPI mapping. On SMP x86 there is one ipi_irq plus a per-CPU
++ * event channel (ipi_evtchn) and a per-CPU pending bitmap (ipi_pending);
++ * every other configuration defines PER_CPU_IPI_IRQ instead.
++ * BUG_IF_IPI() traps IRQT_IPI bindings unless both CONFIG_SMP and
++ * PER_CPU_IPI_IRQ are defined.
++ */
++#if defined(CONFIG_SMP) && defined(CONFIG_X86)
++static int __read_mostly ipi_irq = -1;
++DEFINE_PER_CPU(DECLARE_BITMAP(, NR_IPIS), ipi_pending);
++static DEFINE_PER_CPU_READ_MOSTLY(evtchn_port_t, ipi_evtchn);
++#else
++#define PER_CPU_IPI_IRQ
++#endif
++#if !defined(CONFIG_SMP) || !defined(PER_CPU_IPI_IRQ)
++#define BUG_IF_IPI(irq_cfg) BUG_ON(type_from_irq_cfg(irq_cfg) == IRQT_IPI)
++#else
++#define BUG_IF_IPI(irq_cfg) ((void)0)
++#endif
++
++/*
++ * Binding types. The value is stored in the top _IRQT_BITS bits of the
++ * packed irq_cfg info word (see mk_irq_info()). _IRQT_COUNT is the number
++ * of types, not a type itself.
++ */
++enum {
++      IRQT_UNBOUND,
++      IRQT_PIRQ,
++      IRQT_VIRQ,
++      IRQT_IPI,
++      IRQT_LOCAL_PORT,
++      IRQT_CALLER_PORT,
++      _IRQT_COUNT
++};
++
++#define _IRQT_BITS 4 /* width of the type field (top bits of the info word) */
++#define _EVTCHN_BITS 12 /* width of the event-channel field (low bits) */
++#define _INDEX_BITS (32 - _IRQT_BITS - _EVTCHN_BITS) /* the index gets the bits in between */
++
++/* Convenient shorthand for packed representation of an unbound IRQ. */
++#define IRQ_UNBOUND   (IRQT_UNBOUND << (32 - _IRQT_BITS))
++
++static struct irq_cfg _irq_cfg[] = { /* static table; every entry's info starts as IRQ_UNBOUND */
++      [0 ...
++#ifdef CONFIG_SPARSE_IRQ
++             BUILD_BUG_ON_ZERO(PIRQ_BASE) + NR_IRQS_LEGACY
++#else
++             NR_IRQS
++#endif
++                     - 1].info = IRQ_UNBOUND
++};
++
++/*
++ * Return the irq_cfg of @irq.  With CONFIG_SPARSE_IRQ it is the IRQ's chip
++ * data; otherwise it is the matching slot of the static _irq_cfg[] table,
++ * or NULL when @irq is not below NR_IRQS.
++ */
++static inline struct irq_cfg *__pure irq_cfg(unsigned int irq)
++{
++#ifndef CONFIG_SPARSE_IRQ
++      if (irq >= NR_IRQS)
++              return NULL;
++      return &_irq_cfg[irq];
++#else
++      return irq_get_chip_data(irq);
++#endif
++}
++
++/* Return the irq_cfg stored as chip data of @data. */
++static inline struct irq_cfg *__pure irq_data_cfg(struct irq_data *data)
++{
++      struct irq_cfg *cfg = irq_data_get_irq_chip_data(data);
++
++      return cfg;
++}
++
++/*
++ * Constructor for packed IRQ information: @type goes into the top
++ * _IRQT_BITS bits, @index is shifted above the _EVTCHN_BITS-wide port field
++ * and @evtchn fills the low bits.  An @index too wide for _INDEX_BITS is a
++ * BUG(); the remaining limits are enforced at build time.
++ */
++static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
++{
++      u32 info;
++
++      BUILD_BUG_ON(_IRQT_COUNT > (1U << _IRQT_BITS));
++      BUILD_BUG_ON(NR_PIRQS > (1U << _INDEX_BITS));
++      BUILD_BUG_ON(NR_VIRQS > (1U << _INDEX_BITS));
++#if defined(PER_CPU_IPI_IRQ) && defined(NR_IPIS)
++      BUILD_BUG_ON(NR_IPIS > (1U << _INDEX_BITS));
++#endif
++      BUILD_BUG_ON(NR_EVENT_CHANNELS > (1U << _EVTCHN_BITS));
++
++      BUG_ON(index >> _INDEX_BITS);
++
++      info = type << (32 - _IRQT_BITS);
++      info |= index << _EVTCHN_BITS;
++      info |= evtchn;
++
++      return info;
++}
++
++/*
++ * Accessors for packed IRQ information.
++ */
++
++/* Extract the _INDEX_BITS-wide index field from the packed cfg->info. */
++static inline unsigned int index_from_irq_cfg(const struct irq_cfg *cfg)
++{
++      const unsigned int mask = (1U << _INDEX_BITS) - 1;
++
++      return (cfg->info >> _EVTCHN_BITS) & mask;
++}
++
++/* Index field of @irq's packed info, or 0 when the IRQ has no irq_cfg. */
++static inline unsigned int index_from_irq(int irq)
++{
++      const struct irq_cfg *cfg = irq_cfg(irq);
++
++      if (!cfg)
++              return 0;
++
++      return index_from_irq_cfg(cfg);
++}
++
++/* Extract the binding type from the top _IRQT_BITS bits of cfg->info. */
++static inline unsigned int type_from_irq_cfg(const struct irq_cfg *cfg)
++{
++      const unsigned int shift = 32 - _IRQT_BITS;
++
++      return cfg->info >> shift;
++}
++
++/* Binding type of @irq, or IRQT_UNBOUND when the IRQ has no irq_cfg. */
++static inline unsigned int type_from_irq(int irq)
++{
++      const struct irq_cfg *cfg = irq_cfg(irq);
++
++      if (!cfg)
++              return IRQT_UNBOUND;
++
++      return type_from_irq_cfg(cfg);
++}
++
++static inline unsigned int evtchn_from_per_cpu_irq(const struct irq_cfg *cfg,
++                                                 unsigned int cpu)
++{ /* Event channel that the per-CPU binding described by @cfg uses on @cpu. */
++      switch (type_from_irq_cfg(cfg)) {
++#ifndef PER_CPU_VIRQ_IRQ
++      case IRQT_VIRQ: /* port comes from that CPU's virq_to_evtchn[] slot */
++              return per_cpu(virq_to_evtchn, cpu)[index_from_irq_cfg(cfg)];
++#endif
++#ifndef PER_CPU_IPI_IRQ
++      case IRQT_IPI: /* port comes from that CPU's ipi_evtchn */
++              return per_cpu(ipi_evtchn, cpu);
++#endif
++      }
++      BUG(); /* any other binding type must not be passed in */
++      return 0;
++}
++
++static inline unsigned int evtchn_from_irq_cfg(const struct irq_cfg *cfg)
++{ /* Event channel for @cfg: per-CPU lookup for the types below, else the low _EVTCHN_BITS bits of cfg->info. */
++      switch (type_from_irq_cfg(cfg)) {
++#ifndef PER_CPU_VIRQ_IRQ
++      case IRQT_VIRQ:
++#endif
++#ifndef PER_CPU_IPI_IRQ
++      case IRQT_IPI:
++#endif
++              return evtchn_from_per_cpu_irq(cfg, smp_processor_id()); /* resolved for the CPU we are running on */
++      }
++      return cfg->info & ((1U << _EVTCHN_BITS) - 1);
++}
++
++/* Event channel for @data's irq_cfg, or 0 when no irq_cfg is attached. */
++static inline unsigned int evtchn_from_irq_data(struct irq_data *data)
++{
++      const struct irq_cfg *cfg = irq_data_cfg(data);
++
++      if (!cfg)
++              return 0;
++
++      return evtchn_from_irq_cfg(cfg);
++}
++
++/* Event channel for @irq, or 0 when the IRQ has no irq_data. */
++static inline unsigned int evtchn_from_irq(int irq)
++{
++      struct irq_data *data = irq_get_irq_data(irq);
++
++      if (!data)
++              return 0;
++
++      return evtchn_from_irq_data(data);
++}
++
++/*
++ * Map event-channel @port to the IRQ recorded in evtchn_to_irq[].
++ *
++ * Unbound ports yield -1, which the unsigned return type delivers as
++ * UINT_MAX.  The function is exported, so a @port outside the table is
++ * treated like an unbound port instead of indexing past the array.
++ */
++unsigned int irq_from_evtchn(unsigned int port)
++{
++      if (port >= NR_EVENT_CHANNELS)
++              return -1;
++
++      return evtchn_to_irq[port];
++}
++EXPORT_SYMBOL_GPL(irq_from_evtchn);
++
++/* IRQ <-> VIRQ mapping: per-CPU table indexed by VIRQ; entries start as -1. */
++DEFINE_PER_CPU(int[NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
++
++#if defined(CONFIG_SMP) && defined(PER_CPU_IPI_IRQ)
++/* IRQ <-> IPI mapping: per-CPU table indexed by IPI; entries start as -1. */
++#ifndef NR_IPIS
++#define NR_IPIS 1
++#endif
++DEFINE_PER_CPU(int[NR_IPIS], ipi_to_irq) = {[0 ... NR_IPIS-1] = -1};
++#endif
++
++#ifdef CONFIG_SMP
++
++#if CONFIG_NR_CPUS <= 256 /* element type is chosen by the configured CPU count */
++static u8 cpu_evtchn[NR_EVENT_CHANNELS]; /* cpu_evtchn[port]: CPU the port was last bound to */
++#else
++static u16 cpu_evtchn[NR_EVENT_CHANNELS]; /* cpu_evtchn[port]: CPU the port was last bound to */
++#endif
++static DEFINE_PER_CPU(unsigned long[BITS_TO_LONGS(NR_EVENT_CHANNELS)],
++                    cpu_evtchn_mask); /* per-CPU bitmap of the ports bound to that CPU */
++
++/*
++ * Bits of word @idx that are pending, bound to the current CPU and not
++ * masked in the shared info page.
++ */
++static inline unsigned long active_evtchns(unsigned int idx)
++{
++      shared_info_t *shinfo = HYPERVISOR_shared_info;
++      unsigned long pending = shinfo->evtchn_pending[idx];
++      unsigned long ours = percpu_read(cpu_evtchn_mask[idx]);
++      unsigned long masked = shinfo->evtchn_mask[idx];
++
++      return pending & ours & ~masked;
++}
++
++/*
++ * Record that event channel @chn is now delivered to @cpu.  The channel
++ * must be masked when this is called (BUG otherwise).  When an IRQ is
++ * bound to the channel its affinity is updated: per-CPU IRQs gain @cpu,
++ * all others are set to exactly @cpu.
++ */
++static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
++{
++      shared_info_t *shinfo = HYPERVISOR_shared_info;
++      int irq = evtchn_to_irq[chn];
++
++      BUG_ON(!test_bit(chn, shinfo->evtchn_mask));
++
++      if (irq != -1) {
++              struct irq_data *data = irq_get_irq_data(irq);
++
++              if (irqd_is_per_cpu(data))
++                      cpumask_set_cpu(cpu, data->affinity);
++              else
++                      cpumask_copy(data->affinity, cpumask_of(cpu));
++      }
++
++      /* Move the port from the previous owner's bitmap to @cpu's. */
++      clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_evtchn[chn]));
++      set_bit(chn, per_cpu(cpu_evtchn_mask, cpu));
++      cpu_evtchn[chn] = cpu;
++}
++
++/*
++ * Reset the port -> CPU bookkeeping: every IRQ's affinity becomes CPU 0,
++ * cpu_evtchn[] is zeroed, CPU 0's port bitmap is filled with ones and all
++ * other CPUs' bitmaps are cleared.
++ */
++static void init_evtchn_cpu_bindings(void)
++{
++      unsigned int irq;
++      int cpu;
++
++      /* By default all event channels notify CPU#0. */
++      for (irq = 0; irq < nr_irqs; irq++) {
++              struct irq_data *data = irq_get_irq_data(irq);
++
++              if (!data)
++                      continue;
++              cpumask_copy(data->affinity, cpumask_of(0));
++      }
++
++      memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
++
++      for_each_possible_cpu(cpu) {
++              /* All-ones fill byte for CPU 0, zero for the rest. */
++              int fill = cpu ? 0 : -1;
++
++              memset(per_cpu(cpu_evtchn_mask, cpu), fill,
++                     sizeof(per_cpu(cpu_evtchn_mask, cpu)));
++      }
++}
++
++static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
++{ /* CPU recorded for @evtchn by bind_evtchn_to_cpu(); 0 after init_evtchn_cpu_bindings(). */
++      return cpu_evtchn[evtchn];
++}
++
++#else
++
++/*
++ * Non-SMP variant: bits of word @idx that are pending and not masked in
++ * the shared info page.
++ */
++static inline unsigned long active_evtchns(unsigned int idx)
++{
++      shared_info_t *shinfo = HYPERVISOR_shared_info;
++      unsigned long pending = shinfo->evtchn_pending[idx];
++      unsigned long masked = shinfo->evtchn_mask[idx];
++
++      return pending & ~masked;
++}
++
++static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
++{ /* Non-SMP stub: there is no per-CPU port bookkeeping to update. */
++}
++
++static void init_evtchn_cpu_bindings(void)
++{ /* Non-SMP stub: there is no per-CPU port bookkeeping to reset. */
++}
++
++static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
++{ /* Non-SMP stub: every event channel belongs to CPU 0. */
++      return 0;
++}
++
++#endif
++
++#ifdef CONFIG_X86
++void __init xen_init_IRQ(void); /* forward declaration; the definition is not in this part of the file */
++void __init init_IRQ(void)
++{ /* x86 only: calls irq_ctx_init(0), then xen_init_IRQ(). */
++      irq_ctx_init(0);
++      xen_init_IRQ();
++}
++#include <asm/idle.h>
++#endif
++
++/* Xen will never allocate port zero for any purpose. */
++#define VALID_EVTCHN(chn)     ((chn) != 0)
++
++/*
++ * Force a proper event-channel callback from Xen after clearing the
++ * callback mask. We do this in a very simple manner, by making a call
++ * down into Xen. The pending flag will be checked by Xen on return.
++ *
++ * The hypercall used is HYPERVISOR_xen_version(0, NULL).
++ * NOTE(review): VOID() presumably just discards the hypercall's result;
++ * confirm against its definition.
++ */
++void force_evtchn_callback(void)
++{
++      VOID(HYPERVISOR_xen_version(0, NULL));
++}
++/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
++EXPORT_SYMBOL(force_evtchn_callback);
++
++static DEFINE_PER_CPU(unsigned int, upcall_count); /* bumped on entry to evtchn_do_upcall(); non-zero makes a nested entry bail out */
++static DEFINE_PER_CPU(unsigned int, current_l1i); /* selector-word index at which evtchn_do_upcall() starts scanning */
++static DEFINE_PER_CPU(unsigned int, current_l2i); /* bit index at which evtchn_do_upcall() starts scanning within that word */
++
++#ifndef vcpu_info_xchg /* fallback when no definition was provided earlier */
++#define vcpu_info_xchg(fld, val) xchg(&current_vcpu_info()->fld, val)
++#endif
++
++#ifndef percpu_xadd /* fallback: add 'val' to a per-CPU variable with IRQs disabled; evaluates to the OLD value */
++#define percpu_xadd(var, val)                                 \
++({                                                            \
++      typeof(var) __tmp_var__;                                \
++      unsigned long flags;                                    \
++      local_irq_save(flags);                                  \
++      __tmp_var__ = get_cpu_var(var);                         \
++      __get_cpu_var(var) += (val);                            \
++      put_cpu_var(var);                                       \
++      local_irq_restore(flags);                               \
++      __tmp_var__;                                            \
++})
++#endif
++
++/*
++ * Event-channel upcall entry point.  Scans the pending-selector word (l1)
++ * and, for each selected word, the active event bits (l2), masking every
++ * pending port and passing its IRQ to handle_irq().
++ *
++ * The scan resumes at the per-CPU current_l1i/current_l2i position and
++ * wraps around, so the starting word can be visited twice (from start_l2i
++ * upwards first, the bits below start_l2i on the second visit).
++ *
++ * A nested invocation increments upcall_count and breaks out of the loop;
++ * the outermost invocation repeats the scan while the count it reads back
++ * is not 1.
++ *
++ * NB. Interrupts are disabled on entry.
++ */
++asmlinkage void __irq_entry evtchn_do_upcall(struct pt_regs *regs)
++{
++      struct pt_regs     *old_regs = set_irq_regs(regs);
++      unsigned long       l1, l2;
++      unsigned long       masked_l1, masked_l2;
++      unsigned int        l1i, l2i, start_l1i, start_l2i, port, count, i;
++      int                 irq;
++
++      exit_idle();
++      irq_enter();
++
++      do {
++              /* Avoid a callback storm when we reenable delivery. */
++              vcpu_info_write(evtchn_upcall_pending, 0);
++
++              /* Nested invocations bail immediately. */
++              if (unlikely(percpu_xadd(upcall_count, 1)))
++                      break;
++
++#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
++              /* Clear master flag /before/ clearing selector flag. */
++              wmb();
++#else
++              barrier();
++#endif
++
++#ifndef CONFIG_NO_HZ
++              /*
++               * Handle timer interrupts before all others, so that all
++               * hardirq handlers see an up-to-date system time even if we
++               * have just woken from a long idle period.
++               */
++#ifdef PER_CPU_VIRQ_IRQ
++              if ((irq = percpu_read(virq_to_irq[VIRQ_TIMER])) != -1) {
++                      port = evtchn_from_irq(irq);
++#else
++              port = percpu_read(virq_to_evtchn[VIRQ_TIMER]);
++              if (VALID_EVTCHN(port)) {
++#endif
++                      l1i = port / BITS_PER_LONG;
++                      l2i = port % BITS_PER_LONG;
++                      if (active_evtchns(l1i) & (1ul<<l2i)) {
++                              mask_evtchn(port);
++                              clear_evtchn(port);
++#ifndef PER_CPU_VIRQ_IRQ
++                              irq = evtchn_to_irq[port];
++                              BUG_ON(irq == -1);
++#endif
++                              if (!handle_irq(irq, regs))
++                                      BUG();
++                      }
++              }
++#endif /* CONFIG_NO_HZ */
++
++              l1 = vcpu_info_xchg(evtchn_pending_sel, 0); /* presumably fetch-and-zero */
++
++              start_l1i = l1i = percpu_read(current_l1i);
++              start_l2i = percpu_read(current_l2i);
++
++              for (i = 0; l1 != 0; i++) {
++                      masked_l1 = l1 & ((~0UL) << l1i); /* bits >= l1i only */
++                      /* If we masked out all events, wrap to beginning. */
++                      if (masked_l1 == 0) {
++                              l1i = l2i = 0;
++                              continue;
++                      }
++                      l1i = __ffs(masked_l1);
++
++                      l2 = active_evtchns(l1i);
++                      l2i = 0; /* usually scan entire word from start */
++                      if (l1i == start_l1i) {
++                              /* We scan the starting word in two parts. */
++                              if (i == 0)
++                                      /* 1st time: start in the middle */
++                                      l2i = start_l2i;
++                              else
++                                      /* 2nd time: mask bits done already */
++                                      l2 &= (1ul << start_l2i) - 1;
++                      }
++
++                      do {
++                              bool handled = false;
++
++                              masked_l2 = l2 & ((~0UL) << l2i); /* bits >= l2i only */
++                              if (masked_l2 == 0)
++                                      break;
++                              l2i = __ffs(masked_l2);
++
++                              /* process port */
++                              port = (l1i * BITS_PER_LONG) + l2i;
++                              mask_evtchn(port);
++                              if ((irq = evtchn_to_irq[port]) != -1) {
++#ifndef PER_CPU_IPI_IRQ
++                                      if (port != percpu_read(ipi_evtchn))
++#endif
++                                              clear_evtchn(port);
++                                      handled = handle_irq(irq, regs);
++                              }
++                              if (!handled && printk_ratelimit())
++                                      pr_emerg("No handler for irq %d"
++                                               " (port %u)\n",
++                                               irq, port);
++
++                              l2i = (l2i + 1) % BITS_PER_LONG;
++
++                              /* Next caller starts at last processed + 1 */
++                              percpu_write(current_l1i,
++                                      l2i ? l1i : (l1i + 1) % BITS_PER_LONG);
++                              percpu_write(current_l2i, l2i);
++
++                      } while (l2i != 0);
++
++                      /* Scan start_l1i twice; all others once. */
++                      if ((l1i != start_l1i) || (i != 0))
++                              l1 &= ~(1UL << l1i);
++
++                      l1i = (l1i + 1) % BITS_PER_LONG;
++              }
++
++              /* If there were nested callbacks then we have more to do. */
++              count = percpu_read(upcall_count);
++              percpu_write(upcall_count, 0);
++      } while (unlikely(count != 1));
++
++      irq_exit();
++      set_irq_regs(old_regs);
++}
++
++/*
++ * Scan the dynamic IRQ range for an IRQ that has no bindings, install @chip
++ * and the matching flow handler on it, and hand its irq_cfg back via @pcfg.
++ *
++ * @node:   NUMA node passed to alloc_irq_and_cfg_at().
++ * @pcfg:   receives the chosen IRQ's irq_cfg on success.
++ * @chip:   irq_chip to install; IRQs owned by a different chip are skipped.
++ * @percpu: selects handle_percpu_irq instead of handle_fasteoi_irq.
++ *
++ * Returns the IRQ number, -ENOMEM if alloc_irq_and_cfg_at() fails, or
++ * -ENOSPC (after warning once) when every candidate is in use.
++ */
++static int find_unbound_irq(unsigned int node, struct irq_cfg **pcfg,
++                          struct irq_chip *chip, bool percpu)
++{
++      static int warned;
++      int irq;
++
++      for (irq = DYNIRQ_BASE; irq < nr_irqs; irq++) {
++              struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
++              struct irq_data *data = irq_get_irq_data(irq);
++
++              if (unlikely(!cfg))
++                      return -ENOMEM;
++
++              /* Owned by another chip, or still bound: try the next one. */
++              if (data->chip != &no_irq_chip && data->chip != chip)
++                      continue;
++              if (cfg->bindcount)
++                      continue;
++
++              *pcfg = cfg;
++              irq_set_noprobe(irq);
++              if (percpu)
++                      irq_set_chip_and_handler_name(irq, chip,
++                                                    handle_percpu_irq,
++                                                    "percpu");
++              else
++                      irq_set_chip_and_handler_name(irq, chip,
++                                                    handle_fasteoi_irq,
++                                                    "fasteoi");
++              return irq;
++      }
++
++      /* Nothing free: complain the first time only. */
++      if (!warned) {
++              warned = 1;
++              pr_warning("No available IRQ to bind to: "
++                         "increase NR_DYNIRQS.\n");
++      }
++
++      return -ENOSPC;
++}
++
++static struct irq_chip dynirq_chip;     /* forward declaration; initialised further down */
++
++/*
++ * Return the IRQ mapped to @caller_port, allocating one (tagged
++ * IRQT_CALLER_PORT) if the port has none yet, and take a binding reference.
++ * Returns a negative errno if no IRQ could be allocated.
++ */
++static int bind_caller_port_to_irq(unsigned int caller_port)
++{
++      struct irq_cfg *cfg;
++      int irq;
++
++      spin_lock(&irq_mapping_update_lock);
++
++      irq = evtchn_to_irq[caller_port];
++      if (irq != -1) {
++              cfg = irq_cfg(irq);
++      } else {
++              irq = find_unbound_irq(numa_node_id(), &cfg,
++                                     &dynirq_chip, false);
++              if (irq < 0)
++                      goto unlock;
++
++              evtchn_to_irq[caller_port] = irq;
++              cfg->info = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port);
++      }
++
++      cfg->bindcount++;
++
++ unlock:
++      spin_unlock(&irq_mapping_update_lock);
++      return irq;
++}
++
++/*
++ * Bind the local event channel @local_port to a freshly allocated dynamic
++ * IRQ.  The port must not be mapped to an IRQ yet.  If no IRQ is available
++ * the port is closed and the negative errno from find_unbound_irq() is
++ * returned.
++ */
++static int bind_local_port_to_irq(unsigned int local_port)
++{
++      struct irq_cfg *cfg;
++      int irq;
++
++      spin_lock(&irq_mapping_update_lock);
++
++      BUG_ON(evtchn_to_irq[local_port] != -1);
++
++      irq = find_unbound_irq(numa_node_id(), &cfg, &dynirq_chip, false);
++      if (irq >= 0) {
++              evtchn_to_irq[local_port] = irq;
++              cfg->info = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port);
++              cfg->bindcount++;
++      } else if (close_evtchn(local_port)) {
++              /* Could not even release the port we were handed. */
++              BUG();
++      }
++
++      spin_unlock(&irq_mapping_update_lock);
++      return irq;
++}
++
++/*
++ * Allocate an unbound event channel that @remote_domain may connect to and
++ * bind it to an IRQ.  Returns the IRQ, or the error from either step.
++ */
++static int bind_listening_port_to_irq(unsigned int remote_domain)
++{
++      struct evtchn_alloc_unbound op;
++      int rc;
++
++      op.dom        = DOMID_SELF;
++      op.remote_dom = remote_domain;
++
++      rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op);
++      if (rc)
++              return rc;
++
++      return bind_local_port_to_irq(op.port);
++}
++
++/*
++ * Connect to @remote_port in @remote_domain and bind the resulting local
++ * port to an IRQ.  Returns the IRQ, or the error from either step.
++ */
++static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
++                                        unsigned int remote_port)
++{
++      struct evtchn_bind_interdomain op;
++      int rc;
++
++      op.remote_dom  = remote_domain;
++      op.remote_port = remote_port;
++
++      rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &op);
++      if (rc)
++              return rc;
++
++      return bind_local_port_to_irq(op.local_port);
++}
++
++static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
++{
++      struct evtchn_bind_virq bind_virq;
++      struct irq_cfg *cfg;
++      int evtchn, irq;
++
++      spin_lock(&irq_mapping_update_lock);
++
++      if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
++              if ((irq = find_unbound_irq(cpu_to_node(cpu), &cfg,
++                                          &dynirq_chip, false)) < 0)
++                      goto out;
++
++              bind_virq.virq = virq;
++              bind_virq.vcpu = cpu;
++              if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
++                                              &bind_virq) != 0)
++                      BUG();
++              evtchn = bind_virq.port;
++
++              evtchn_to_irq[evtchn] = irq;
++#ifndef PER_CPU_VIRQ_IRQ
++              {
++                      unsigned int cpu;
++
++                      for_each_possible_cpu(cpu)
++                              per_cpu(virq_to_evtchn, cpu)[virq] = evtchn;
++              }
++#endif
++              cfg->info = mk_irq_info(IRQT_VIRQ, virq, evtchn);
++
++              per_cpu(virq_to_irq, cpu)[virq] = irq;
++
++              bind_evtchn_to_cpu(evtchn, cpu);
++      } else
++              cfg = irq_cfg(irq);
++
++      cfg->bindcount++;
++
++ out:
++      spin_unlock(&irq_mapping_update_lock);
++      return irq;
++}
++
++#if defined(CONFIG_SMP) && defined(PER_CPU_IPI_IRQ)
++/*
++ * Return the IRQ bound to @ipi on @cpu, creating the binding (IRQ plus a
++ * new IPI event channel routed to @cpu) on first use, and take a binding
++ * reference.  Returns a negative errno if no IRQ could be allocated.
++ */
++static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
++{
++      struct evtchn_bind_ipi op;
++      struct irq_cfg *cfg;
++      int port, irq;
++
++      spin_lock(&irq_mapping_update_lock);
++
++      irq = per_cpu(ipi_to_irq, cpu)[ipi];
++      if (irq != -1) {
++              cfg = irq_cfg(irq);
++      } else {
++              irq = find_unbound_irq(cpu_to_node(cpu), &cfg,
++                                     &dynirq_chip, false);
++              if (irq < 0)
++                      goto unlock;
++
++              op.vcpu = cpu;
++              if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &op) != 0)
++                      BUG();
++              port = op.port;
++
++              evtchn_to_irq[port] = irq;
++              cfg->info = mk_irq_info(IRQT_IPI, ipi, port);
++
++              per_cpu(ipi_to_irq, cpu)[ipi] = irq;
++
++              bind_evtchn_to_cpu(port, cpu);
++      }
++
++      cfg->bindcount++;
++
++ unlock:
++      spin_unlock(&irq_mapping_update_lock);
++      return irq;
++}
++#endif
++
++static void unbind_from_irq(unsigned int irq)
++{     /*
++       * Drop one binding reference on @irq.  When the last reference goes
++       * away and the IRQ still has a valid event channel, the channel is
++       * closed (except for IRQT_CALLER_PORT bindings), the per-CPU VIRQ/IPI
++       * lookup entries are reset, the port is re-routed to CPU 0 in the
++       * local bookkeeping, and the IRQ is marked IRQ_UNBOUND and cleaned up.
++       */
++      struct irq_cfg *cfg = irq_cfg(irq);
++      int evtchn = evtchn_from_irq_cfg(cfg);
++
++      BUG_IF_VIRQ_PER_CPU(cfg);
++      BUG_IF_IPI(cfg);
++
++      spin_lock(&irq_mapping_update_lock);
++
++      if (!--cfg->bindcount && VALID_EVTCHN(evtchn)) {
++              if ((type_from_irq_cfg(cfg) != IRQT_CALLER_PORT) &&
++                  close_evtchn(evtchn))
++                      BUG();
++
++              switch (type_from_irq_cfg(cfg)) {
++              case IRQT_VIRQ:
++                      per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
++                              [index_from_irq_cfg(cfg)] = -1;
++#ifndef PER_CPU_VIRQ_IRQ
++                      {
++                              unsigned int cpu;
++
++                              for_each_possible_cpu(cpu)
++                                      per_cpu(virq_to_evtchn, cpu)
++                                              [index_from_irq_cfg(cfg)] = 0;
++                      }
++#endif
++                      break;
++#if defined(CONFIG_SMP) && defined(PER_CPU_IPI_IRQ)
++              case IRQT_IPI:
++                      per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
++                              [index_from_irq_cfg(cfg)] = -1;
++                      break;
++#endif
++              default:
++                      break;
++              }
++
++              /* Closed ports are implicitly re-bound to VCPU0. */
++              bind_evtchn_to_cpu(evtchn, 0);
++
++              evtchn_to_irq[evtchn] = -1;
++              cfg->info = IRQ_UNBOUND;
++
++              dynamic_irq_cleanup(irq);
++      }
++
++      spin_unlock(&irq_mapping_update_lock);
++}
++
++#if !defined(PER_CPU_IPI_IRQ) || !defined(PER_CPU_VIRQ_IRQ)
++/*
++ * Allocate a zeroed percpu_irqaction together with its CPU mask.
++ *
++ * @gfp: allocation flags, applied to both the structure and the cpumask
++ *       (the structure allocation used to hard-code GFP_ATOMIC and ignore
++ *       the caller's flags).
++ *
++ * Returns the new action, or NULL if either allocation fails; a partially
++ * built action is freed before returning.
++ */
++static inline struct percpu_irqaction *alloc_percpu_irqaction(gfp_t gfp)
++{
++      struct percpu_irqaction *new = kzalloc(sizeof(*new), gfp);
++
++      if (new && !zalloc_cpumask_var(&new->cpus, gfp)) {
++              kfree(new);
++              new = NULL;
++      }
++      return new;
++}
++
++/* Release @action and its CPU mask; a NULL @action is ignored. */
++static inline void free_percpu_irqaction(struct percpu_irqaction *action)
++{
++      if (action) {
++              free_cpumask_var(action->cpus);
++              kfree(action);
++      }
++}
++
++void unbind_from_per_cpu_irq(unsigned int irq, unsigned int cpu,
++                           struct irqaction *action)
++{     /*
++       * Undo the binding of per-CPU IRQ @irq on @cpu: mask and close that
++       * CPU's event channel, reset the per-CPU port lookup and the
++       * evtchn_to_irq entry, and drop one binding reference.
++       *
++       * For VIRQs (without PER_CPU_VIRQ_IRQ), @action selects the entry on
++       * virq_actions[] whose dev_id equals it; @cpu is removed from that
++       * entry's mask and the entry is unlinked and freed once its mask is
++       * empty.  If a different entry still covers @cpu, evtchn is zeroed
++       * so the port is left open (assumes 0 fails VALID_EVTCHN()).
++       */
++      struct evtchn_close close;
++      struct irq_data *data = irq_get_irq_data(irq);
++      struct irq_cfg *cfg = irq_data_cfg(data);
++      int evtchn = evtchn_from_per_cpu_irq(cfg, cpu);
++      struct percpu_irqaction *free_action = NULL;
++
++      spin_lock(&irq_mapping_update_lock);
++
++      if (VALID_EVTCHN(evtchn)) {
++              mask_evtchn(evtchn);
++
++              BUG_ON(cfg->bindcount <= 1);
++              cfg->bindcount--;
++
++#ifndef PER_CPU_VIRQ_IRQ
++              if (type_from_irq_cfg(cfg) == IRQT_VIRQ) {
++                      unsigned int virq = index_from_irq_cfg(cfg);
++                      struct percpu_irqaction *cur, *prev = NULL;
++
++                      cur = virq_actions[virq];
++                      while (cur) {
++                              if (cur->action.dev_id == action) {
++                                      cpumask_clear_cpu(cpu, cur->cpus);
++                                      if (cpumask_empty(cur->cpus)) {
++                                              WARN_ON(free_action);
++                                              if (prev)
++                                                      prev->next = cur->next;
++                                              else
++                                                      virq_actions[virq]
++                                                              = cur->next;
++                                              free_action = cur;
++                                      }
++                              } else if (cpumask_test_cpu(cpu, cur->cpus))
++                                      evtchn = 0; /* keep the port open */
++                              cur = (prev = cur)->next;
++                      }
++                      if (!VALID_EVTCHN(evtchn))
++                              goto done;
++              }
++#endif
++
++              cpumask_clear_cpu(cpu, data->affinity);
++
++              close.port = evtchn;
++              if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
++                      BUG();
++
++              switch (type_from_irq_cfg(cfg)) {
++#ifndef PER_CPU_VIRQ_IRQ
++              case IRQT_VIRQ:
++                      per_cpu(virq_to_evtchn, cpu)
++                              [index_from_irq_cfg(cfg)] = 0;
++                      break;
++#endif
++#ifndef PER_CPU_IPI_IRQ
++              case IRQT_IPI:
++                      per_cpu(ipi_evtchn, cpu) = 0;
++                      break;
++#endif
++              default:
++                      BUG();
++                      break;
++              }
++
++              /* Closed ports are implicitly re-bound to VCPU0. */
++              bind_evtchn_to_cpu(evtchn, 0);
++
++              evtchn_to_irq[evtchn] = -1;
++      }
++
++#ifndef PER_CPU_VIRQ_IRQ
++done:
++#endif
++      spin_unlock(&irq_mapping_update_lock);
++
++      if (free_action) { /* released only after the lock is dropped */
++              free_irq(irq, free_action->action.dev_id);
++              free_percpu_irqaction(free_action);
++      }
++}
++EXPORT_SYMBOL_GPL(unbind_from_per_cpu_irq);
++#endif /* !PER_CPU_IPI_IRQ || !PER_CPU_VIRQ_IRQ */
++
++/*
++ * Bind @caller_port to an IRQ and install @handler on it with request_irq().
++ * Returns the IRQ number on success.  On failure a negative errno is
++ * returned and, if the IRQ had already been bound, the binding is dropped.
++ */
++int bind_caller_port_to_irqhandler(
++      unsigned int caller_port,
++      irq_handler_t handler,
++      unsigned long irqflags,
++      const char *devname,
++      void *dev_id)
++{
++      int irq = bind_caller_port_to_irq(caller_port);
++      int err;
++
++      if (irq < 0)
++              return irq;
++
++      err = request_irq(irq, handler, irqflags, devname, dev_id);
++      if (err == 0)
++              return irq;
++
++      unbind_from_irq(irq);
++      return err;
++}
++EXPORT_SYMBOL_GPL(bind_caller_port_to_irqhandler);
++
++/*
++ * Allocate a listening port for @remote_domain, bind it to an IRQ and
++ * install @handler on it with request_irq().  Returns the IRQ number on
++ * success.  On failure a negative errno is returned and, if the IRQ had
++ * already been bound, the binding is dropped.
++ */
++int bind_listening_port_to_irqhandler(
++      unsigned int remote_domain,
++      irq_handler_t handler,
++      unsigned long irqflags,
++      const char *devname,
++      void *dev_id)
++{
++      int irq = bind_listening_port_to_irq(remote_domain);
++      int err;
++
++      if (irq < 0)
++              return irq;
++
++      err = request_irq(irq, handler, irqflags, devname, dev_id);
++      if (err == 0)
++              return irq;
++
++      unbind_from_irq(irq);
++      return err;
++}
++EXPORT_SYMBOL_GPL(bind_listening_port_to_irqhandler);
++
++/*
++ * Connect to @remote_port in @remote_domain, bind the local end to an IRQ
++ * and install @handler on it with request_irq().  Returns the IRQ number on
++ * success.  On failure a negative errno is returned and, if the IRQ had
++ * already been bound, the binding is dropped.
++ */
++int bind_interdomain_evtchn_to_irqhandler(
++      unsigned int remote_domain,
++      unsigned int remote_port,
++      irq_handler_t handler,
++      unsigned long irqflags,
++      const char *devname,
++      void *dev_id)
++{
++      int irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
++      int err;
++
++      if (irq < 0)
++              return irq;
++
++      err = request_irq(irq, handler, irqflags, devname, dev_id);
++      if (err == 0)
++              return irq;
++
++      unbind_from_irq(irq);
++      return err;
++}
++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
++
++/*
++ * Bind @virq on @cpu to an IRQ and install @handler on it with
++ * request_irq().  Without PER_CPU_VIRQ_IRQ, VIRQs flagged in virq_per_cpu
++ * must not be bound through this interface.  Returns the IRQ number on
++ * success.  On failure a negative errno is returned and, if the IRQ had
++ * already been bound, the binding is dropped.
++ */
++int bind_virq_to_irqhandler(
++      unsigned int virq,
++      unsigned int cpu,
++      irq_handler_t handler,
++      unsigned long irqflags,
++      const char *devname,
++      void *dev_id)
++{
++      int irq, err;
++
++#ifndef PER_CPU_VIRQ_IRQ
++      BUG_ON(test_bit(virq, virq_per_cpu));
++#endif
++
++      irq = bind_virq_to_irq(virq, cpu);
++      if (irq < 0)
++              return irq;
++
++      err = request_irq(irq, handler, irqflags, devname, dev_id);
++      if (err == 0)
++              return irq;
++
++      unbind_from_irq(irq);
++      return err;
++}
++EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
++
++#ifdef CONFIG_SMP
++#ifndef PER_CPU_VIRQ_IRQ
++/*
++ * Register @action as a per-CPU handler for @virq on @cpu.
++ *
++ * A private percpu_irqaction copy of @action is kept on virq_actions[virq],
++ * keyed by the caller's @action pointer (stored as dev_id) and carrying a
++ * cpumask of the CPUs it is bound on.  The first binding of a VIRQ also
++ * allocates the IRQ, which is then published for every possible CPU; each
++ * CPU gets its own event channel.
++ *
++ * @virq:   must be flagged in virq_per_cpu.
++ * @cpu:    CPU whose event channel is bound.
++ * @action: handler description; @action->dev_id must be NULL.
++ *
++ * Returns the IRQ number on success, -EINVAL if @action->dev_id is set,
++ * -ENOMEM if a needed percpu_irqaction could not be allocated, or the
++ * negative errno from find_unbound_irq()/setup_irq().
++ */
++int bind_virq_to_irqaction(
++      unsigned int virq,
++      unsigned int cpu,
++      struct irqaction *action)
++{
++      struct evtchn_bind_virq bind_virq;
++      struct irq_cfg *cfg;
++      int evtchn, irq, retval = 0;
++      struct percpu_irqaction *cur = NULL, *new;
++
++      BUG_ON(!test_bit(virq, virq_per_cpu));
++
++      if (action->dev_id)
++              return -EINVAL;
++
++      /* Allocate before taking the lock; dropped again if not needed. */
++      new = alloc_percpu_irqaction(GFP_ATOMIC);
++      if (new) {
++              new->action = *action;
++              new->action.dev_id = action;
++      }
++
++      spin_lock(&irq_mapping_update_lock);
++
++      for (cur = virq_actions[virq]; cur; cur = cur->next)
++              if (cur->action.dev_id == action)
++                      break;
++      if (!cur) {
++              if (!new) {
++                      spin_unlock(&irq_mapping_update_lock);
++                      return -ENOMEM;
++              }
++              new->next = virq_actions[virq];
++              virq_actions[virq] = cur = new;
++              new = NULL;
++              retval = 1; /* first registration of this action */
++      }
++      cpumask_set_cpu(cpu, cur->cpus);
++      action = &cur->action;
++
++      if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
++              unsigned int nr;
++
++              BUG_ON(!retval);
++
++              if ((irq = find_unbound_irq(cpu_to_node(cpu), &cfg,
++                                          &dynirq_chip, true)) < 0) {
++                      /*
++                       * retval != 0 here, so "cur" is the entry queued just
++                       * above (the list head) and "new" is already NULL.
++                       * Unlink and free "cur" itself; freeing "new" would
++                       * be a no-op and leak the entry.
++                       */
++                      virq_actions[virq] = cur->next;
++                      spin_unlock(&irq_mapping_update_lock);
++                      free_percpu_irqaction(cur);
++                      return irq;
++              }
++
++              /* Extra reference so count will never drop to zero. */
++              cfg->bindcount++;
++
++              for_each_possible_cpu(nr)
++                      per_cpu(virq_to_irq, nr)[virq] = irq;
++              cfg->info = mk_irq_info(IRQT_VIRQ, virq, 0);
++      } else
++              cfg = irq_cfg(irq);
++
++      evtchn = per_cpu(virq_to_evtchn, cpu)[virq];
++      if (!VALID_EVTCHN(evtchn)) {
++              bind_virq.virq = virq;
++              bind_virq.vcpu = cpu;
++              if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
++                                              &bind_virq) != 0)
++                      BUG();
++              evtchn = bind_virq.port;
++              evtchn_to_irq[evtchn] = irq;
++              per_cpu(virq_to_evtchn, cpu)[virq] = evtchn;
++
++              bind_evtchn_to_cpu(evtchn, cpu);
++      }
++
++      cfg->bindcount++;
++
++      spin_unlock(&irq_mapping_update_lock);
++
++      /* Non-NULL only when an existing entry was reused. */
++      free_percpu_irqaction(new);
++
++      if (retval == 0) {
++              unsigned long flags;
++
++              /* Action already installed: just open this CPU's port. */
++              local_irq_save(flags);
++              unmask_evtchn(evtchn);
++              local_irq_restore(flags);
++      } else {
++              action->flags |= IRQF_PERCPU;
++              retval = setup_irq(irq, action);
++              if (retval) {
++                      /*
++                       * NOTE(review): "action" now points at cur->action,
++                       * while unbind_from_per_cpu_irq() matches entries on
++                       * dev_id (the caller's original pointer) -- confirm
++                       * this cleanup actually finds the entry.
++                       */
++                      unbind_from_per_cpu_irq(irq, cpu, action);
++                      BUG_ON(retval > 0);
++                      irq = retval;
++              }
++      }
++
++      return irq;
++}
++EXPORT_SYMBOL_GPL(bind_virq_to_irqaction);
++#endif
++
++#ifdef PER_CPU_IPI_IRQ
++/*
++ * Bind @ipi on @cpu to an IRQ and install @handler on it with request_irq(),
++ * always adding IRQF_NO_SUSPEND to @irqflags.  Returns the IRQ number on
++ * success.  On failure a negative errno is returned and, if the IRQ had
++ * already been bound, the binding is dropped.
++ */
++int bind_ipi_to_irqhandler(
++      unsigned int ipi,
++      unsigned int cpu,
++      irq_handler_t handler,
++      unsigned long irqflags,
++      const char *devname,
++      void *dev_id)
++{
++      int irq = bind_ipi_to_irq(ipi, cpu);
++      int err;
++
++      if (irq < 0)
++              return irq;
++
++      err = request_irq(irq, handler, irqflags | IRQF_NO_SUSPEND,
++                        devname, dev_id);
++      if (err == 0)
++              return irq;
++
++      unbind_from_irq(irq);
++      return err;
++}
++#else
++int __cpuinit bind_ipi_to_irqaction(
++      unsigned int cpu,
++      struct irqaction *action)
++{     /*
++       * Bind a new IPI event channel for @cpu to ipi_irq, allocating that
++       * IRQ on the first call (while ipi_irq is still negative).
++       *
++       * @action is installed, with IRQF_PERCPU | IRQF_NO_SUSPEND added,
++       * only by the call that allocates the IRQ; later calls just unmask
++       * the new event channel.
++       *
++       * Returns ipi_irq on success, -EBUSY if @cpu already has a valid IPI
++       * event channel, or a negative errno from find_unbound_irq() or
++       * setup_irq().
++       */
++      struct evtchn_bind_ipi bind_ipi;
++      struct irq_cfg *cfg;
++      int evtchn, retval = 0;
++
++      spin_lock(&irq_mapping_update_lock);
++
++      if (VALID_EVTCHN(per_cpu(ipi_evtchn, cpu))) {
++              spin_unlock(&irq_mapping_update_lock);
++              return -EBUSY;
++      }
++
++      if (ipi_irq < 0) {
++              if ((ipi_irq = find_unbound_irq(cpu_to_node(cpu), &cfg,
++                                              &dynirq_chip, true)) < 0) {
++                      spin_unlock(&irq_mapping_update_lock);
++                      return ipi_irq;
++              }
++
++              /* Extra reference so count will never drop to zero. */
++              cfg->bindcount++;
++
++              cfg->info = mk_irq_info(IRQT_IPI, 0, 0);
++              retval = 1; /* IRQ newly allocated: action still to be set up */
++      } else
++              cfg = irq_cfg(ipi_irq);
++
++      bind_ipi.vcpu = cpu;
++      if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi))
++              BUG();
++
++      evtchn = bind_ipi.port;
++      evtchn_to_irq[evtchn] = ipi_irq;
++      per_cpu(ipi_evtchn, cpu) = evtchn;
++
++      bind_evtchn_to_cpu(evtchn, cpu);
++
++      cfg->bindcount++;
++
++      spin_unlock(&irq_mapping_update_lock);
++
++      if (retval == 0) {
++              unsigned long flags;
++
++              local_irq_save(flags);
++              unmask_evtchn(evtchn);
++              local_irq_restore(flags);
++      } else {
++              action->flags |= IRQF_PERCPU | IRQF_NO_SUSPEND;
++              retval = setup_irq(ipi_irq, action);
++              if (retval) {
++                      unbind_from_per_cpu_irq(ipi_irq, cpu, NULL);
++                      BUG_ON(retval > 0);
++                      ipi_irq = retval; /* negative again: a later call re-allocates */
++              }
++      }
++
++      return ipi_irq;
++}
++#endif /* PER_CPU_IPI_IRQ */
++#endif /* CONFIG_SMP */
++
++/*
++ * Counterpart of the bind_*_to_irqhandler() helpers: remove the handler
++ * registered with @dev_id from @irq, then drop the IRQ's binding reference.
++ */
++void unbind_from_irqhandler(unsigned int irq, void *dev_id)
++{
++      free_irq(irq, dev_id);
++      unbind_from_irq(irq);
++}
++EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
++
++#ifdef CONFIG_SMP
++/*
++ * Ask Xen to deliver event channel @port to @cpu.  The port is masked for
++ * the duration of the switch and its previous mask state is restored
++ * afterwards; the local port-to-CPU bookkeeping is only updated if the
++ * hypercall succeeds.
++ */
++void rebind_evtchn_to_cpu(int port, unsigned int cpu)
++{
++      struct evtchn_bind_vcpu op = { .port = port, .vcpu = cpu };
++      int was_masked = test_and_set_evtchn_mask(port);
++
++      if (!HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &op))
++              bind_evtchn_to_cpu(port, cpu);
++
++      if (!was_masked)
++              unmask_evtchn(port);
++}
++
++/*
++ * Move the event channel behind @data to CPU @tcpu.  Does nothing if the
++ * IRQ currently has no valid event channel.
++ */
++static void rebind_irq_to_cpu(struct irq_data *data, unsigned int tcpu)
++{
++      const struct irq_cfg *cfg = irq_data_cfg(data);
++      int port = evtchn_from_irq_cfg(cfg);
++
++      BUG_IF_VIRQ_PER_CPU(cfg);
++      BUG_IF_IPI(cfg);
++
++      if (!VALID_EVTCHN(port))
++              return;
++
++      rebind_evtchn_to_cpu(port, tcpu);
++}
++
++/*
++ * irq_chip affinity hook: route the IRQ to the first CPU in @dest.  @force
++ * is ignored and the call always reports success.
++ */
++static int set_affinity_irq(struct irq_data *data,
++                          const struct cpumask *dest, bool force)
++{
++      unsigned int target = cpumask_first(dest);
++
++      rebind_irq_to_cpu(data, target);
++      return 0;
++}
++#endif
++
++/*
++ * irq_chip retrigger hook: mark the IRQ's event channel pending again.  The
++ * port is masked while the pending bit is set and its previous mask state
++ * is restored afterwards.  Always returns 1, even when the IRQ has no valid
++ * event channel.
++ */
++int resend_irq_on_evtchn(struct irq_data *data)
++{
++      int port = evtchn_from_irq_data(data);
++
++      if (VALID_EVTCHN(port)) {
++              int was_masked = test_and_set_evtchn_mask(port);
++
++              set_evtchn(port);
++              if (!was_masked)
++                      unmask_evtchn(port);
++      }
++
++      return 1;
++}
++
++/*
++ * Interface to generic handling in irq.c
++ */
++
++/* Unmask the event channel behind @data, if it currently has a valid one. */
++static void unmask_dynirq(struct irq_data *data)
++{
++      int port = evtchn_from_irq_data(data);
++
++      if (!VALID_EVTCHN(port))
++              return;
++
++      unmask_evtchn(port);
++}
++
++/* Mask the event channel behind @data, if it currently has a valid one. */
++static void mask_dynirq(struct irq_data *data)
++{
++      int port = evtchn_from_irq_data(data);
++
++      if (!VALID_EVTCHN(port))
++              return;
++
++      mask_evtchn(port);
++}
++
++/* irq_chip startup hook: starting a dynamic IRQ simply unmasks it. */
++static unsigned int startup_dynirq(struct irq_data *data)
++{
++      unmask_dynirq(data);
++
++      return 0;
++}
++
++#define shutdown_dynirq mask_dynirq /* shutdown only masks the event channel */
++
++/*
++ * irq_chip EOI hook: unless the IRQ has been disabled meanwhile, carry out
++ * any pending migration while it is still masked and then unmask it.
++ */
++static void end_dynirq(struct irq_data *data)
++{
++      if (irqd_irq_disabled(data))
++              return;
++
++      irq_move_masked_irq(data);
++      unmask_dynirq(data);
++}
++
++static struct irq_chip dynirq_chip = {  /*
++       * irq_chip for the dynamically allocated event-channel IRQs.
++       * Startup/enable/unmask all unmask the event channel, and
++       * shutdown/disable/mask all mask it.
++       */
++      .name             = "Dynamic",
++      .irq_startup      = startup_dynirq,
++      .irq_shutdown     = shutdown_dynirq,
++      .irq_enable       = unmask_dynirq,
++      .irq_disable      = mask_dynirq,
++      .irq_mask         = mask_dynirq,
++      .irq_unmask       = unmask_dynirq,
++      .irq_eoi          = end_dynirq,
++#ifdef CONFIG_SMP
++      .irq_set_affinity = set_affinity_irq,
++#endif
++      .irq_retrigger    = resend_irq_on_evtchn,
++};
++
++/*
++ * Bitmap indicating which PIRQs require Xen to be notified on unmask.
++ * pirq_unmask_and_notify() indexes it by Xen pirq number when
++ * pirq_eoi_does_unmask is set and by (irq - PIRQ_BASE) otherwise.
++ */
++static bool pirq_eoi_does_unmask;
++static unsigned long *pirq_needs_eoi;
++static DECLARE_BITMAP(probing_pirq, NR_PIRQS); /* set by set_type_pirq(), consumed by enable_pirq() */
++
++static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
++{     /*
++       * Unmask @evtchn for physical IRQ @irq and, where pirq_needs_eoi
++       * says so, send PHYSDEVOP_eoi to Xen.  Three cases:
++       *
++       *  - pirq_eoi_does_unmask set: the bitmap is indexed by Xen pirq
++       *    number; a flagged pirq gets only the EOI hypercall (which,
++       *    going by the flag's name, also unmasks -- TODO confirm), any
++       *    other pirq a plain unmask.
++       *  - flag clear, bit (irq - PIRQ_BASE) set: unmask plus EOI.  If
++       *    the event channel is bound to another CPU, both operations
++       *    are issued as a single multicall.
++       *  - otherwise: plain unmask.
++       */
++      struct physdev_eoi eoi = { .irq = evtchn_get_xen_pirq(irq) };
++
++      if (pirq_eoi_does_unmask) {
++              if (test_bit(eoi.irq, pirq_needs_eoi))
++                      VOID(HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi));
++              else
++                      unmask_evtchn(evtchn);
++      } else if (test_bit(irq - PIRQ_BASE, pirq_needs_eoi)) {
++              if (smp_processor_id() != cpu_from_evtchn(evtchn)) {
++                      struct evtchn_unmask unmask = { .port = evtchn };
++                      struct multicall_entry mcl[2];
++
++                      mcl[0].op = __HYPERVISOR_event_channel_op;
++                      mcl[0].args[0] = EVTCHNOP_unmask;
++                      mcl[0].args[1] = (unsigned long)&unmask;
++                      mcl[1].op = __HYPERVISOR_physdev_op;
++                      mcl[1].args[0] = PHYSDEVOP_eoi;
++                      mcl[1].args[1] = (unsigned long)&eoi;
++
++                      if (HYPERVISOR_multicall(mcl, 2))
++                              BUG();
++              } else {
++                      unmask_evtchn(evtchn);
++                      VOID(HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi));
++              }
++      } else
++              unmask_evtchn(evtchn);
++}
++
++/*
++ * Refresh the pirq_needs_eoi bit for @irq from Xen's IRQ status query.
++ * Skipped entirely when pirq_eoi_does_unmask is set.  A failed query is
++ * treated as "no flags set".
++ */
++static inline void pirq_query_unmask(int irq)
++{
++      struct physdev_irq_status_query query;
++
++      if (pirq_eoi_does_unmask)
++              return;
++
++      query.irq = evtchn_get_xen_pirq(irq);
++      if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &query))
++              query.flags = 0;
++
++      /* Clear first, then set again only if Xen asks for an EOI. */
++      clear_bit(irq - PIRQ_BASE, pirq_needs_eoi);
++      if (query.flags & XENIRQSTAT_needs_eoi)
++              set_bit(irq - PIRQ_BASE, pirq_needs_eoi);
++}
++
++/*
++ * irq_chip set_type hook.  Only IRQ_TYPE_PROBE is accepted; it marks the
++ * PIRQ in probing_pirq.  Any other type yields -EINVAL.
++ */
++static int set_type_pirq(struct irq_data *data, unsigned int type)
++{
++      if (type == IRQ_TYPE_PROBE) {
++              set_bit(data->irq - PIRQ_BASE, probing_pirq);
++              return 0;
++      }
++
++      return -EINVAL;
++}
++
++static void enable_pirq(struct irq_data *data)
++{     /*
++       * irq_chip enable hook for physical IRQs.  If the IRQ already has a
++       * valid event channel it is only unmasked (after clearing any probe
++       * mark).  Otherwise the Xen pirq is bound to a new event channel via
++       * EVTCHNOP_bind_pirq, the mapping is recorded, the port is routed to
++       * CPU 0 and then unmasked.  A failed bind returns without unmasking.
++       */
++      struct evtchn_bind_pirq bind_pirq;
++      unsigned int irq = data->irq;
++      struct irq_cfg *cfg = irq_data_cfg(data);
++      int evtchn = evtchn_from_irq_cfg(cfg);
++      unsigned int pirq = irq - PIRQ_BASE;
++
++      if (VALID_EVTCHN(evtchn)) {
++              if (pirq < nr_pirqs)
++                      clear_bit(pirq, probing_pirq);
++              goto out;
++      }
++
++      bind_pirq.pirq = evtchn_get_xen_pirq(irq);
++      /*
++       * NB. We are happy to share unless we are probing.  "||" binds
++       * tighter than "?:", so flags is 0 when the probing bit was set or
++       * the descriptor has IRQS_AUTODETECT, and BIND_PIRQ__WILL_SHARE
++       * otherwise.
++       */
++      bind_pirq.flags = (pirq < nr_pirqs
++                         && test_and_clear_bit(pirq, probing_pirq))
++                        || (irq_to_desc(irq)->istate & IRQS_AUTODETECT)
++                        ? 0 : BIND_PIRQ__WILL_SHARE;
++      if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
++              if (bind_pirq.flags) /* log only when sharing was allowed */
++                      pr_info("Failed to obtain physical IRQ %d\n", irq);
++              return;
++      }
++      evtchn = bind_pirq.port;
++
++      pirq_query_unmask(irq);
++
++      evtchn_to_irq[evtchn] = irq;
++      bind_evtchn_to_cpu(evtchn, 0);
++      cfg->info = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn);
++
++ out:
++      pirq_unmask_and_notify(evtchn, irq);
++}
++
++#define disable_pirq mask_pirq /* disabling a PIRQ only masks it */
++
++/* irq_chip startup hook: starting a PIRQ is the same as enabling it. */
++static unsigned int startup_pirq(struct irq_data *data)
++{
++      enable_pirq(data);
++
++      return 0;
++}
++
++/*
++ * irq_chip shutdown hook for physical IRQs: mask and close the event
++ * channel, drop the port-to-IRQ mapping, and keep only the Xen pirq index
++ * in the IRQ's info word.  Does nothing if no event channel is bound.
++ */
++static void shutdown_pirq(struct irq_data *data)
++{
++      struct irq_cfg *cfg = irq_data_cfg(data);
++      int port = evtchn_from_irq_cfg(cfg);
++
++      if (VALID_EVTCHN(port)) {
++              mask_evtchn(port);
++
++              if (close_evtchn(port))
++                      BUG();
++
++              bind_evtchn_to_cpu(port, 0);
++              evtchn_to_irq[port] = -1;
++              cfg->info = mk_irq_info(IRQT_PIRQ,
++                                      index_from_irq_cfg(cfg), 0);
++      }
++}
++
++/*
++ * Unmask the PIRQ behind @data (notifying Xen where required), provided it
++ * currently has a valid event channel.
++ */
++static void unmask_pirq(struct irq_data *data)
++{
++      int port = evtchn_from_irq_data(data);
++
++      if (!VALID_EVTCHN(port))
++              return;
++
++      pirq_unmask_and_notify(port, data->irq);
++}
++
++#define mask_pirq mask_dynirq /* PIRQs are masked the same way as dynamic IRQs */
++
++/*
++ * irq_chip EOI hook for physical IRQs:
++ *  - not disabled:         finish any pending migration, then unmask;
++ *  - disabled and pending: shut the PIRQ down;
++ *  - disabled otherwise:   just unmask.
++ */
++static void end_pirq(struct irq_data *data)
++{
++      if (!irqd_irq_disabled(data)) {
++              irq_move_masked_irq(data);
++              unmask_pirq(data);
++              return;
++      }
++
++      if (irq_to_desc(data->irq)->istate & IRQS_PENDING)
++              shutdown_pirq(data);
++      else
++              unmask_pirq(data);
++}
++
++static struct irq_chip pirq_chip = {    /*
++       * irq_chip for physical IRQs.  Unlike dynirq_chip, startup/enable
++       * bind the pirq to an event channel on demand and shutdown closes
++       * it; affinity and retrigger are shared with the dynamic IRQ chip.
++       */
++      .name             = "Phys",
++      .irq_startup      = startup_pirq,
++      .irq_shutdown     = shutdown_pirq,
++      .irq_enable       = enable_pirq,
++      .irq_disable      = disable_pirq,
++      .irq_mask         = mask_pirq,
++      .irq_unmask       = unmask_pirq,
++      .irq_eoi          = end_pirq,
++      .irq_set_type     = set_type_pirq,
++#ifdef CONFIG_SMP
++      .irq_set_affinity = set_affinity_irq,
++#endif
++      .irq_retrigger    = resend_irq_on_evtchn,
++};
++
++/*
++ * Ask the hypervisor whether @irq is flagged as shared.
++ *
++ * Returns 1 when running on Xen and the status query reports
++ * XENIRQSTAT_shared; returns 0 otherwise, including when the query fails.
++ */
++int irq_ignore_unhandled(unsigned int irq)
++{
++      struct physdev_irq_status_query query = { .irq = irq };
++
++      if (!is_running_on_xen()
++          || HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &query))
++              return 0;
++
++      return (query.flags & XENIRQSTAT_shared) != 0;
++}
++
++#if defined(CONFIG_SMP) && !defined(PER_CPU_IPI_IRQ)
++/*
++ * Send IPI @ipi to @cpu.
++ *
++ * Where NMI_VECTOR is defined, an NMI request goes through VCPUOP_send_nmi;
++ * a warning is printed for the first failure only.  Any other IPI is marked
++ * in the target CPU's ipi_pending bitmap and, if it was not already marked
++ * and the CPU's IPI event channel is valid and not already pending, the
++ * channel is notified.
++ */
++void notify_remote_via_ipi(unsigned int ipi, unsigned int cpu)
++{
++      int port = per_cpu(ipi_evtchn, cpu);
++
++#ifdef NMI_VECTOR
++      if (ipi == NMI_VECTOR) {
++              static int __read_mostly printed;
++              int rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, cpu, NULL);
++
++              if (rc && !printed) {
++                      /* Remember the error so we only warn once. */
++                      printed = rc;
++                      pr_warn("Unable (%d) to send NMI to CPU#%u\n",
++                              rc, cpu);
++              }
++              return;
++      }
++#endif
++
++      if (!VALID_EVTCHN(port))
++              return;
++      if (test_and_set_bit(ipi, per_cpu(ipi_pending, cpu)))
++              return;
++      if (!test_evtchn(port))
++              notify_remote_via_evtchn(port);
++}
++
++/*
++ * Clear the current CPU's IPI event channel.  It is a bug for that channel
++ * not to be valid.
++ */
++void clear_ipi_evtchn(void)
++{
++      int port = percpu_read(ipi_evtchn);
++
++      BUG_ON(!VALID_EVTCHN(port));
++
++      clear_evtchn(port);
++}
++#endif
++
++/*
++ * Notify the remote end of the event channel bound to @irq.
++ *
++ * Warns once and returns if @irq has no configuration.  VIRQ-type IRQs (and
++ * whatever BUG_IF_IPI() rejects) are a bug here.  No notification is sent
++ * when the IRQ has no valid event channel.
++ */
++void notify_remote_via_irq(int irq)
++{
++      const struct irq_cfg *cfg = irq_cfg(irq);
++      int port;
++
++      if (WARN_ON_ONCE(!cfg))
++              return;
++
++      BUG_ON(type_from_irq_cfg(cfg) == IRQT_VIRQ);
++      BUG_IF_IPI(cfg);
++
++      port = evtchn_from_irq_cfg(cfg);
++      if (!VALID_EVTCHN(port))
++              return;
++
++      notify_remote_via_evtchn(port);
++}
++EXPORT_SYMBOL_GPL(notify_remote_via_irq);
++
++#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
++/*
++ * Multicall variant of notify_remote_via_irq(): fills @mcl with the
++ * notification for the event channel bound to @irq.
++ *
++ * Returns 0 on success, or -EINVAL when @irq has no configuration (after a
++ * one-time warning) or no valid event channel.
++ */
++int multi_notify_remote_via_irq(multicall_entry_t *mcl, int irq)
++{
++      const struct irq_cfg *cfg = irq_cfg(irq);
++      int port;
++
++      if (WARN_ON_ONCE(!cfg))
++              return -EINVAL;
++
++      BUG_ON(type_from_irq_cfg(cfg) == IRQT_VIRQ);
++      BUG_IF_IPI(cfg);
++
++      port = evtchn_from_irq_cfg(cfg);
++      if (VALID_EVTCHN(port)) {
++              multi_notify_remote_via_evtchn(mcl, port);
++              return 0;
++      }
++
++      return -EINVAL;
++}
++EXPORT_SYMBOL_GPL(multi_notify_remote_via_irq);
++#endif
++
++/*
++ * Return the event channel recorded for @irq, or 0 when @irq has no
++ * configuration.
++ */
++int irq_to_evtchn_port(int irq)
++{
++      const struct irq_cfg *cfg = irq_cfg(irq);
++
++      if (cfg) {
++              BUG_IF_VIRQ_PER_CPU(cfg);
++              BUG_IF_IPI(cfg);
++              return evtchn_from_irq_cfg(cfg);
++      }
++
++      return 0;
++}
++EXPORT_SYMBOL_GPL(irq_to_evtchn_port);
++
++/* Set the mask bit of event channel @port in the shared info page. */
++void mask_evtchn(int port)
++{
++      synch_set_bit(port, HYPERVISOR_shared_info->evtchn_mask);
++}
++EXPORT_SYMBOL_GPL(mask_evtchn);
++
++/*
++ * Unmask event channel @port.  Must be called with interrupts disabled.
++ *
++ * A port bound to another CPU is unmasked through the EVTCHNOP_unmask
++ * hypercall.  A local port has its mask bit cleared directly; if the port is
++ * pending at that point, its selector bit is set for the current vCPU and,
++ * when that bit was previously clear, the upcall-pending flag is raised.
++ */
++void unmask_evtchn(int port)
++{
++      shared_info_t *shinfo = HYPERVISOR_shared_info;
++      unsigned int this_cpu = smp_processor_id();
++      vcpu_info_t *vcpu;
++
++      BUG_ON(!irqs_disabled());
++
++      if (unlikely(this_cpu != cpu_from_evtchn(port))) {
++              /* Non-local port: slow path through the hypervisor. */
++              struct evtchn_unmask op = { .port = port };
++
++              VOID(HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &op));
++              return;
++      }
++
++      synch_clear_bit(port, shinfo->evtchn_mask);
++
++      /* Nothing more to do unless an 'edge' arrived while masked. */
++      if (!synch_test_bit(port, shinfo->evtchn_pending))
++              return;
++
++      vcpu = current_vcpu_info();
++      if (!synch_test_and_set_bit(port / BITS_PER_LONG,
++                                  &vcpu->evtchn_pending_sel))
++              vcpu->evtchn_upcall_pending = 1;
++}
++EXPORT_SYMBOL_GPL(unmask_evtchn);
++
++/* Mask every event channel that is bound to the calling CPU. */
++void disable_all_local_evtchn(void)
++{
++      unsigned int this_cpu = smp_processor_id();
++      shared_info_t *shinfo = HYPERVISOR_shared_info;
++      unsigned int port;
++
++      for (port = 0; port < NR_EVENT_CHANNELS; ++port) {
++              if (cpu_from_evtchn(port) != this_cpu)
++                      continue;
++              synch_set_bit(port, &shinfo->evtchn_mask[0]);
++      }
++}
++
++/*
++ * Test an irq's pending state: returns 1 if @irq has a valid event channel
++ * that tests as pending, 0 otherwise.
++ */
++int xen_test_irq_pending(int irq)
++{
++      int port = evtchn_from_irq(irq);
++
++      if (!VALID_EVTCHN(port))
++              return 0;
++
++      return test_evtchn(port) ? 1 : 0;
++}
++
++#ifdef CONFIG_PM_SLEEP
++#include <linux/syscore_ops.h>
++
++/*
++ * Re-bind every VIRQ for which @cpu has an IRQ recorded.
++ *
++ * Each such VIRQ gets a fresh event channel from Xen (failure is fatal); the
++ * evtchn -> irq table and the per-IRQ / per-CPU bookkeeping are updated, the
++ * channel is bound to @cpu and finally unmasked.
++ */
++static void restore_cpu_virqs(unsigned int cpu)
++{
++      struct evtchn_bind_virq bind_virq;
++      int virq;
++
++      for (virq = 0; virq < NR_VIRQS; virq++) {
++              int irq = per_cpu(virq_to_irq, cpu)[virq];
++              int port;
++
++              if (irq == -1)
++                      continue;
++
++#ifndef PER_CPU_VIRQ_IRQ
++              /* Skip per-CPU VIRQs with no valid channel recorded here. */
++              if (test_bit(virq, virq_per_cpu)
++                  && !VALID_EVTCHN(per_cpu(virq_to_evtchn, cpu)[virq]))
++                      continue;
++#endif
++
++              BUG_ON(irq_cfg(irq)->info != mk_irq_info(IRQT_VIRQ, virq, 0));
++
++              /* Ask Xen for a fresh binding. */
++              bind_virq.virq = virq;
++              bind_virq.vcpu = cpu;
++              if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
++                                              &bind_virq) != 0)
++                      BUG();
++              port = bind_virq.port;
++
++              /* Remember the new mapping. */
++              evtchn_to_irq[port] = irq;
++#ifdef PER_CPU_VIRQ_IRQ
++              irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq, port);
++#else
++              if (!test_bit(virq, virq_per_cpu)) {
++                      unsigned int each;
++
++                      /* Shared VIRQ: every CPU sees the same channel. */
++                      irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq,
++                                                       port);
++                      for_each_possible_cpu(each)
++                              per_cpu(virq_to_evtchn, each)[virq] = port;
++              } else {
++                      per_cpu(virq_to_evtchn, cpu)[virq] = port;
++              }
++#endif
++              bind_evtchn_to_cpu(port, cpu);
++
++              /* Ready for use. */
++              unmask_evtchn(port);
++      }
++}
++
++/*
++ * Re-bind the IPI event channel(s) of @cpu.  Compiles to an empty function
++ * on non-SMP builds.
++ *
++ * With PER_CPU_IPI_IRQ the body below runs once per IPI that has an IRQ
++ * recorded for @cpu.  Without it, "ipi" and "irq" are temporarily defined as
++ * macros (0 and ipi_irq) so the same body runs exactly once, and the
++ * function returns early if ipi_irq is unset or @cpu has no valid IPI event
++ * channel recorded.
++ */
++static void restore_cpu_ipis(unsigned int cpu)
++{
++#ifdef CONFIG_SMP
++      struct evtchn_bind_ipi bind_ipi;
++      struct irq_data *data;
++      int evtchn;
++#ifdef PER_CPU_IPI_IRQ
++      int ipi, irq;
++
++      for (ipi = 0; ipi < NR_IPIS; ipi++) {
++              if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
++                      continue;
++#else
++#define ipi 0
++#define irq ipi_irq
++              if (irq == -1
++                  || !VALID_EVTCHN(per_cpu(ipi_evtchn, cpu)))
++                      return;
++#endif
++
++              data = irq_get_irq_data(irq);
++              BUG_ON(irq_data_cfg(data)->info != mk_irq_info(IRQT_IPI, ipi, 0));
++
++              /* Get a new binding from Xen. */
++              bind_ipi.vcpu = cpu;
++              if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
++                                              &bind_ipi) != 0)
++                      BUG();
++              evtchn = bind_ipi.port;
++
++              /* Record the new mapping. */
++              evtchn_to_irq[evtchn] = irq;
++#ifdef PER_CPU_IPI_IRQ
++              irq_data_cfg(data)->info = mk_irq_info(IRQT_IPI, ipi, evtchn);
++#else
++              per_cpu(ipi_evtchn, cpu) = evtchn;
++#endif
++              bind_evtchn_to_cpu(evtchn, cpu);
++
++              /* Ready for use. */
++              /* A disabled IRQ keeps its new channel masked. */
++              if (!irqd_irq_disabled(data))
++                      unmask_evtchn(evtchn);
++#ifdef PER_CPU_IPI_IRQ
++      }
++#else
++#undef irq
++#undef ipi
++#endif
++#endif /* CONFIG_SMP */
++}
++
++/*
++ * syscore resume hook: rebuild all event-channel state.
++ *
++ * If the current CPU's timer VIRQ channel is still reported by Xen as bound
++ * to VIRQ_TIMER on this vCPU, nothing is done.  Otherwise the CPU bindings
++ * are re-initialised, the PIRQ EOI page is re-registered when it was in use,
++ * all event channels are masked, the event-channel part of every IRQ's info
++ * is cleared, the evtchn -> irq table is reset, and finally the VIRQs and
++ * IPIs of every possible CPU are re-bound.
++ */
++static void evtchn_resume(void)
++{
++      unsigned int cpu, irq, evtchn;
++      struct evtchn_status status;
++
++      /* Avoid doing anything in the 'suspend cancelled' case. */
++      status.dom = DOMID_SELF;
++#ifdef PER_CPU_VIRQ_IRQ
++      status.port = evtchn_from_irq(percpu_read(virq_to_irq[VIRQ_TIMER]));
++#else
++      status.port = percpu_read(virq_to_evtchn[VIRQ_TIMER]);
++#endif
++      if (HYPERVISOR_event_channel_op(EVTCHNOP_status, &status))
++              BUG();
++      if (status.status == EVTCHNSTAT_virq
++          && status.vcpu == smp_processor_id()
++          && status.u.virq == VIRQ_TIMER)
++              return;
++
++      init_evtchn_cpu_bindings();
++
++      /* Hand the EOI bitmap page back to Xen; failure here is fatal. */
++      if (pirq_eoi_does_unmask) {
++              struct physdev_pirq_eoi_gmfn eoi_gmfn;
++
++              eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
++              if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn))
++                      BUG();
++      }
++
++      /* New event-channel space is not 'live' yet. */
++      for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
++              mask_evtchn(evtchn);
++
++      /* No IRQ <-> event-channel mappings. */
++      for (irq = 0; irq < nr_irqs; irq++) {
++              struct irq_cfg *cfg = irq_cfg(irq);
++
++              if (!cfg)
++                      continue;
++
++              /* Check that no PIRQs are still bound. */
++#ifdef CONFIG_SPARSE_IRQ
++              if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
++                      BUG_ON(type_from_irq_cfg(cfg) == IRQT_PIRQ);
++              else
++#endif
++                      BUG_ON(cfg->info != IRQ_UNBOUND);
++
++              /* Drop only the event-channel bits; type and index stay. */
++              cfg->info &= ~((1U << _EVTCHN_BITS) - 1);
++      }
++      for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
++              evtchn_to_irq[evtchn] = -1;
++
++      for_each_possible_cpu(cpu) {
++              restore_cpu_virqs(cpu);
++              restore_cpu_ipis(cpu);
++      }
++}
++
++/* syscore operations for event channels: only a resume hook is provided. */
++static struct syscore_ops evtchn_syscore_ops = {
++      .resume = evtchn_resume,
++};
++
++/*
++ * Register the event-channel syscore operations, except in the initial Xen
++ * domain.  Always returns 0.
++ */
++static int __init evtchn_register(void)
++{
++      if (!is_initial_xendomain())
++              register_syscore_ops(&evtchn_syscore_ops);
++      return 0;
++}
++core_initcall(evtchn_register);
++#endif
++
++/*
++ * Attach the statically allocated _irq_cfg[] entries as chip data of the
++ * IRQs with the same index.  Always returns 0.
++ */
++int __init arch_early_irq_init(void)
++{
++      unsigned int irq;
++
++      for (irq = 0; irq < ARRAY_SIZE(_irq_cfg); irq++)
++              irq_set_chip_data(irq, &_irq_cfg[irq]);
++
++      return 0;
++}
++
++/*
++ * Allocate an IRQ descriptor at @at (on @node) and return its irq_cfg.
++ *
++ * If the descriptor already exists (-EEXIST) and has chip data, that chip
++ * data is returned.  Any other allocation error yields NULL.  With
++ * CONFIG_SPARSE_IRQ a zeroed irq_cfg is allocated and attached; without it
++ * the result of irq_cfg(@at) is returned.
++ */
++struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
++{
++      int res = irq_alloc_desc_at(at, node);
++      struct irq_cfg *cfg = NULL;
++
++      if (res < 0) {
++              if (res != -EEXIST)
++                      return NULL;
++              cfg = irq_get_chip_data(at);
++              if (cfg)
++                      return cfg;
++      }
++
++#ifdef CONFIG_SPARSE_IRQ
++      /* By default all event channels notify CPU#0. */
++      cpumask_copy(irq_get_irq_data(at)->affinity, cpumask_of(0));
++
++      /*
++       * NOTE(review): on kzalloc() failure the descriptor is freed even if
++       * it already existed before this call (-EEXIST path) - confirm this
++       * is intended.
++       */
++      cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
++      if (cfg)
++              irq_set_chip_data(at, cfg);
++      else
++              irq_free_desc(at);
++
++      return cfg;
++#else
++      return irq_cfg(at);
++#endif
++}
++
++#ifdef CONFIG_SPARSE_IRQ
++/* Size of the PIRQ range; starts at NR_PIRQS, arch_probe_nr_irqs() may lower it. */
++int nr_pirqs = NR_PIRQS;
++EXPORT_SYMBOL_GPL(nr_pirqs);
++
++/*
++ * Compute nr_pirqs and nr_irqs for this boot.
++ *
++ * nr_pirqs is capped at the number of GSI IRQs (legacy plus IO-APIC GSIs in
++ * the initial domain, NR_VECTORS otherwise).  The number of additional IRQs
++ * starts at 64 + CONFIG_XEN_NR_GUEST_DEVICES, grows when PCI MSI is enabled,
++ * and is capped at min(NR_DYNIRQS, NR_EVENT_CHANNELS).  nr_irqs is their sum,
++ * capped at PAGE_SIZE * 8.
++ *
++ * Returns the number of statically allocated _irq_cfg[] entries.
++ */
++int __init arch_probe_nr_irqs(void)
++{
++      int nr = 64 + CONFIG_XEN_NR_GUEST_DEVICES, nr_irqs_gsi;
++
++      if (is_initial_xendomain()) {
++              nr_irqs_gsi = NR_IRQS_LEGACY;
++#ifdef CONFIG_X86_IO_APIC
++              nr_irqs_gsi += gsi_top;
++#endif
++#ifdef CONFIG_PCI_MSI
++              nr += max(nr_irqs_gsi * 16, nr_cpu_ids * 8);
++#endif
++      } else {
++              nr_irqs_gsi = NR_VECTORS;
++#ifdef CONFIG_PCI_MSI
++              nr += max(NR_IRQS_LEGACY * 16, nr_cpu_ids * 8);
++#endif
++      }
++
++      if (nr_pirqs > nr_irqs_gsi)
++              nr_pirqs = nr_irqs_gsi;
++      if (nr > min_t(int, NR_DYNIRQS, NR_EVENT_CHANNELS))
++              nr = min_t(int, NR_DYNIRQS, NR_EVENT_CHANNELS);
++      nr_irqs = min_t(int, nr_pirqs + nr, PAGE_SIZE * 8);
++
++      printk(KERN_DEBUG "nr_pirqs: %d\n", nr_pirqs);
++
++      return ARRAY_SIZE(_irq_cfg);
++}
++#endif
++
++#if defined(CONFIG_X86_IO_APIC)
++/*
++ * Make sure @cfg has a vector for PIRQ @irq, asking the hypervisor to
++ * allocate one if none is recorded yet.  @mask is not used.
++ *
++ * Returns 0 on success, -EINVAL if @irq lies outside the PIRQ range, or
++ * -ENOSPC if the hypervisor refuses the allocation.
++ */
++int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
++{
++      struct physdev_irq op;
++
++      if (irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs)
++              return -EINVAL;
++
++      if (!cfg->vector) {
++              op.irq = irq;
++              if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &op))
++                      return -ENOSPC;
++
++              cfg->vector = op.vector;
++      }
++
++      return 0;
++}
++/* With an IO-APIC: identity mapped unless IO_APIC_IRQ() claims the IRQ. */
++#define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
++#elif defined(CONFIG_X86)
++/* Other x86 builds: only the legacy IRQ range is identity mapped. */
++#define identity_mapped_irq(irq) (((irq) - PIRQ_BASE) < NR_IRQS_LEGACY)
++#else
++/* Everything else: all IRQs are identity mapped. */
++#define identity_mapped_irq(irq) (1)
++#endif
++
++/*
++ * Set up PIRQ @irq (which must lie inside the PIRQ range) with pirq_chip and
++ * the fasteoi flow handler.  Identity-mapped IRQs and IRQs that are not
++ * currently unbound are left untouched.
++ */
++void evtchn_register_pirq(int irq)
++{
++      struct irq_cfg *cfg = irq_cfg(irq);
++
++      BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs);
++
++      if (identity_mapped_irq(irq))
++              return;
++      if (type_from_irq_cfg(cfg) != IRQT_UNBOUND)
++              return;
++
++      cfg->info = mk_irq_info(IRQT_PIRQ, irq, 0);
++      irq_set_chip_and_handler_name(irq, &pirq_chip, handle_fasteoi_irq,
++                                    "fasteoi");
++}
++
++#ifdef CONFIG_PCI_MSI
++/*
++ * Map, unmap or verify the association between @irq and Xen PIRQ @xen_pirq.
++ *
++ *  - @irq < 0: pick a free IRQ and bind it to @xen_pirq.  With
++ *    CONFIG_SPARSE_IRQ an unbound IRQ is searched under
++ *    irq_mapping_update_lock; otherwise the PIRQ range is scanned downwards
++ *    for a non-identity-mapped IRQ whose index is still 0.
++ *  - CONFIG_SPARSE_IRQ only: an @irq inside the PIRQ range is rejected with
++ *    a one-time warning.
++ *  - @xen_pirq == 0: unmap @irq, which must currently be of type IRQT_PIRQ.
++ *  - otherwise: @irq must already be a PIRQ with index @xen_pirq.
++ *
++ * Returns the IRQ number when mapping or verifying succeeds, 0 on a
++ * successful unmap, or a negative errno (-EINVAL, -ENOMEM, -ENOSPC, or the
++ * error from find_unbound_irq()).
++ */
++int evtchn_map_pirq(int irq, int xen_pirq)
++{
++      if (irq < 0) {
++#ifdef CONFIG_SPARSE_IRQ
++              struct irq_cfg *cfg;
++
++              spin_lock(&irq_mapping_update_lock);
++              irq = find_unbound_irq(numa_node_id(), &cfg, &pirq_chip,
++                                     false);
++              if (irq >= 0) {
++                      BUG_ON(type_from_irq_cfg(cfg) != IRQT_UNBOUND);
++                      cfg->bindcount++;
++                      cfg->info = mk_irq_info(IRQT_PIRQ, xen_pirq, 0);
++              }
++              spin_unlock(&irq_mapping_update_lock);
++              if (irq < 0)
++                      return irq;
++      } else if (irq >= PIRQ_BASE && irq < PIRQ_BASE + nr_pirqs) {
++              WARN_ONCE(1, "Non-MSI IRQ#%d (Xen %d)\n", irq, xen_pirq);
++              return -EINVAL;
++#else
++              static DEFINE_SPINLOCK(irq_alloc_lock);
++
++              /* Scan from the top of the PIRQ range downwards. */
++              irq = PIRQ_BASE + nr_pirqs - 1;
++              spin_lock(&irq_alloc_lock);
++              do {
++                      struct irq_cfg *cfg;
++
++                      if (identity_mapped_irq(irq))
++                              continue;
++                      cfg = alloc_irq_and_cfg_at(irq, numa_node_id());
++                      if (unlikely(!cfg)) {
++                              spin_unlock(&irq_alloc_lock);
++                              return -ENOMEM;
++                      }
++                      /* An index of 0 marks this IRQ as not yet taken. */
++                      if (!index_from_irq_cfg(cfg)) {
++                              BUG_ON(type_from_irq_cfg(cfg) != IRQT_UNBOUND);
++                              cfg->info = mk_irq_info(IRQT_PIRQ,
++                                                      xen_pirq, 0);
++                              break;
++                      }
++              } while (--irq >= PIRQ_BASE);
++              spin_unlock(&irq_alloc_lock);
++              if (irq < PIRQ_BASE)
++                      return -ENOSPC;
++              irq_set_chip_and_handler_name(irq, &pirq_chip,
++                                            handle_fasteoi_irq, "fasteoi");
++#endif
++      } else if (!xen_pirq) {
++              struct irq_cfg *cfg = irq_cfg(irq);
++
++              if (!cfg || unlikely(type_from_irq_cfg(cfg) != IRQT_PIRQ))
++                      return -EINVAL;
++              /*
++               * dynamic_irq_cleanup(irq) would seem to be the correct thing
++               * here, but cannot be used as we get here also during shutdown
++               * when a driver didn't free_irq() its MSI(-X) IRQ(s), which
++               * then causes a warning in dynamic_irq_cleanup().
++               */
++              irq_set_chip_and_handler(irq, NULL, NULL);
++              cfg->info = IRQ_UNBOUND;
++#ifdef CONFIG_SPARSE_IRQ
++              cfg->bindcount--;
++#endif
++              return 0;
++      } else if (type_from_irq(irq) != IRQT_PIRQ
++                 || index_from_irq(irq) != xen_pirq) {
++              pr_err("IRQ#%d is already mapped to %d:%u - "
++                     "cannot map to PIRQ#%u\n",
++                     irq, type_from_irq(irq), index_from_irq(irq), xen_pirq);
++              return -EINVAL;
++      }
++      /* A mapping whose index is 0 is reported as invalid. */
++      return index_from_irq(irq) ? irq : -EINVAL;
++}
++#endif
++
++/*
++ * Translate @irq to its Xen PIRQ number: identity-mapped IRQs map to
++ * themselves; any other IRQ must be of type IRQT_PIRQ and yields its
++ * recorded index.
++ */
++int evtchn_get_xen_pirq(int irq)
++{
++      struct irq_cfg *cfg = irq_cfg(irq);
++
++      if (!identity_mapped_irq(irq)) {
++              BUG_ON(type_from_irq_cfg(cfg) != IRQT_PIRQ);
++              return index_from_irq_cfg(cfg);
++      }
++
++      return irq;
++}
++
++/*
++ * Boot-time initialisation of the event-channel IRQ layer.
++ *
++ * Marks the per-CPU VIRQs (unless PER_CPU_VIRQ_IRQ), initialises the CPU
++ * bindings, allocates the pirq_needs_eoi bitmap and offers it to Xen, masks
++ * every event channel, and installs the irq_chips: dynirq_chip for the
++ * dynamic range (non-sparse builds only) and pirq_chip for identity-mapped
++ * PIRQs.
++ */
++void __init xen_init_IRQ(void)
++{
++      unsigned int i;
++      struct physdev_pirq_eoi_gmfn eoi_gmfn;
++
++#ifndef PER_CPU_VIRQ_IRQ
++      __set_bit(VIRQ_TIMER, virq_per_cpu);
++      __set_bit(VIRQ_DEBUG, virq_per_cpu);
++      __set_bit(VIRQ_XENOPROF, virq_per_cpu);
++#ifdef CONFIG_IA64
++      __set_bit(VIRQ_ITC, virq_per_cpu);
++#endif
++#endif
++
++      init_evtchn_cpu_bindings();
++
++      /* Size the EOI bitmap: one bit per IRQ (sparse) or per PIRQ. */
++#ifdef CONFIG_SPARSE_IRQ
++      i = nr_irqs;
++#else
++      i = nr_pirqs;
++#endif
++      i = get_order(sizeof(unsigned long) * BITS_TO_LONGS(i));
++      /*
++       * NOTE(review): the allocation result is not checked before it is
++       * passed to virt_to_machine() - confirm failure cannot happen here.
++       */
++      pirq_needs_eoi = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, i);
++      BUILD_BUG_ON(NR_PIRQS > PAGE_SIZE * 8);
++      eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
++      /* Only rely on EOI-unmasks-the-channel if Xen accepted the page. */
++      if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
++              pirq_eoi_does_unmask = true;
++
++      /* No event channels are 'live' right now. */
++      for (i = 0; i < NR_EVENT_CHANNELS; i++)
++              mask_evtchn(i);
++
++#ifndef CONFIG_SPARSE_IRQ
++      for (i = DYNIRQ_BASE; i < (DYNIRQ_BASE + NR_DYNIRQS); i++) {
++              irq_set_noprobe(i);
++              irq_set_chip_and_handler_name(i, &dynirq_chip,
++                                            handle_fasteoi_irq, "fasteoi");
++      }
++
++      for (i = PIRQ_BASE; i < (PIRQ_BASE + nr_pirqs); i++) {
++#else
++      for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_IRQS_LEGACY); i++) {
++#endif
++              if (!identity_mapped_irq(i))
++                      continue;
++
++#ifdef RTC_IRQ
++              /* If not domain 0, force our RTC driver to fail its probe. */
++              if (i - PIRQ_BASE == RTC_IRQ && !is_initial_xendomain())
++                      continue;
++#endif
++
++              irq_set_chip_and_handler_name(i, &pirq_chip,
++                                            handle_fasteoi_irq, "fasteoi");
++      }
++}
diff --cc drivers/xen/core/firmware.c

index 0000000,0000000..2f851ee

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/firmware.c
@@@ -1,0 -1,0 +1,75 @@@
++#include <linux/kernel.h>
++#include <linux/string.h>
++#include <linux/errno.h>
++#include <linux/init.h>
++#include <linux/edd.h>
++#include <video/edid.h>
++#include <xen/interface/platform.h>
++#include <asm/hypervisor.h>
++
++#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
++/*
++ * Fill the global "edd" structure from firmware information provided by the
++ * hypervisor.  Only done in the initial Xen domain.
++ *
++ * Disk info records are fetched by increasing index until the hypercall
++ * fails or EDDMAXNR entries are stored; MBR signatures are then fetched the
++ * same way, up to EDD_MBR_SIG_MAX entries.
++ */
++void __init copy_edd(void)
++{
++      struct xen_platform_op op;
++
++      if (!is_initial_xendomain())
++              return;
++
++      op.cmd = XENPF_firmware_info;
++
++      /* Pass 1: per-disk EDD information. */
++      op.u.firmware_info.type = XEN_FW_DISK_INFO;
++      op.u.firmware_info.index = 0;
++      while (edd.edd_info_nr < EDDMAXNR) {
++              struct edd_info *info = &edd.edd_info[edd.edd_info_nr];
++
++              info->params.length = sizeof(info->params);
++              set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
++                                   &info->params);
++              if (HYPERVISOR_platform_op(&op))
++                      break;
++
++              info->device = op.u.firmware_info.u.disk_info.device;
++              info->version = op.u.firmware_info.u.disk_info.version;
++              info->interface_support =
++                      op.u.firmware_info.u.disk_info.interface_support;
++              info->legacy_max_cylinder =
++                      op.u.firmware_info.u.disk_info.legacy_max_cylinder;
++              info->legacy_max_head =
++                      op.u.firmware_info.u.disk_info.legacy_max_head;
++              info->legacy_sectors_per_track =
++                      op.u.firmware_info.u.disk_info.legacy_sectors_per_track;
++
++              edd.edd_info_nr++;
++              op.u.firmware_info.index++;
++      }
++
++      /* Pass 2: MBR signatures. */
++      op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
++      op.u.firmware_info.index = 0;
++      while (edd.mbr_signature_nr < EDD_MBR_SIG_MAX) {
++              if (HYPERVISOR_platform_op(&op))
++                      break;
++
++              edd.mbr_signature[edd.mbr_signature_nr++] =
++                      op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
++              op.u.firmware_info.index++;
++      }
++}
++#endif
++
++/*
++ * Fetch the VBE DDC EDID block from the hypervisor into edid_info.dummy.
++ * Only done in the initial Xen domain, and only compiled in with
++ * CONFIG_FIRMWARE_EDID on x86; otherwise this is an empty function.
++ */
++void __init copy_edid(void)
++{
++#if defined(CONFIG_FIRMWARE_EDID) && defined(CONFIG_X86)
++      struct xen_platform_op op;
++
++      if (!is_initial_xendomain())
++              return;
++
++      op.cmd = XENPF_firmware_info;
++      op.u.firmware_info.index = 0;
++      op.u.firmware_info.type = XEN_FW_VBEDDC_INFO;
++      set_xen_guest_handle(op.u.firmware_info.u.vbeddc_info.edid,
++                           edid_info.dummy);
++      /*
++       * On failure the buffer is filled with 0x13 bytes - presumably the
++       * "no EDID available" marker expected by consumers; TODO confirm.
++       */
++      if (HYPERVISOR_platform_op(&op) != 0)
++              memset(edid_info.dummy, 0x13, sizeof(edid_info.dummy));
++#endif
++}
diff --cc drivers/xen/core/gnttab.c

index 0000000,0000000..8cc030f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/gnttab.c
@@@ -1,0 -1,0 +1,890 @@@
++/******************************************************************************
++ * gnttab.c
++ *
++ * Granting foreign access to our memory reservation.
++ *
++ * Copyright (c) 2005-2006, Christopher Clark
++ * Copyright (c) 2004-2005, K A Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/seqlock.h>
++#include <xen/interface/xen.h>
++#include <xen/gnttab.h>
++#include <asm/pgtable.h>
++#include <asm/uaccess.h>
++#include <asm/synch_bitops.h>
++#include <asm/io.h>
++#include <xen/interface/memory.h>
++#include <asm/gnttab_dma.h>
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++/* External tools reserve first few grant table entries. */
++#define NR_RESERVED_ENTRIES 8
++/* Terminator value for chains of grant references. */
++#define GNTTAB_LIST_END 0xffffffff
++/* Number of grant entries that fit in one page. */
++#define ENTRIES_PER_GRANT_FRAME (PAGE_SIZE / sizeof(grant_entry_t))
++
++/* Pages of free-list links, indexed through gnttab_entry(). */
++static grant_ref_t **gnttab_list;
++static unsigned int nr_grant_frames;
++static unsigned int boot_max_nr_grant_frames;
++/* Length and head of the free list of grant references. */
++static int gnttab_free_count;
++static grant_ref_t gnttab_free_head;
++/* Taken around free-list and free-callback-list updates in this file. */
++static DEFINE_SPINLOCK(gnttab_list_lock);
++
++/* The grant entry array, indexed by grant reference. */
++static struct grant_entry *shared;
++
++/* Callbacks waiting for enough free grant references. */
++static struct gnttab_free_callback *gnttab_free_callback_list;
++
++static int gnttab_expand(unsigned int req_entries);
++
++/* RPP: number of grant_ref_t links per page of gnttab_list. */
++#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
++/* The free-list link slot belonging to grant reference "entry". */
++#define gnttab_entry(entry) (gnttab_list[(entry) / RPP][(entry) % RPP])
++
++/* Pages of links needed to cover "grant_frames" grant frames, rounded up. */
++#define nr_freelist_frames(grant_frames)                              \
++      (((grant_frames) * ENTRIES_PER_GRANT_FRAME + RPP - 1) / RPP)
++
++/*
++ * Detach @count grant references from the head of the free list, expanding
++ * the grant table first if too few are free.
++ *
++ * Returns the first reference of the detached chain (terminated with
++ * GNTTAB_LIST_END), or the negative error from gnttab_expand().
++ */
++static int get_free_entries(int count)
++{
++      unsigned long flags;
++      grant_ref_t tail;
++      int result;
++
++      spin_lock_irqsave(&gnttab_list_lock, flags);
++
++      if (gnttab_free_count < count) {
++              result = gnttab_expand(count - gnttab_free_count);
++              if (result < 0)
++                      goto unlock;
++      }
++
++      tail = gnttab_free_head;
++      result = tail;
++      gnttab_free_count -= count;
++
++      /* Walk to the last entry of the chain being handed out. */
++      for (; count > 1; count--)
++              tail = gnttab_entry(tail);
++
++      gnttab_free_head = gnttab_entry(tail);
++      gnttab_entry(tail) = GNTTAB_LIST_END;
++
++unlock:
++      spin_unlock_irqrestore(&gnttab_list_lock, flags);
++
++      return result;
++}
++
++#define get_free_entry() get_free_entries(1)
++
++/*
++ * Run every queued free callback whose requested count is now satisfied by
++ * gnttab_free_count; callbacks that still have to wait are put back on the
++ * list.
++ */
++static void do_free_callbacks(void)
++{
++      struct gnttab_free_callback *cb = gnttab_free_callback_list;
++      struct gnttab_free_callback *pending;
++
++      gnttab_free_callback_list = NULL;
++
++      for (; cb != NULL; cb = pending) {
++              pending = cb->next;
++
++              if (gnttab_free_count < cb->count) {
++                      /* Not enough entries yet: re-queue. */
++                      cb->next = gnttab_free_callback_list;
++                      gnttab_free_callback_list = cb;
++                      continue;
++              }
++
++              cb->next = NULL;
++              cb->queued = 0;
++              cb->fn(cb->arg);
++      }
++}
++
++/* Process the free-callback list, if it is non-empty. */
++static inline void check_free_callbacks(void)
++{
++      if (unlikely(gnttab_free_callback_list))
++              do_free_callbacks();
++}
++
++/*
++ * Return grant reference @ref to the head of the free list and give waiting
++ * callbacks a chance to run.
++ */
++static void put_free_entry(grant_ref_t ref)
++{
++      unsigned long flags;
++      spin_lock_irqsave(&gnttab_list_lock, flags);
++      gnttab_entry(ref) = gnttab_free_head;
++      gnttab_free_head = ref;
++      gnttab_free_count++;
++      check_free_callbacks();
++      spin_unlock_irqrestore(&gnttab_list_lock, flags);
++}
++
++/*
++ * Public grant-issuing interface functions
++ */
++
++/*
++ * Allocate a free grant reference and use it to grant @domid access to
++ * @frame.  @flags may add grant flags but must not contain
++ * GTF_accept_transfer, GTF_reading or GTF_writing (that is a bug).
++ *
++ * Returns the new grant reference, or -ENOSPC if none could be allocated.
++ */
++int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
++                              int flags)
++{
++      int ref = get_free_entry();
++
++      if (unlikely(ref < 0))
++              return -ENOSPC;
++
++      /*
++       * Publish the entry through the shared helper rather than repeating
++       * its write sequence here, the same way
++       * gnttab_grant_foreign_transfer() relies on its _ref counterpart.
++       */
++      gnttab_grant_foreign_access_ref(ref, domid, frame, flags);
++
++      return ref;
++}
++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
++
++/*
++ * Fill grant entry @ref so that @domid may access @frame.  The frame and
++ * domain id are written first, followed by a write barrier, and only then
++ * the flags (GTF_permit_access | @flags).  @flags must not contain
++ * GTF_accept_transfer, GTF_reading or GTF_writing.
++ */
++void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
++                                   unsigned long frame, int flags)
++{
++      struct grant_entry *entry = &shared[ref];
++
++      entry->frame = frame;
++      entry->domid = domid;
++      wmb();
++      BUG_ON(flags & (GTF_accept_transfer | GTF_reading | GTF_writing));
++      entry->flags = GTF_permit_access | flags;
++}
++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
++
++
++/*
++ * Return the GTF_reading / GTF_writing bits currently set in grant entry
++ * @ref (non-zero means the grant is in use).
++ */
++int gnttab_query_foreign_access(grant_ref_t ref)
++{
++      const u16 busy = shared[ref].flags;
++
++      return busy & (GTF_reading|GTF_writing);
++}
++EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
++
++/*
++ * Try to revoke the access granted through @ref by atomically clearing the
++ * entry's flags.
++ *
++ * Returns 1 on success, or 0 (after a debug message) if the entry is still
++ * marked as being read or written.
++ */
++int gnttab_end_foreign_access_ref(grant_ref_t ref)
++{
++      u16 seen = shared[ref].flags;
++
++      for (;;) {
++              u16 expected = seen;
++
++              if (expected & (GTF_reading|GTF_writing)) {
++                      printk(KERN_DEBUG "WARNING: g.e. still in use!\n");
++                      return 0;
++              }
++
++              /* Retry with the value actually found if we lost a race. */
++              seen = synch_cmpxchg_subword(&shared[ref].flags, expected, 0);
++              if (seen == expected)
++                      return 1;
++      }
++}
++EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
++
++/*
++ * End the access granted through @ref.  On success the reference is returned
++ * to the free list and @page (if non-zero) is freed; if the grant is still
++ * in use, both are leaked and a debug message is printed.
++ */
++void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page)
++{
++      if (!gnttab_end_foreign_access_ref(ref)) {
++              /* XXX This needs to be fixed so that the ref and page are
++                 placed on a list to be freed up later. */
++              printk(KERN_DEBUG
++                     "WARNING: leaking g.e. and page still in use!\n");
++              return;
++      }
++
++      put_free_entry(ref);
++      if (page != 0)
++              free_page(page);
++}
++EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
++
++/*
++ * Allocate a grant reference and set it up to accept a page transfer from
++ * @domid into @pfn.
++ *
++ * Returns the new grant reference, or -ENOSPC if none could be allocated.
++ */
++int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
++{
++      int ref = get_free_entry();
++
++      if (unlikely(ref < 0))
++              return -ENOSPC;
++
++      gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
++
++      return ref;
++}
++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
++
++/*
++ * Fill grant entry @ref so that @domid may transfer a page into @pfn.  The
++ * frame and domain id are written first, followed by a write barrier, and
++ * only then the GTF_accept_transfer flag.
++ */
++void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
++                                     unsigned long pfn)
++{
++      struct grant_entry *entry = &shared[ref];
++
++      entry->frame = pfn;
++      entry->domid = domid;
++      wmb();
++      entry->flags = GTF_accept_transfer;
++}
++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
++
++/*
++ * Finish the transfer associated with grant entry @ref.
++ *
++ * Returns 0 if the transfer had not been committed and the entry could be
++ * reclaimed; otherwise waits for the transfer to complete and returns the
++ * frame number stored in the entry (which must be non-zero).
++ */
++unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
++{
++      unsigned long frame;
++      u16           flags;
++
++      /*
++       * If a transfer is not even yet started, try to reclaim the grant
++       * reference and return failure (== 0).
++       */
++      while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
++              /* Reclaim succeeds only if the flags did not change meanwhile. */
++              if (synch_cmpxchg_subword(&shared[ref].flags, flags, 0) == flags)
++                      return 0;
++              cpu_relax();
++      }
++
++      /* If a transfer is in progress then wait until it is completed. */
++      while (!(flags & GTF_transfer_completed)) {
++              flags = shared[ref].flags;
++              cpu_relax();
++      }
++
++      /* Read the frame number /after/ reading completion status. */
++      rmb();
++      frame = shared[ref].frame;
++      BUG_ON(frame == 0);
++
++      return frame;
++}
++EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
++
++/*
++ * Finish the transfer for @ref via gnttab_end_foreign_transfer_ref(), return
++ * the reference to the free list, and pass on the resulting frame number
++ * (0 if the transfer never happened).
++ */
++unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
++{
++      unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
++      put_free_entry(ref);
++      return frame;
++}
++EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
++
++/* Return the single grant reference @ref to the free list. */
++void gnttab_free_grant_reference(grant_ref_t ref)
++{
++      put_free_entry(ref);
++}
++EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
++
++/*
++ * Return a whole chain of grant references, starting at @head and ending at
++ * the GNTTAB_LIST_END terminator, to the free list.  An empty chain
++ * (@head == GNTTAB_LIST_END) is a no-op.
++ */
++void gnttab_free_grant_references(grant_ref_t head)
++{
++      unsigned long flags;
++      grant_ref_t tail;
++      int n;
++
++      if (head == GNTTAB_LIST_END)
++              return;
++
++      spin_lock_irqsave(&gnttab_list_lock, flags);
++
++      /* Find the last element of the chain, counting as we go. */
++      for (tail = head, n = 1; gnttab_entry(tail) != GNTTAB_LIST_END; n++)
++              tail = gnttab_entry(tail);
++
++      /* Splice the chain in front of the current free list. */
++      gnttab_entry(tail) = gnttab_free_head;
++      gnttab_free_head = head;
++      gnttab_free_count += n;
++
++      check_free_callbacks();
++      spin_unlock_irqrestore(&gnttab_list_lock, flags);
++}
++EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
++
++/*
++ * Reserve a private chain of @count grant references and store its first
++ * element in *@head.
++ *
++ * Returns 0 on success or -ENOSPC if the references could not be obtained.
++ */
++int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
++{
++      int first = get_free_entries(count);
++
++      if (first >= 0) {
++              *head = first;
++              return 0;
++      }
++
++      return -ENOSPC;
++}
++EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
++
++/* Return non-zero if the private chain at *@private_head is empty. */
++int gnttab_empty_grant_references(const grant_ref_t *private_head)
++{
++      return (*private_head == GNTTAB_LIST_END);
++}
++EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
++
++/*
++ * Take the first grant reference off the private chain at *@private_head.
++ *
++ * Returns the reference, or -ENOSPC if the chain is empty.
++ */
++int gnttab_claim_grant_reference(grant_ref_t *private_head)
++{
++      grant_ref_t ref = *private_head;
++
++      if (unlikely(ref == GNTTAB_LIST_END))
++              return -ENOSPC;
++
++      /* Advance the caller's head to the next element. */
++      *private_head = gnttab_entry(ref);
++
++      return ref;
++}
++EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
++
++/*
++ * Push grant reference @release onto the front of the private chain at
++ * *@private_head.  No lock is taken here.
++ */
++void gnttab_release_grant_reference(grant_ref_t *private_head,
++                                  grant_ref_t release)
++{
++      gnttab_entry(release) = *private_head;
++      *private_head = release;
++}
++EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
++
++/*
++ * Queue @callback on gnttab_free_callback_list with handler @fn, argument
++ * @arg and threshold @count, then run check_free_callbacks().
++ *
++ * A callback whose ->queued flag is already set is left untouched.  All of
++ * this happens under gnttab_list_lock.
++ */
++void gnttab_request_free_callback(struct gnttab_free_callback *callback,
++                                void (*fn)(void *), void *arg, u16 count)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&gnttab_list_lock, flags);
++
++      if (!callback->queued) {
++              callback->fn = fn;
++              callback->arg = arg;
++              callback->count = count;
++              callback->queued = 1;
++
++              /* Insert at the head of the pending-callback list. */
++              callback->next = gnttab_free_callback_list;
++              gnttab_free_callback_list = callback;
++
++              check_free_callbacks();
++      }
++
++      spin_unlock_irqrestore(&gnttab_list_lock, flags);
++}
++EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
++
++/*
++ * Remove @callback from gnttab_free_callback_list, if it is on it, and clear
++ * its ->queued flag.  A callback that is not on the list is left unchanged.
++ * The list is walked under gnttab_list_lock.
++ */
++void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
++{
++      struct gnttab_free_callback **link;
++      unsigned long flags;
++
++      spin_lock_irqsave(&gnttab_list_lock, flags);
++
++      /* Stop at the link that points to @callback, or at the list end. */
++      link = &gnttab_free_callback_list;
++      while (*link && *link != callback)
++              link = &(*link)->next;
++
++      if (*link) {
++              *link = callback->next;
++              callback->queued = 0;
++      }
++
++      spin_unlock_irqrestore(&gnttab_list_lock, flags);
++}
++EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
++
++/*
++ * Extend the grant reference free list so that it covers @more_frames
++ * additional grant frames.
++ *
++ * Any extra free-list pages that are needed are allocated first.  The new
++ * entries are then chained together, linked in front of the current free
++ * list, and nr_grant_frames / gnttab_free_count are updated before
++ * check_free_callbacks() is run.
++ *
++ * Returns 0 on success.  Returns -ENOMEM if a free-list page cannot be
++ * allocated; the pages obtained by this call are released again and the
++ * free list, gnttab_free_count and nr_grant_frames are left as they were.
++ */
++static int grow_gnttab_list(unsigned int more_frames)
++{
++      unsigned int new_nr_grant_frames, extra_entries, i;
++      unsigned int nr_glist_frames, new_nr_glist_frames;
++
++      new_nr_grant_frames = nr_grant_frames + more_frames;
++      extra_entries       = more_frames * ENTRIES_PER_GRANT_FRAME;
++
++      nr_glist_frames = nr_freelist_frames(nr_grant_frames);
++      new_nr_glist_frames = nr_freelist_frames(new_nr_grant_frames);
++      for (i = nr_glist_frames; i < new_nr_glist_frames; i++) {
++              gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);
++              if (!gnttab_list[i])
++                      goto grow_nomem;
++      }
++
++      /* Chain every new entry except the last one to its successor. */
++      for (i = ENTRIES_PER_GRANT_FRAME * nr_grant_frames;
++           i < ENTRIES_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++)
++              gnttab_entry(i) = i + 1;
++
++      /* The last new entry links to the old free list head. */
++      gnttab_entry(i) = gnttab_free_head;
++      gnttab_free_head = ENTRIES_PER_GRANT_FRAME * nr_grant_frames;
++      gnttab_free_count += extra_entries;
++
++      nr_grant_frames = new_nr_grant_frames;
++
++      check_free_callbacks();
++
++      return 0;
++
++grow_nomem:
++      /*
++       * Slot i is the allocation that failed, so only the slots in
++       * [nr_glist_frames, i) hold pages obtained by this call.  Decrementing
++       * before the comparison keeps the unsigned index from wrapping when
++       * nr_glist_frames is 0 and skips the empty slot.
++       */
++      while (i-- > nr_glist_frames)
++              free_page((unsigned long) gnttab_list[i]);
++      return -ENOMEM;
++}
++
++/*
++ * Ask the hypervisor (GNTTABOP_query_size for DOMID_SELF) how many grant
++ * frames this domain may use.
++ *
++ * Returns the reported max_nr_frames, or 4 when the hypercall fails or its
++ * status is not GNTST_okay.
++ */
++static unsigned int __max_nr_grant_frames(void)
++{
++      struct gnttab_query_size query;
++      int rc;
++
++      query.dom = DOMID_SELF;
++      rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1);
++
++      if (rc >= 0 && query.status == GNTST_okay)
++              return query.max_nr_frames;
++
++      /* Legacy max supported number of frames */
++      return 4;
++}
++
++/*
++ * Current grant frame limit: the value reported by __max_nr_grant_frames(),
++ * capped at boot_max_nr_grant_frames.
++ */
++static inline unsigned int max_nr_grant_frames(void)
++{
++      unsigned int xen_limit = __max_nr_grant_frames();
++
++      return (xen_limit > boot_max_nr_grant_frames)
++              ? boot_max_nr_grant_frames : xen_limit;
++}
++
++#ifdef CONFIG_XEN
++
++#ifdef CONFIG_X86
++/*
++ * apply_to_page_range() callback used by gnttab_map().
++ *
++ * @data points to a cursor (unsigned long *) into an array of frame numbers.
++ * The frame under the cursor is installed at @addr with PAGE_KERNEL
++ * protection and the cursor is advanced by one element.  Always returns 0.
++ */
++static int map_pte_fn(pte_t *pte, struct page *pmd_page,
++                    unsigned long addr, void *data)
++{
++      unsigned long **cursor = data;
++      unsigned long frame = **cursor;
++
++      set_pte_at(&init_mm, addr, pte, pfn_pte_ma(frame, PAGE_KERNEL));
++      ++*cursor;
++
++      return 0;
++}
++
++#ifdef CONFIG_PM_SLEEP
++/*
++ * apply_to_page_range() callback used by gnttab_suspend(): overwrite the
++ * PTE for @addr with an all-zero entry.  @pmd_page and @data are unused.
++ * Always returns 0.
++ */
++static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
++                      unsigned long addr, void *data)
++{
++      set_pte_at(&init_mm, addr, pte, __pte(0));
++
++      return 0;
++}
++#endif
++
++/*
++ * Reserve a kernel virtual area of max_nr_grant_frames() pages with
++ * alloc_vm_area() and return its base address.  @frames is not used here.
++ * Failure to obtain the area is fatal (BUG).
++ */
++void *arch_gnttab_alloc_shared(unsigned long *frames)
++{
++      struct vm_struct *vm = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames());
++
++      BUG_ON(!vm);
++
++      return vm->addr;
++}
++#endif /* CONFIG_X86 */
++
++static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
++{
++      struct gnttab_setup_table setup;
++      unsigned long *frames;
++      unsigned int nr_gframes = end_idx + 1;
++      int rc;
++
++      frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
++      if (!frames)
++              return -ENOMEM;
++
++      setup.dom        = DOMID_SELF;
++      setup.nr_frames  = nr_gframes;
++      set_xen_guest_handle(setup.frame_list, frames);
++
++      rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
++      if (rc == -ENOSYS) {
++              kfree(frames);
++              return -ENOSYS;
++      }
++
++      BUG_ON(rc || setup.status != GNTST_okay);
++
++      if (shared == NULL)
++              shared = arch_gnttab_alloc_shared(frames);
++
++#ifdef CONFIG_X86
++      rc = apply_to_page_range(&init_mm, (unsigned long)shared,
++                               PAGE_SIZE * nr_gframes,
++                               map_pte_fn, &frames);
++      BUG_ON(rc);
++      frames -= nr_gframes; /* adjust after map_pte_fn() */
++#endif /* CONFIG_X86 */
++
++      kfree(frames);
++
++      return 0;
++}
++
++#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
++
++static DEFINE_SEQLOCK(gnttab_dma_lock);
++
++/*
++ * Handler registered with SetPageForeign() by gnttab_copy_grant_page().
++ *
++ * Only order-0 pages are accepted (BUG otherwise).  The foreign marking is
++ * cleared, the page's counts are reset through gnttab_reset_grant_page(),
++ * PG_reserved is cleared and one reference is dropped with put_page().
++ */
++static void gnttab_page_free(struct page *page, unsigned int order)
++{
++      BUG_ON(order != 0);
++
++      ClearPageForeign(page);
++      gnttab_reset_grant_page(page);
++      ClearPageReserved(page);
++
++      put_page(page);
++}
++
++/*
++ * Must not be called with IRQs off.  This should only be used on the
++ * slow path.
++ *
++ * Copy a foreign granted page to local memory.
++ *
++ * @ref:   grant reference handed to the GNTTABOP_unmap_and_replace
++ *         operation.
++ * @pagep: on entry, the page to copy; on success it is updated to point at
++ *         the newly allocated page, which takes over the original's
++ *         ->mapping, ->index, foreign marking and (if set) PG_reserved.
++ *
++ * Returns 0 on success, -ENOENT if no reference on *pagep could be taken,
++ * -ENOMEM if no replacement page could be allocated, or -EBUSY if the page
++ * is found to be mapped (page_mapped()) once gnttab_dma_lock is held for
++ * writing.  Failure of either hypercall is fatal (BUG).
++ */
++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
++{
++      struct gnttab_unmap_and_replace unmap;
++      mmu_update_t mmu;
++      struct page *page;
++      struct page *new_page;
++      void *new_addr;
++      void *addr;
++      paddr_t pfn;
++      maddr_t mfn;
++      maddr_t new_mfn;
++      int err;
++
++      page = *pagep;
++      /* Hold a reference on the source page; it is dropped again at 'out'. */
++      if (!get_page_unless_zero(page))
++              return -ENOENT;
++
++      err = -ENOMEM;
++      new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
++      if (!new_page)
++              goto out;
++
++      /* Duplicate the contents before any mapping is switched over. */
++      new_addr = page_address(new_page);
++      addr = page_address(page);
++      copy_page(new_addr, addr);
++
++      pfn = page_to_pfn(page);
++      mfn = pfn_to_mfn(pfn);
++      new_mfn = virt_to_mfn(new_addr);
++
++      write_seqlock_bh(&gnttab_dma_lock);
++
++      /* Make seq visible before checking page_mapped. */
++      smp_mb();
++
++      /*
++       * Has the page been DMA-mapped?  If so, give up: release the lock,
++       * drop the replacement page and report -EBUSY.
++       */
++      if (unlikely(page_mapped(page))) {
++              write_sequnlock_bh(&gnttab_dma_lock);
++              put_page(new_page);
++              err = -EBUSY;
++              goto out;
++      }
++
++      /* Point the original pfn at the machine frame of the copy. */
++      if (!xen_feature(XENFEAT_auto_translated_physmap))
++              set_phys_to_machine(pfn, new_mfn);
++
++      /* Build and issue the unmap-and-replace request for addr/new_addr. */
++      gnttab_set_replace_op(&unmap, (unsigned long)addr,
++                            (unsigned long)new_addr, ref);
++
++      err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
++                                      &unmap, 1);
++      BUG_ON(err);
++      BUG_ON(unmap.status != GNTST_okay);
++
++      write_sequnlock_bh(&gnttab_dma_lock);
++
++      if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++              /* The new page's own pfn no longer has a machine frame. */
++              set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
++
++              /* MMU_MACHPHYS_UPDATE request pairing new_mfn with pfn. */
++              mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
++              mmu.val = pfn;
++              err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
++              BUG_ON(err);
++      }
++
++      /* Transfer the original page's identity to the copy. */
++      new_page->mapping = page->mapping;
++      new_page->index = page->index;
++      set_bit(PG_foreign, &new_page->flags);
++      if (PageReserved(page))
++              SetPageReserved(new_page);
++      *pagep = new_page;
++
++      /*
++       * Register gnttab_page_free() as the handler for the original page
++       * (presumably invoked when it is finally released -- confirm against
++       * the SetPageForeign() definition).
++       */
++      SetPageForeign(page, gnttab_page_free);
++      page->mapping = NULL;
++
++out:
++      /* Drop the reference taken by get_page_unless_zero() above. */
++      put_page(page);
++      return err;
++}
++EXPORT_SYMBOL_GPL(gnttab_copy_grant_page);
++
++/*
++ * Re-initialise @page's reference count (init_page_count()) and its map
++ * count (reset_page_mapcount()).
++ */
++void gnttab_reset_grant_page(struct page *page)
++{
++      init_page_count(page);
++      reset_page_mapcount(page);
++}
++EXPORT_SYMBOL_GPL(gnttab_reset_grant_page);
++
++/*
++ * Keep track of foreign pages marked as PageForeign so that we don't
++ * return them to the remote domain prematurely.
++ *
++ * PageForeign pages are pinned down by increasing their mapcount.
++ *
++ * All other pages are simply returned as is.
++ *
++ * Nothing is done when not running on Xen or when @page is not PageForeign.
++ * Otherwise the body runs as a gnttab_dma_lock read section and is retried
++ * if read_seqretry() reports a concurrent writer; gnttab_copy_grant_page()
++ * takes the same lock for writing around its page_mapped() test.
++ */
++void __gnttab_dma_map_page(struct page *page)
++{
++      unsigned int seq;
++
++      if (!is_running_on_xen() || !PageForeign(page))
++              return;
++
++      do {
++              seq = read_seqbegin(&gnttab_dma_lock);
++
++              /* Stop without touching _mapcount if this check succeeds. */
++              if (gnttab_dma_local_pfn(page))
++                      break;
++
++              /* Set _mapcount to 0 -- the "pinning" described above. */
++              atomic_set(&page->_mapcount, 0);
++
++              /* Make _mapcount visible before read_seqretry. */
++              smp_mb();
++      } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
++}
++
++#endif /* CONFIG_XEN_BACKEND */
++
++#ifdef __HAVE_ARCH_PTE_SPECIAL
++
++static unsigned int GNTMAP_pte_special;
++
++/*
++ * Adjust a batch of @count map requests before they are issued.
++ *
++ * Only GNTTABOP_map_grant_ref batches are examined.  For every request that
++ * has both GNTMAP_host_map and GNTMAP_application_map set, the
++ * GNTMAP_pte_special flag bits are OR-ed into ->flags when that value is
++ * non-zero.  When it is zero, the function stops at the first such request
++ * and returns true (after asserting that the domain is not
++ * auto-translated).
++ *
++ * Returns false in every other case.
++ */
++bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
++                         unsigned int count)
++{
++      struct gnttab_map_grant_ref *op, *end;
++
++      if (unlikely(cmd != GNTTABOP_map_grant_ref))
++              return false;
++
++      for (op = map, end = map + count; op != end; ++op) {
++              if (!((op->flags & GNTMAP_host_map)
++                    && (op->flags & GNTMAP_application_map)))
++                      continue;
++
++              if (!GNTMAP_pte_special) {
++                      BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++                      return true;
++              }
++
++              op->flags |= GNTMAP_pte_special;
++      }
++
++      return false;
++}
++EXPORT_SYMBOL(gnttab_pre_map_adjust);
++
++#if CONFIG_XEN_COMPAT < 0x030400
++int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
++{
++      unsigned int i;
++      int rc = 0;
++
++      for (i = 0; i < count && rc == 0; ++i, ++map) {
++              pte_t pte;
++
++              if (!(map->flags & GNTMAP_host_map)
++                  || !(map->flags & GNTMAP_application_map))
++                      continue;
++
++#ifdef CONFIG_X86
++              pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
++                              | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
++                              | _PAGE_SPECIAL)
++                             & __supported_pte_mask);
++#else
++#error Architecture not yet supported.
++#endif
++              if (!(map->flags & GNTMAP_readonly))
++                      pte = pte_mkwrite(pte);
++
++              if (map->flags & GNTMAP_contains_pte) {
++                      mmu_update_t u;
++
++                      u.ptr = map->host_addr;
++                      u.val = __pte_val(pte);
++                      rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
++              } else
++                      rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
++      }
++
++      return rc;
++}
++EXPORT_SYMBOL(gnttab_post_map_adjust);
++#endif
++
++#endif /* __HAVE_ARCH_PTE_SPECIAL */
++
++/*
++ * (Re)establish the mapping of the grant frames currently in use, i.e.
++ * frames 0 .. nr_grant_frames - 1, through gnttab_map().
++ *
++ * Returns gnttab_map()'s result.  When max_nr_grant_frames() is below
++ * nr_grant_frames nothing is mapped and 0 is returned.
++ * NOTE(review): the !CONFIG_XEN variant of this function returns -ENOSYS in
++ * that situation -- confirm that reporting success here is intended.
++ */
++int gnttab_resume(void)
++{
++      if (max_nr_grant_frames() >= nr_grant_frames)
++              return gnttab_map(0, nr_grant_frames - 1);
++
++      return 0;
++}
++
++#ifdef CONFIG_PM_SLEEP
++#include <linux/syscore_ops.h>
++
++#ifdef CONFIG_X86
++/*
++ * syscore suspend hook: run unmap_pte_fn() over the nr_grant_frames pages
++ * starting at 'shared'.  The result of apply_to_page_range() is ignored and
++ * 0 is always returned.
++ */
++static int gnttab_suspend(void)
++{
++      apply_to_page_range(&init_mm, (unsigned long)shared,
++                          PAGE_SIZE * nr_grant_frames,
++                          unmap_pte_fn, NULL);
++
++      return 0;
++}
++#else
++#define gnttab_suspend NULL
++#endif
++
++/* syscore resume hook: a non-zero result from gnttab_resume() is fatal. */
++static void _gnttab_resume(void)
++{
++      int rc = gnttab_resume();
++
++      if (rc)
++              BUG();
++}
++
++/* Suspend/resume hooks registered from gnttab_init(). */
++static struct syscore_ops gnttab_syscore_ops = {
++      .suspend        = gnttab_suspend,
++      .resume         = _gnttab_resume,
++};
++#endif
++
++#else /* !CONFIG_XEN */
++
++#include <platform-pci.h>
++
++static unsigned long resume_frames;
++
++/*
++ * Add grant table frames @end_idx down to @start_idx to the physmap with
++ * XENMEM_add_to_physmap, placing frame i at guest pfn
++ * (resume_frames >> PAGE_SHIFT) + i.
++ *
++ * At least one frame (@end_idx) is always processed.  A failing hypercall
++ * is fatal (BUG).  Always returns 0.
++ */
++static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
++{
++      struct xen_add_to_physmap xatp;
++      unsigned int idx;
++
++      /* Loop backwards, so that the first hypercall has the largest index,
++       * ensuring that the table will grow only once.
++       */
++      for (idx = end_idx; ; idx--) {
++              xatp.domid = DOMID_SELF;
++              xatp.idx = idx;
++              xatp.space = XENMAPSPACE_grant_table;
++              xatp.gpfn = (resume_frames >> PAGE_SHIFT) + idx;
++              if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
++                      BUG();
++
++              /* Tested before the decrement so idx cannot wrap below 0. */
++              if (idx <= start_idx)
++                      break;
++      }
++
++      return 0;
++}
++
++/*
++ * (Re)establish the grant table mapping for frames 0 .. nr_grant_frames - 1.
++ *
++ * On first use an MMIO range large enough for max_nr_grant_frames() pages is
++ * obtained from alloc_xen_mmio() and remembered in resume_frames.  'shared'
++ * is ioremap()ed onto that range whenever it is still NULL, so a call that
++ * follows an earlier ioremap() failure retries the mapping instead of
++ * silently continuing without one.
++ *
++ * Returns the result of gnttab_map() (0) on success, -ENOSYS if the current
++ * frame limit is below nr_grant_frames, or -ENOMEM if ioremap() fails.
++ */
++int gnttab_resume(void)
++{
++      unsigned int max_nr_gframes, nr_gframes;
++
++      nr_gframes = nr_grant_frames;
++      max_nr_gframes = max_nr_grant_frames();
++      if (max_nr_gframes < nr_gframes)
++              return -ENOSYS;
++
++      if (!resume_frames)
++              resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
++
++      /*
++       * Keyed on 'shared' rather than on resume_frames: the MMIO range is
++       * recorded before ioremap() is attempted, so it says nothing about
++       * whether the mapping exists.
++       */
++      if (shared == NULL) {
++              shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes);
++              if (shared == NULL) {
++                      pr_warning("error to ioremap gnttab share frames\n");
++                      return -ENOMEM;
++              }
++      }
++
++      return gnttab_map(0, nr_gframes - 1);
++}
++
++#endif /* !CONFIG_XEN */
++
++/*
++ * Grow the grant table by enough whole frames to hold @req_entries more
++ * entries.
++ *
++ * Returns -ENOSPC if that would exceed max_nr_grant_frames(), the error from
++ * gnttab_map() if mapping the new frames fails, and otherwise the result of
++ * grow_gnttab_list().
++ */
++static int gnttab_expand(unsigned int req_entries)
++{
++      unsigned int cur = nr_grant_frames;
++      unsigned int extra = (req_entries + (ENTRIES_PER_GRANT_FRAME-1)) /
++                           ENTRIES_PER_GRANT_FRAME;
++      int rc;
++
++      if (cur + extra > max_nr_grant_frames())
++              return -ENOSPC;
++
++      rc = gnttab_map(cur, cur + extra - 1);
++      if (rc)
++              return rc;
++
++      return grow_gnttab_list(extra);
++}
++
++#ifdef CONFIG_XEN
++static int __init
++#else
++int __devinit
++#endif
++gnttab_init(void)
++{
++      int i;
++      unsigned int max_nr_glist_frames, nr_glist_frames;
++      unsigned int nr_init_grefs;
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      nr_grant_frames = 1;
++      boot_max_nr_grant_frames = __max_nr_grant_frames();
++
++      /* Determine the maximum number of frames required for the
++       * grant reference free list on the current hypervisor.
++       */
++      max_nr_glist_frames = nr_freelist_frames(boot_max_nr_grant_frames);
++
++      gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
++                            GFP_KERNEL);
++      if (gnttab_list == NULL)
++              return -ENOMEM;
++
++      nr_glist_frames = nr_freelist_frames(nr_grant_frames);
++      for (i = 0; i < nr_glist_frames; i++) {
++              gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
++              if (gnttab_list[i] == NULL)
++                      goto ini_nomem;
++      }
++
++      if (gnttab_resume() < 0)
++              return -ENODEV;
++
++      nr_init_grefs = nr_grant_frames * ENTRIES_PER_GRANT_FRAME;
++
++      for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
++              gnttab_entry(i) = i + 1;
++
++      gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END;
++      gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
++      gnttab_free_head  = NR_RESERVED_ENTRIES;
++
++#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
++      if (!xen_feature(XENFEAT_auto_translated_physmap)
++          && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
++#ifdef CONFIG_X86
++              GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
++                                    >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
++#else
++#error Architecture not yet supported.
++#endif
++      }
++#endif
++
++#if defined(CONFIG_XEN) && defined(CONFIG_PM_SLEEP)
++      if (!is_initial_xendomain())
++              register_syscore_ops(&gnttab_syscore_ops);
++#endif
++
++      return 0;
++
++ ini_nomem:
++      for (i--; i >= 0; i--)
++              free_page((unsigned long)gnttab_list[i]);
++      kfree(gnttab_list);
++      return -ENOMEM;
++}
++
++#ifdef CONFIG_XEN
++core_initcall(gnttab_init);
++#endif
diff --cc drivers/xen/core/hypervisor_sysfs.c

index 0000000,0000000..c0c492d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/hypervisor_sysfs.c
@@@ -1,0 -1,0 +1,57 @@@
++/*
++ *  copyright (c) 2006 IBM Corporation
++ *  Authored by: Mike D. Day <ncmike@us.ibm.com>
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License version 2 as
++ *  published by the Free Software Foundation.
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/kobject.h>
++#include <xen/hypervisor_sysfs.h>
++#include <asm/hypervisor.h>
++
++/*
++ * sysfs ->show dispatcher: forward the read to the ->show handler of the
++ * hyp_sysfs_attr that embeds @attr.  Returns that handler's result, or 0
++ * when the attribute has no ->show handler.
++ */
++static ssize_t hyp_sysfs_show(struct kobject *kobj,
++                            struct attribute *attr,
++                            char *buffer)
++{
++      struct hyp_sysfs_attr *hyp_attr =
++              container_of(attr, struct hyp_sysfs_attr, attr);
++
++      if (!hyp_attr->show)
++              return 0;
++
++      return hyp_attr->show(hyp_attr, buffer);
++}
++
++/*
++ * sysfs ->store dispatcher: forward the write of @len bytes to the ->store
++ * handler of the hyp_sysfs_attr that embeds @attr.  Returns that handler's
++ * result, or 0 when the attribute has no ->store handler.
++ */
++static ssize_t hyp_sysfs_store(struct kobject *kobj,
++                             struct attribute *attr,
++                             const char *buffer,
++                             size_t len)
++{
++      struct hyp_sysfs_attr *hyp_attr =
++              container_of(attr, struct hyp_sysfs_attr, attr);
++
++      if (!hyp_attr->store)
++              return 0;
++
++      return hyp_attr->store(hyp_attr, buffer, len);
++}
++
++/* sysfs operations that dispatch to the per-attribute handlers above. */
++static const struct sysfs_ops hyp_sysfs_ops = {
++      .show   = hyp_sysfs_show,
++      .store  = hyp_sysfs_store,
++};
++
++/* kobject type installed on hypervisor_kobj by hypervisor_subsys_init(). */
++static struct kobj_type hyp_sysfs_kobj_type = {
++      .sysfs_ops      = &hyp_sysfs_ops,
++};
++
++/*
++ * When running on Xen, make hypervisor_kobj use hyp_sysfs_kobj_type and
++ * return 0; otherwise return -ENODEV.
++ */
++static int __init hypervisor_subsys_init(void)
++{
++      if (is_running_on_xen()) {
++              hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
++              return 0;
++      }
++
++      return -ENODEV;
++}
++
++device_initcall(hypervisor_subsys_init);
diff --cc drivers/xen/core/machine_kexec.c

index 0000000,0000000..a8ea9cd

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/machine_kexec.c
@@@ -1,0 -1,0 +1,267 @@@
++/*
++ * drivers/xen/core/machine_kexec.c 
++ * handle transition of Linux booting another kernel
++ */
++
++#include <linux/kexec.h>
++#include <xen/interface/kexec.h>
++#include <linux/reboot.h>
++#include <linux/mm.h>
++#include <linux/bootmem.h>
++
++extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, 
++                                       struct kimage *image);
++extern int machine_kexec_setup_resources(struct resource *hypervisor,
++                                       struct resource *phys_cpus,
++                                       int nr_phys_cpus);
++extern void machine_kexec_register_resources(struct resource *res);
++
++static int __initdata xen_max_nr_phys_cpus;
++static struct resource xen_hypervisor_res;
++static struct resource *xen_phys_cpus;
++
++size_t vmcoreinfo_size_xen;
++unsigned long paddr_vmcoreinfo_xen;
++
++/*
++ * Query the hypervisor's kexec ranges at boot and record them as resources.
++ *
++ * A warning is printed if "crashkernel=" appears on the command line.
++ * Nothing further is done unless this is the initial Xen domain.  Otherwise
++ * the function:
++ *  - counts the available KEXEC_RANGE_MA_CPU ranges and builds one
++ *    "Crash note" resource per range in xen_phys_cpus;
++ *  - fills xen_hypervisor_res from KEXEC_RANGE_MA_XEN;
++ *  - fills crashk_res from KEXEC_RANGE_MA_CRASH when that range is
++ *    non-empty;
++ *  - records KEXEC_RANGE_MA_VMCOREINFO in vmcoreinfo_size_xen and
++ *    paddr_vmcoreinfo_xen (both reset to 0 if the query fails);
++ *  - passes the collected resources to machine_kexec_setup_resources().
++ *
++ * On the 'err' path xen_max_nr_phys_cpus is reset to 0.
++ */
++void __init xen_machine_kexec_setup_resources(void)
++{
++      xen_kexec_range_t range;
++      struct resource *res;
++      int k = 0;
++      int rc;
++
++      if (strstr(boot_command_line, "crashkernel="))
++              pr_warning("Ignoring crashkernel command line, "
++                         "parameter will be supplied by xen\n");
++
++      if (!is_initial_xendomain())
++              return;
++
++      /* determine maximum number of physical cpus */
++
++      /* Probe increasing indices until the hypervisor rejects one. */
++      while (1) {
++              memset(&range, 0, sizeof(range));
++              range.range = KEXEC_RANGE_MA_CPU;
++              range.nr = k;
++
++              if(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
++                      break;
++
++              k++;
++      }
++
++      if (k == 0)
++              return;
++
++      xen_max_nr_phys_cpus = k;
++
++      /* allocate xen_phys_cpus */
++
++      xen_phys_cpus = alloc_bootmem(k * sizeof(struct resource));
++
++      /* fill in xen_phys_cpus with per-cpu crash note information */
++
++      for (k = 0; k < xen_max_nr_phys_cpus; k++) {
++              memset(&range, 0, sizeof(range));
++              range.range = KEXEC_RANGE_MA_CPU;
++              range.nr = k;
++
++              if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
++                      goto err;
++
++              res = xen_phys_cpus + k;
++
++              memset(res, 0, sizeof(*res));
++              res->name = "Crash note";
++              res->start = range.start;
++              res->end = range.start + range.size - 1;
++              res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
++      }
++
++      /* fill in xen_hypervisor_res with hypervisor machine address range */
++
++      memset(&range, 0, sizeof(range));
++      range.range = KEXEC_RANGE_MA_XEN;
++
++      if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
++              goto err;
++
++      xen_hypervisor_res.name = "Hypervisor code and data";
++      xen_hypervisor_res.start = range.start;
++      xen_hypervisor_res.end = range.start + range.size - 1;
++      xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
++#ifdef CONFIG_X86
++      insert_resource(&iomem_resource, &xen_hypervisor_res);
++#endif
++
++      /* fill in crashk_res if range is reserved by hypervisor */
++
++      memset(&range, 0, sizeof(range));
++      range.range = KEXEC_RANGE_MA_CRASH;
++
++      if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
++              goto err;
++
++      if (range.size) {
++              crashk_res.start = range.start;
++              crashk_res.end = range.start + range.size - 1;
++#ifdef CONFIG_X86
++              insert_resource(&iomem_resource, &crashk_res);
++#endif
++      }
++
++      /* get physical address of vmcoreinfo */
++      memset(&range, 0, sizeof(range));
++      range.range = KEXEC_RANGE_MA_VMCOREINFO;
++
++      rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range);
++
++      if (rc == 0) {
++              /* Hypercall succeeded */
++              vmcoreinfo_size_xen = range.size;
++              paddr_vmcoreinfo_xen = range.start;
++
++      } else {
++              /* Hypercall failed.
++               * Indicate not to create sysfs file by resetting globals
++               */
++              vmcoreinfo_size_xen = 0;
++              paddr_vmcoreinfo_xen = 0;
++              
++              /* The KEXEC_CMD_kexec_get_range hypercall did not implement
++               * KEXEC_RANGE_MA_VMCOREINFO until Xen 3.3.
++               * Do not bail out if it fails for this reason.
++               *
++               * NOTE(review): any other error returns directly instead of
++               * going through 'err', so xen_max_nr_phys_cpus keeps its
++               * value -- confirm that this is intended.
++               */
++              if (rc != -EINVAL)
++                      return;
++      }
++
++      if (machine_kexec_setup_resources(&xen_hypervisor_res, xen_phys_cpus,
++                                        xen_max_nr_phys_cpus))
++              goto err;
++
++#ifdef CONFIG_X86
++      for (k = 0; k < xen_max_nr_phys_cpus; k++) {
++              res = xen_phys_cpus + k;
++              if (!res->parent) /* outside of xen_hypervisor_res range */
++                      insert_resource(&iomem_resource, res);
++      }
++
++      if (xen_create_contiguous_region((unsigned long)&vmcoreinfo_note,
++                                       get_order(sizeof(vmcoreinfo_note)),
++                                       BITS_PER_LONG))
++              goto err;
++#endif
++
++      return;
++
++ err:
++      /*
++       * It isn't possible to free xen_phys_cpus this early in the
++       * boot. Failure at this stage is unexpected and the amount of
++       * memory is small therefore we tolerate the potential leak.
++       */
++      xen_max_nr_phys_cpus = 0;
++      return;
++}
++
++#ifndef CONFIG_X86
++/*
++ * Request xen_hypervisor_res under @res, then every per-CPU crash note
++ * resource that has no parent yet, and finally call
++ * machine_kexec_register_resources() for @res.
++ */
++void __init xen_machine_kexec_register_resources(struct resource *res)
++{
++      struct resource *note;
++      int cpu;
++
++      request_resource(res, &xen_hypervisor_res);
++
++      for (cpu = 0; cpu < xen_max_nr_phys_cpus; cpu++) {
++              note = &xen_phys_cpus[cpu];
++              /* out of xen_hypervisor_res range */
++              if (!note->parent)
++                      request_resource(res, note);
++      }
++
++      machine_kexec_register_resources(res);
++}
++#endif
++
++/*
++ * Fill in @xki for @image: the architecture hook
++ * machine_kexec_setup_load_arg() runs first, then the indirection page and
++ * start address are copied from @image.
++ */
++static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++      machine_kexec_setup_load_arg(xki, image);
++
++      xki->start_address = image->start;
++      xki->indirection_page = image->head;
++}
++
++/*
++ * Load the image into xen so xen can kdump itself
++ * This might have been done in prepare, but prepare
++ * is currently called too early. It might make sense
++ * to move prepare, but for now, just add an extra hook.
++ */
++int xen_machine_kexec_load(struct kimage *image)
++{
++      xen_kexec_load_t load;
++
++      /* Start from an all-zero request, then describe @image in it. */
++      memset(&load, 0, sizeof(load));
++      load.type = image->type;
++      setup_load_arg(&load.image, image);
++
++      /* The hypercall result is returned to the caller unchanged. */
++      return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &load);
++}
++
++/*
++ * Unload the image that was stored by xen_machine_kexec_load().
++ * This might have been done in machine_kexec_cleanup(), but that
++ * is called too late, and it's possible Xen could try to kdump
++ * using resources that have been freed.
++ */
++void xen_machine_kexec_unload(struct kimage *image)
++{
++      xen_kexec_load_t unload;
++      int rc;
++
++      /* Only the image type is filled in; the rest of the request is zero. */
++      memset(&unload, 0, sizeof(unload));
++      unload.type = image->type;
++
++      /* A failing hypercall is reported with a warning, nothing more. */
++      rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &unload);
++      WARN_ON(rc);
++}
++
++/*
++ * Do not allocate memory (or fail in any way) in machine_kexec().
++ * We are past the point of no return, committed to rebooting now.
++ *
++ * This has the hypervisor move to the preferred reboot CPU,
++ * stop all CPUs and kexec. That is, it combines machine_shutdown()
++ * and machine_kexec() in Linux kexec terms.
++ */
++NORET_TYPE void machine_kexec(struct kimage *image)
++{
++      xen_kexec_exec_t exec;
++
++      memset(&exec, 0, sizeof(exec));
++      exec.type = image->type;
++
++      /* The result is deliberately discarded; returning at all is fatal. */
++      VOID(HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &exec));
++
++      panic("KEXEC_CMD_kexec hypercall should not return\n");
++}
++
++#ifdef CONFIG_X86
++/* Returns virt_to_machine() applied to the address of vmcoreinfo_note. */
++unsigned long paddr_vmcoreinfo_note(void)
++{
++      return virt_to_machine(&vmcoreinfo_note);
++}
++#endif
++
++/* Intentionally empty in this implementation. */
++void machine_shutdown(void)
++{
++      /* do nothing */
++}
++
++/* Crash-time shutdown hook: only disables local interrupts; @regs is unused. */
++void machine_crash_shutdown(struct pt_regs *regs)
++{
++      /* The kernel is broken so disable interrupts */
++      local_irq_disable();
++}
++
++/*
++ * Local variables:
++ *  c-file-style: "linux"
++ *  indent-tabs-mode: t
++ *  c-indent-level: 8
++ *  c-basic-offset: 8
++ *  tab-width: 8
++ * End:
++ */
diff --cc drivers/xen/core/machine_reboot.c

index 0000000,0000000..7350711

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/machine_reboot.c
@@@ -1,0 -1,0 +1,305 @@@
++#include <linux/version.h>
++#include <linux/kernel.h>
++#include <linux/mm.h>
++#include <linux/unistd.h>
++#include <linux/module.h>
++#include <linux/reboot.h>
++#include <linux/sysrq.h>
++#include <linux/stringify.h>
++#include <linux/stop_machine.h>
++#include <linux/syscore_ops.h>
++#include <asm/irq.h>
++#include <asm/mmu_context.h>
++#include <xen/evtchn.h>
++#include <asm/hypervisor.h>
++#include <xen/xenbus.h>
++#include <linux/cpu.h>
++#include <xen/clock.h>
++#include <xen/gnttab.h>
++#include <xen/xencons.h>
++#include <xen/cpu_hotplug.h>
++#include <xen/interface/vcpu.h>
++#include "../../base/base.h"
++
++#if defined(__i386__) || defined(__x86_64__)
++#include <asm/pci_x86.h>
++/* TBD: Dom0 should propagate the determined value to Xen. */
++bool port_cf9_safe = false;
++
++/*
++ * Power off function, if any
++ */
++void (*pm_power_off)(void);
++EXPORT_SYMBOL(pm_power_off);
++
++/* Flush the Xen console, then ask the hypervisor to reboot this domain. */
++void machine_emergency_restart(void)
++{
++      /* We really want to get pending console data out before we die. */
++      xencons_force_flush();
++      HYPERVISOR_shutdown(SHUTDOWN_reboot);
++}
++
++/* Restart is the same as an emergency restart here; the argument is ignored. */
++void machine_restart(char * __unused)
++{
++      machine_emergency_restart();
++}
++
++/* Halting is implemented as a power-off. */
++void machine_halt(void)
++{
++      machine_power_off();
++}
++
++/*
++ * Flush the Xen console, run the pm_power_off hook if one is installed,
++ * then ask the hypervisor to power this domain off.
++ */
++void machine_power_off(void)
++{
++      /* We really want to get pending console data out before we die. */
++      xencons_force_flush();
++      if (pm_power_off)
++              pm_power_off();
++      HYPERVISOR_shutdown(SHUTDOWN_poweroff);
++}
++
++#ifdef CONFIG_PM_SLEEP
++/*
++ * Prepare for HYPERVISOR_suspend(): point HYPERVISOR_shared_info at
++ * empty_zero_page, replace the FIX_SHARED_INFO mapping with a zero PTE, and
++ * convert the store and console frame numbers in xen_start_info with
++ * mfn_to_pfn(). post_suspend() reverses these steps.
++ */
++static void pre_suspend(void)
++{
++      HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
++      WARN_ON(HYPERVISOR_update_va_mapping(fix_to_virt(FIX_SHARED_INFO),
++                                           __pte_ma(0), 0));
++
++      xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
++      xen_start_info->console.domU.mfn =
++              mfn_to_pfn(xen_start_info->console.domU.mfn);
++}
++
++/*
++ * Undo pre_suspend() once HYPERVISOR_suspend() has returned.
++ *
++ * @suspend_cancelled: non-zero if the suspend was cancelled; in that case the
++ *                     frame numbers converted by pre_suspend() are converted
++ *                     back with pfn_to_mfn(). Otherwise the per-CPU runstate
++ *                     areas (and, with CONFIG_XEN_VCPU_INFO_PLACEMENT, the
++ *                     vcpu info) are set up again.
++ * @fast_suspend:      non-zero if the other CPUs were left online; they are
++ *                     then taken down and up around setup_vcpu_info().
++ *
++ * In both cases the shared info page is mapped again and the pfn-to-mfn
++ * frame lists published through it are rebuilt.
++ */
++static void post_suspend(int suspend_cancelled, int fast_suspend)
++{
++      int i, j, k, fpp;
++      unsigned long shinfo_mfn;
++      extern unsigned long max_pfn;
++      extern unsigned long *pfn_to_mfn_frame_list_list;
++      extern unsigned long **pfn_to_mfn_frame_list;
++
++      if (suspend_cancelled) {
++              xen_start_info->store_mfn =
++                      pfn_to_mfn(xen_start_info->store_mfn);
++              xen_start_info->console.domU.mfn =
++                      pfn_to_mfn(xen_start_info->console.domU.mfn);
++      } else {
++#ifdef CONFIG_SMP
++              cpumask_copy(vcpu_initialized_mask, cpu_online_mask);
++#endif
++              for_each_possible_cpu(i) {
++                      setup_runstate_area(i);
++
++#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
++                      /* Every CPU but the current one is downed around the setup. */
++                      if (fast_suspend && i != smp_processor_id()
++                          && HYPERVISOR_vcpu_op(VCPUOP_down, i, NULL))
++                              BUG();
++
++                      setup_vcpu_info(i);
++
++                      if (fast_suspend && i != smp_processor_id()
++                          && HYPERVISOR_vcpu_op(VCPUOP_up, i, NULL))
++                              BUG();
++#endif
++              }
++      }
++
++      /* Map the shared info page back at FIX_SHARED_INFO. */
++      shinfo_mfn = xen_start_info->shared_info >> PAGE_SHIFT;
++      if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_SHARED_INFO),
++                                       pfn_pte_ma(shinfo_mfn, PAGE_KERNEL),
++                                       0))
++              BUG();
++      HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
++
++      /* pre_suspend() used empty_zero_page as a stand-in shared info page. */
++      clear_page(empty_zero_page);
++
++      /*
++       * fpp is the number of entries per page. Each step of the loop covers
++       * fpp pfns of phys_to_machine_mapping and fills one frame-list entry;
++       * every fpp entries a new frame-list page k is started and recorded in
++       * pfn_to_mfn_frame_list_list.
++       */
++      fpp = PAGE_SIZE/sizeof(unsigned long);
++      for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
++              if ((j % fpp) == 0) {
++                      k++;
++                      pfn_to_mfn_frame_list_list[k] =
++                              virt_to_mfn(pfn_to_mfn_frame_list[k]);
++                      j = 0;
++              }
++              pfn_to_mfn_frame_list[k][j] =
++                      virt_to_mfn(&phys_to_machine_mapping[i]);
++      }
++      HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
++      HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
++              virt_to_mfn(pfn_to_mfn_frame_list_list);
++}
++#endif
++
++#else /* !(defined(__i386__) || defined(__x86_64__)) */
++
++#ifndef HAVE_XEN_PRE_SUSPEND
++#define xen_pre_suspend()     ((void)0)
++#endif
++
++#ifndef HAVE_XEN_POST_SUSPEND
++#define xen_post_suspend(x)   ((void)0)
++#endif
++
++#define switch_idle_mm()      ((void)0)
++#define mm_pin_all()          ((void)0)
++#define pre_suspend()         xen_pre_suspend()
++#define post_suspend(x, f)    xen_post_suspend(x)
++
++#endif
++
++#ifdef CONFIG_PM_SLEEP
++/* Arguments handed to take_machine_down() through its void pointer. */
++struct suspend {
++      /* Non-zero when the other CPUs stay online across the suspend. */
++      int fast_suspend;
++      /* Called with the suspend result right after the suspend attempt. */
++      void (*resume_notifier)(int);
++};
++
++static int take_machine_down(void *_suspend)
++{
++      struct suspend *suspend = _suspend;
++      int suspend_cancelled;
++      bool sysdev_suspended = false;
++
++      BUG_ON(!irqs_disabled());
++
++      mm_pin_all();
++      suspend_cancelled = sysdev_suspend(PMSG_SUSPEND);
++      if (!suspend_cancelled)
++              suspend_cancelled = syscore_suspend();
++      else
++              sysdev_suspended = true;
++      if (!suspend_cancelled) {
++              pre_suspend();
++
++              /*
++               * This hypercall returns 1 if suspend was cancelled or the domain was
++               * merely checkpointed, and 0 if it is resuming in a new domain.
++               */
++              suspend_cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
++      } else
++              BUG_ON(suspend_cancelled > 0);
++      suspend->resume_notifier(suspend_cancelled);
++      if (suspend_cancelled >= 0)
++              post_suspend(suspend_cancelled, suspend->fast_suspend);
++      if (!suspend_cancelled)
++              xen_clockevents_resume();
++      if (suspend_cancelled >= 0)
++              syscore_resume();
++      if (sysdev_suspended)
++              sysdev_resume();
++      if (!suspend_cancelled) {
++#ifdef __x86_64__
++              /*
++               * Older versions of Xen do not save/restore the user %cr3.
++               * We do it here just in case, but there's no need if we are
++               * in fast-suspend mode as that implies a new enough Xen.
++               */
++              if (!suspend->fast_suspend)
++                      xen_new_user_pt(current->active_mm->pgd);
++#endif
++      }
++
++      return suspend_cancelled;
++}
++
++/*
++ * Suspend the domain and bring it back up.
++ *
++ * @fast_suspend:    non-zero to leave the other CPUs online and go down via
++ *                   stop_machine(); forced to 0 when only one CPU is possible.
++ * @resume_notifier: passed on to take_machine_down(), which calls it with the
++ *                   suspend result.
++ *
++ * Must be called on CPU 0 and outside interrupt context (both asserted).
++ * Returns 0 when the domain was suspended or the suspend was cancelled, or
++ * the non-zero error of whichever step failed.
++ */
++int __xen_suspend(int fast_suspend, void (*resume_notifier)(int))
++{
++      int err, suspend_cancelled;
++      const char *what;
++      struct suspend suspend;
++
++/* Call fn(args), leaving its name in 'what' and its result in 'err'. */
++#define _check(fn, args...) ({ \
++      what = #fn; \
++      err = (fn)(args); \
++})
++
++      BUG_ON(smp_processor_id() != 0);
++      BUG_ON(in_interrupt());
++
++#if defined(__i386__) || defined(__x86_64__)
++      if (xen_feature(XENFEAT_auto_translated_physmap)) {
++              pr_warning("Can't suspend in auto_translated_physmap mode\n");
++              return -EOPNOTSUPP;
++      }
++#endif
++
++      /* If we are definitely UP then 'slow mode' is actually faster. */
++      if (num_possible_cpus() == 1)
++              fast_suspend = 0;
++
++      suspend.fast_suspend = fast_suspend;
++      suspend.resume_notifier = resume_notifier;
++
++      if (_check(dpm_suspend_start, PMSG_SUSPEND)) {
++              dpm_resume_end(PMSG_RESUME);
++              pr_err("%s() failed: %d\n", what, err);
++              return err;
++      }
++
++      if (fast_suspend) {
++              xenbus_suspend();
++
++              if (_check(dpm_suspend_noirq, PMSG_SUSPEND)) {
++                      xenbus_suspend_cancel();
++                      dpm_resume_end(PMSG_RESUME);
++                      pr_err("%s() failed: %d\n", what, err);
++                      return err;
++              }
++
++              err = stop_machine(take_machine_down, &suspend,
++                                 &cpumask_of_cpu(0));
++              if (err < 0)
++                      xenbus_suspend_cancel();
++      } else {
++              BUG_ON(irqs_disabled());
++
++              /*
++               * Retry until the check made with preemption disabled sees
++               * exactly one CPU online; otherwise undo and start over.
++               */
++              for (;;) {
++                      xenbus_suspend();
++
++                      /* If only smp_suspend() failed, undo the noirq suspend. */
++                      if (!_check(dpm_suspend_noirq, PMSG_SUSPEND)
++                          && _check(smp_suspend))
++                              dpm_resume_noirq(PMSG_RESUME);
++                      if (err) {
++                              xenbus_suspend_cancel();
++                              dpm_resume_end(PMSG_RESUME);
++                              pr_err("%s() failed: %d\n", what, err);
++                              return err;
++                      }
++
++                      preempt_disable();
++
++                      if (num_online_cpus() == 1)
++                              break;
++
++                      preempt_enable();
++
++                      dpm_resume_noirq(PMSG_RESUME);
++
++                      xenbus_suspend_cancel();
++              }
++
++              local_irq_disable();
++              err = take_machine_down(&suspend);
++              local_irq_enable();
++      }
++
++      dpm_resume_noirq(PMSG_RESUME);
++
++      /* err >= 0 is take_machine_down()'s "cancelled" flag, not an error. */
++      if (err >= 0) {
++              suspend_cancelled = err;
++              if (!suspend_cancelled) {
++                      xencons_resume();
++                      xenbus_resume();
++              } else {
++                      xenbus_suspend_cancel();
++                      err = 0;
++              }
++
++              if (!fast_suspend)
++                      smp_resume();
++      }
++
++      dpm_resume_end(PMSG_RESUME);
++
++      return err;
++}
++#endif
diff --cc drivers/xen/core/pci.c

index 0000000,0000000..cc4cb31

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/pci.c
@@@ -1,0 -1,0 +1,83 @@@
++/*
++ * vim:shiftwidth=8:noexpandtab
++ */
++
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/pci.h>
++#include <xen/interface/physdev.h>
++#include "../../pci/pci.h"
++
++static int (*pci_bus_probe)(struct device *dev);
++static int (*pci_bus_remove)(struct device *dev);
++
++/*
++ * Replacement for the PCI bus type's probe callback: report the device to
++ * the hypervisor, then chain to the original probe callback.
++ *
++ * Virtual functions (CONFIG_PCI_IOV) and functions with a non-zero slot on
++ * an ARI-enabled bus use PHYSDEVOP_manage_pci_add_ext; everything else uses
++ * PHYSDEVOP_manage_pci_add.
++ *
++ * Returns the hypercall error if it is non-zero and not -ENOSYS, otherwise
++ * the result of the original probe callback.
++ */
++static int pci_bus_probe_wrapper(struct device *dev)
++{
++      int r;
++      struct pci_dev *pci_dev = to_pci_dev(dev);
++      struct physdev_manage_pci manage_pci;
++      struct physdev_manage_pci_ext manage_pci_ext;
++
++#ifdef CONFIG_PCI_IOV
++      if (pci_dev->is_virtfn) {
++              memset(&manage_pci_ext, 0, sizeof(manage_pci_ext));
++              manage_pci_ext.bus = pci_dev->bus->number;
++              manage_pci_ext.devfn = pci_dev->devfn;
++              manage_pci_ext.is_virtfn = 1;
++              manage_pci_ext.physfn.bus = pci_dev->physfn->bus->number;
++              manage_pci_ext.physfn.devfn = pci_dev->physfn->devfn;
++              r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
++                                        &manage_pci_ext);
++      } else
++#endif
++      /* NB: with CONFIG_PCI_IOV this 'if' is the else-branch of the one above. */
++      if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
++              memset(&manage_pci_ext, 0, sizeof(manage_pci_ext));
++              manage_pci_ext.bus = pci_dev->bus->number;
++              manage_pci_ext.devfn = pci_dev->devfn;
++              manage_pci_ext.is_extfn = 1;
++              r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
++                                        &manage_pci_ext);
++      } else {
++              manage_pci.bus = pci_dev->bus->number;
++              manage_pci.devfn = pci_dev->devfn;
++              r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
++                                        &manage_pci);
++      }
++      /* -ENOSYS is tolerated so that probing still proceeds in that case. */
++      if (r && r != -ENOSYS)
++              return r;
++
++      r = pci_bus_probe(dev);
++      return r;
++}
++
++/*
++ * Replacement for the PCI bus type's remove callback: chain to the original
++ * remove callback, then tell the hypervisor the device is gone. Returns the
++ * original callback's result; a failing hypercall only triggers WARN_ON().
++ */
++static int pci_bus_remove_wrapper(struct device *dev)
++{
++      struct physdev_manage_pci manage_pci;
++      struct pci_dev *pdev = to_pci_dev(dev);
++      int ret;
++
++      /* Capture the address up front; the device must not be read later. */
++      manage_pci.devfn = pdev->devfn;
++      manage_pci.bus = pdev->bus->number;
++
++      ret = pci_bus_remove(dev);
++      /* dev and pdev are no longer valid!! */
++
++      WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
++                                    &manage_pci));
++
++      return ret;
++}
++
++/*
++ * Interpose the wrappers above on the PCI bus type's probe and remove
++ * callbacks, remembering the originals so the wrappers can chain to them.
++ * Only done when running on Xen as the initial domain. Always returns 0.
++ */
++static int __init hook_pci_bus(void)
++{
++      if (is_running_on_xen() && is_initial_xendomain()) {
++              pci_bus_probe = pci_bus_type.probe;
++              pci_bus_remove = pci_bus_type.remove;
++
++              pci_bus_type.probe = pci_bus_probe_wrapper;
++              pci_bus_type.remove = pci_bus_remove_wrapper;
++      }
++
++      return 0;
++}
++
++core_initcall(hook_pci_bus);
diff --cc drivers/xen/core/pcpu.c

index 0000000,0000000..1bbe3e8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/pcpu.c
@@@ -1,0 -1,0 +1,462 @@@
++/*
++ * pcpu.c - management physical cpu in dom0 environment
++ */
++#include <linux/acpi.h>
++#include <linux/cpu.h>
++#include <linux/interrupt.h>
++#include <linux/kobject.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/sysdev.h>
++#include <asm/hypervisor.h>
++#include <xen/interface/platform.h>
++#include <xen/evtchn.h>
++#include <xen/pcpu.h>
++#include <acpi/processor.h>
++
++/* Dom0-side record of one physical CPU reported by the hypervisor. */
++struct pcpu {
++      struct list_head pcpu_list;     /* link in xen_pcpus */
++      struct sys_device sysdev;       /* sysfs device; id is the Xen cpu id */
++      uint32_t xen_id;                /* copied from xenpf_pcpuinfo.xen_cpuid */
++      uint32_t apic_id;               /* copied from xenpf_pcpuinfo.apic_id */
++      uint32_t acpi_id;               /* copied from xenpf_pcpuinfo.acpi_id */
++      uint32_t flags;                 /* XEN_PCPU_FLAGS_* bits */
++};
++
++/* Report (as 0 or 1) whether XEN_PCPU_FLAGS_ONLINE is set in @flags. */
++static inline int xen_pcpu_online(uint32_t flags)
++{
++      return (flags & XEN_PCPU_FLAGS_ONLINE) != 0;
++}
++
++static DEFINE_MUTEX(xen_pcpu_lock);
++
++/*
++ * No need for irq disable since hotplug notify is in workqueue context.
++ * The expansions deliberately carry no trailing semicolon so the macros
++ * behave like ordinary function calls, e.g. inside an unbraced if/else.
++ */
++#define get_pcpu_lock() mutex_lock(&xen_pcpu_lock)
++#define put_pcpu_lock() mutex_unlock(&xen_pcpu_lock)
++
++static LIST_HEAD(xen_pcpus);
++
++static BLOCKING_NOTIFIER_HEAD(pcpu_chain);
++
++/* Encode the pcpu's Xen cpu id as the pointer-sized notifier argument. */
++static inline void *notifier_param(const struct pcpu *pcpu)
++{
++      return (void *)(unsigned long)pcpu->xen_id;
++}
++
++/*
++ * Add @nb to the pcpu notifier chain. On success, @nb is immediately called
++ * with CPU_ONLINE for every pcpu currently marked online. The whole
++ * operation runs under the pcpu lock.
++ *
++ * Returns the result of blocking_notifier_chain_register().
++ */
++int register_pcpu_notifier(struct notifier_block *nb)
++{
++      struct pcpu *cur;
++      int rc;
++
++      get_pcpu_lock();
++
++      rc = blocking_notifier_chain_register(&pcpu_chain, nb);
++      if (rc)
++              goto unlock;
++
++      /* Replay the current state to the new listener. */
++      list_for_each_entry(cur, &xen_pcpus, pcpu_list) {
++              if (!xen_pcpu_online(cur->flags))
++                      continue;
++              nb->notifier_call(nb, CPU_ONLINE, notifier_param(cur));
++      }
++
++unlock:
++      put_pcpu_lock();
++
++      return rc;
++}
++EXPORT_SYMBOL_GPL(register_pcpu_notifier);
++
++/* Remove @nb from the pcpu notifier chain, under the pcpu lock. */
++void unregister_pcpu_notifier(struct notifier_block *nb)
++{
++      get_pcpu_lock();
++      blocking_notifier_chain_unregister(&pcpu_chain, nb);
++      put_pcpu_lock();
++}
++EXPORT_SYMBOL_GPL(unregister_pcpu_notifier);
++
++/*
++ * Issue XENPF_cpu_offline for physical CPU @xen_id.
++ * Returns the platform hypercall's result.
++ */
++static int xen_pcpu_down(uint32_t xen_id)
++{
++      xen_platform_op_t op = {
++              .cmd                    = XENPF_cpu_offline,
++              .interface_version      = XENPF_INTERFACE_VERSION,
++              .u.cpu_ol.cpuid = xen_id,
++      };
++
++      return HYPERVISOR_platform_op(&op);
++}
++
++/*
++ * Issue XENPF_cpu_online for physical CPU @xen_id.
++ * Returns the platform hypercall's result.
++ */
++static int xen_pcpu_up(uint32_t xen_id)
++{
++      xen_platform_op_t op = {
++              .cmd                    = XENPF_cpu_online,
++              .interface_version      = XENPF_INTERFACE_VERSION,
++              .u.cpu_ol.cpuid = xen_id,
++      };
++
++      return HYPERVISOR_platform_op(&op);
++}
++
++/* sysfs "online" read: prints 1 or 0 from the cached flags of this pcpu. */
++static ssize_t show_online(struct sys_device *dev,
++                         struct sysdev_attribute *attr,
++                         char *buf)
++{
++      struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
++
++      return sprintf(buf, "%d\n", xen_pcpu_online(cpu->flags));
++}
++
++/*
++ * sysfs "online" write: a leading '0' requests that the pcpu be taken
++ * offline, a leading '1' that it be brought online; anything else is
++ * -EINVAL. Returns @count when the request did not fail, otherwise the
++ * negative error.
++ */
++static ssize_t store_online(struct sys_device *dev,
++                          struct sysdev_attribute *attr,
++                          const char *buf, size_t count)
++{
++      struct pcpu *target = container_of(dev, struct pcpu, sysdev);
++      char request = buf[0];
++      ssize_t rc;
++
++      if (request == '0')
++              rc = xen_pcpu_down(target->xen_id);
++      else if (request == '1')
++              rc = xen_pcpu_up(target->xen_id);
++      else
++              rc = -EINVAL;
++
++      if (rc < 0)
++              return rc;
++
++      return count;
++}
++
++static SYSDEV_ATTR(online, 0644, show_online, store_online);
++
++/* sysfs "apic_id" read: prints the cached APIC id in 0x-prefixed hex. */
++static ssize_t show_apicid(struct sys_device *dev,
++                         struct sysdev_attribute *attr,
++                         char *buf)
++{
++      struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
++
++      return sprintf(buf, "%#x\n", cpu->apic_id);
++}
++static SYSDEV_ATTR(apic_id, 0444, show_apicid, NULL);
++
++/* sysfs "acpi_id" read: prints the cached ACPI id in 0x-prefixed hex. */
++static ssize_t show_acpiid(struct sys_device *dev,
++                         struct sysdev_attribute *attr,
++                         char *buf)
++{
++      struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
++
++      return sprintf(buf, "%#x\n", cpu->acpi_id);
++}
++static SYSDEV_ATTR(acpi_id, 0444, show_acpiid, NULL);
++
++static struct sysdev_class xen_pcpu_sysdev_class = {
++      .name = "xen_pcpu",
++};
++
++/*
++ * Tear down one pcpu: remove its sysfs files, unregister its sysdev, unlink
++ * it from xen_pcpus and free it. A NULL @pcpu is accepted. Always returns 0.
++ */
++static int xen_pcpu_free(struct pcpu *pcpu)
++{
++      if (!pcpu)
++              return 0;
++
++      sysdev_remove_file(&pcpu->sysdev, &attr_online);
++      sysdev_remove_file(&pcpu->sysdev, &attr_apic_id);
++      sysdev_remove_file(&pcpu->sysdev, &attr_acpi_id);
++      sysdev_unregister(&pcpu->sysdev);
++      list_del(&pcpu->pcpu_list);
++      kfree(pcpu);
++
++      return 0;
++}
++
++/* Non-zero when @info and @pcpu agree on both the APIC id and the Xen cpu id. */
++static inline int same_pcpu(struct xenpf_pcpuinfo *info,
++                          struct pcpu *pcpu)
++{
++      if (pcpu->apic_id != info->apic_id)
++              return 0;
++
++      return pcpu->xen_id == info->xen_cpuid;
++}
++
++/*
++ * Bring the cached online flag of @pcpu in line with @info.
++ *
++ * When the state differs, the cached flag is updated, the notifier chain is
++ * called (CPU_ONLINE or CPU_DEAD) and a matching uevent is emitted.
++ *
++ * Return 1 if online status changed, 0 otherwise (including when @info
++ * describes a different Xen cpu id).
++ */
++static int xen_pcpu_online_check(struct xenpf_pcpuinfo *info,
++                               struct pcpu *pcpu)
++{
++      int now_online, was_online;
++
++      if (info->xen_cpuid != pcpu->xen_id)
++              return 0;
++
++      now_online = xen_pcpu_online(info->flags);
++      was_online = xen_pcpu_online(pcpu->flags);
++      if (now_online == was_online)
++              return 0;
++
++      if (now_online) {
++              /* the pcpu is onlined */
++              pcpu->flags |= XEN_PCPU_FLAGS_ONLINE;
++              blocking_notifier_call_chain(&pcpu_chain, CPU_ONLINE,
++                                           notifier_param(pcpu));
++              kobject_uevent(&pcpu->sysdev.kobj, KOBJ_ONLINE);
++      } else {
++              /* The pcpu is offlined now */
++              pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE;
++              blocking_notifier_call_chain(&pcpu_chain, CPU_DEAD,
++                                           notifier_param(pcpu));
++              kobject_uevent(&pcpu->sysdev.kobj, KOBJ_OFFLINE);
++      }
++
++      return 1;
++}
++
++/*
++ * Register the sysdev of @cpu and create its online/apic_id/acpi_id files.
++ *
++ * Returns 0 on success and -1 if sysdev_register() failed. Ownership of
++ * @cpu stays with the caller in both cases: this function must not free it,
++ * because init_pcpu() releases the structure itself on failure.
++ */
++static int pcpu_sysdev_init(struct pcpu *cpu)
++{
++      int error;
++
++      error = sysdev_register(&cpu->sysdev);
++      if (error) {
++              pr_warn("xen_pcpu_add: Failed to register pcpu\n");
++              return -1;
++      }
++      sysdev_create_file(&cpu->sysdev, &attr_online);
++      sysdev_create_file(&cpu->sysdev, &attr_apic_id);
++      sysdev_create_file(&cpu->sysdev, &attr_acpi_id);
++      return 0;
++}
++
++/* Find the pcpu with Xen cpu id @xen_id in xen_pcpus; NULL if there is none. */
++static struct pcpu *get_pcpu(unsigned int xen_id)
++{
++      struct pcpu *cur, *found = NULL;
++
++      list_for_each_entry(cur, &xen_pcpus, pcpu_list) {
++              if (cur->xen_id == xen_id) {
++                      found = cur;
++                      break;
++              }
++      }
++
++      return found;
++}
++
++/*
++ * Allocate a pcpu for @info, register its sysdev and append it to xen_pcpus.
++ *
++ * Returns the new pcpu, or NULL if @info is flagged invalid, the allocation
++ * failed or pcpu_sysdev_init() failed.
++ */
++static struct pcpu *init_pcpu(struct xenpf_pcpuinfo *info)
++{
++      struct pcpu *pcpu;
++
++      if (info->flags & XEN_PCPU_FLAGS_INVALID)
++              return NULL;
++
++      /* The PCPU is just added */
++      pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL);
++      if (!pcpu)
++              return NULL;
++
++      INIT_LIST_HEAD(&pcpu->pcpu_list);
++      pcpu->xen_id = info->xen_cpuid;
++      pcpu->apic_id = info->apic_id;
++      pcpu->acpi_id = info->acpi_id;
++      pcpu->flags = info->flags;
++
++      /* The sysdev id is the Xen cpu id. */
++      pcpu->sysdev.cls = &xen_pcpu_sysdev_class;
++      pcpu->sysdev.id = info->xen_cpuid;
++
++      if (pcpu_sysdev_init(pcpu)) {
++              kfree(pcpu);
++              return NULL;
++      }
++
++      list_add_tail(&pcpu->pcpu_list, &xen_pcpus);
++      return pcpu;
++}
++
++/* Values stored in *result by _sync_pcpu(); -1 is stored on failure. */
++#define PCPU_NO_CHANGE                        0
++#define PCPU_ADDED                    1
++#define PCPU_ONLINE_OFFLINE           2
++#define PCPU_REMOVED                  3
++/*
++ * Query the hypervisor about physical CPU @cpu_num and update the matching
++ * entry of xen_pcpus (creating or freeing it as needed).
++ *
++ * Caller should hold the pcpu lock
++ *
++ * @max_id: if not NULL, receives the max_present value from the reply.
++ * @result: receives the outcome:
++ * < 0: Something wrong
++ * 0: No changes
++ * > 0: State changed
++ *
++ * Returns the pcpu for @cpu_num, or NULL if the hypercall failed, the cpu is
++ * flagged invalid, or a new pcpu could not be initialised.
++ */
++static struct pcpu *_sync_pcpu(unsigned int cpu_num, unsigned int *max_id,
++                             int *result)
++{
++      struct pcpu *pcpu;
++      struct xenpf_pcpuinfo *info;
++      xen_platform_op_t op = {
++              .cmd                = XENPF_get_cpuinfo,
++              .interface_version  = XENPF_INTERFACE_VERSION,
++      };
++      int ret;
++
++      *result = -1;
++
++      info = &op.u.pcpu_info;
++      info->xen_cpuid = cpu_num;
++
++      /* Repeat the query for as long as the hypervisor answers -EBUSY. */
++      do {
++              ret = HYPERVISOR_platform_op(&op);
++      } while (ret == -EBUSY);
++      if (ret)
++              return NULL;
++
++      if (max_id)
++              *max_id = op.u.pcpu_info.max_present;
++
++      pcpu = get_pcpu(cpu_num);
++
++      if (info->flags & XEN_PCPU_FLAGS_INVALID) {
++              /* The pcpu has been removed */
++              *result = PCPU_NO_CHANGE;
++              if (pcpu) {
++                      xen_pcpu_free(pcpu);
++                      *result = PCPU_REMOVED;
++              }
++              return NULL;
++      }
++
++
++      if (!pcpu) {
++              *result = PCPU_ADDED;
++              pcpu = init_pcpu(info);
++              if (pcpu == NULL) {
++                      pr_warn("Failed to init pcpu %x\n", info->xen_cpuid);
++                      *result = -1;
++              }
++      } else {
++              *result = PCPU_NO_CHANGE;
++              /*
++               * Old PCPU is replaced with a new pcpu, this means
++               * several virq is missed, will it happen?
++               */
++              if (!same_pcpu(info, pcpu)) {
++                      pr_warn("Pcpu %x changed!\n", pcpu->xen_id);
++                      pcpu->apic_id = info->apic_id;
++                      pcpu->acpi_id = info->acpi_id;
++              }
++              if (xen_pcpu_online_check(info, pcpu))
++                      *result = PCPU_ONLINE_OFFLINE;
++      }
++      return pcpu;
++}
++
++/*
++ * Sync dom0's pcpu information with xen hypervisor's
++ *
++ * Walks the Xen cpu ids from 0 up to the max_present value reported by the
++ * hypervisor, syncing each one under the pcpu lock. If any cpu fails to
++ * sync, the walk stops and every pcpu known so far is freed.
++ *
++ * Returns 0 on success, or the negative result of the failed sync so that
++ * callers checking for "< 0" actually see the failure.
++ */
++static int xen_sync_pcpus(void)
++{
++      /*
++       * Boot cpu always have cpu_id 0 in xen
++       */
++      unsigned int cpu_num = 0, max_id = 0;
++      int result = 0;
++      struct pcpu *pcpu;
++
++      get_pcpu_lock();
++
++      while ((result >= 0) && (cpu_num <= max_id)) {
++              pcpu = _sync_pcpu(cpu_num, &max_id, &result);
++
++              switch (result) {
++              case PCPU_NO_CHANGE:
++              case PCPU_ADDED:
++              case PCPU_ONLINE_OFFLINE:
++              case PCPU_REMOVED:
++                      break;
++              default:
++                      pr_warn("Failed to sync pcpu %x\n", cpu_num);
++                      break;
++              }
++              cpu_num++;
++      }
++
++      if (result < 0) {
++              struct pcpu *tmp;
++
++              /* Do not keep a partial view: drop everything gathered so far. */
++              list_for_each_entry_safe(pcpu, tmp, &xen_pcpus, pcpu_list)
++                      xen_pcpu_free(pcpu);
++      }
++
++      put_pcpu_lock();
++
++      return result < 0 ? result : 0;
++}
++
++/* Work item body: resync the pcpu list and warn if that reports an error. */
++static void xen_pcpu_dpc(struct work_struct *work)
++{
++      if (xen_sync_pcpus() < 0)
++              pr_warn("xen_pcpu_dpc: Failed to sync pcpu information\n");
++}
++static DECLARE_WORK(xen_pcpu_work, xen_pcpu_dpc);
++
++/* Interrupt handler bound to VIRQ_PCPU_STATE: defers the resync to xen_pcpu_work. */
++static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id)
++{
++      schedule_work(&xen_pcpu_work);
++
++      return IRQ_HANDLED;
++}
++
++#ifdef CONFIG_ACPI_HOTPLUG_CPU
++
++/*
++ * Hotplug notification entry point: schedules a pcpu resync.
++ * @type is not used. Always returns 0.
++ */
++int xen_pcpu_hotplug(int type)
++{
++      schedule_work(&xen_pcpu_work);
++
++      return 0;
++}
++EXPORT_SYMBOL_GPL(xen_pcpu_hotplug);
++
++/*
++ * Look up the Xen cpu id of the physical CPU whose ACPI id (@is_acpiid true)
++ * or APIC id (@is_acpiid false) equals @id.
++ *
++ * Queries the hypervisor for every cpu id from 0 up to the largest
++ * max_present value seen so far; ids whose query fails are skipped.
++ * Returns the matching Xen cpu id, or -1 if there is none.
++ */
++int xen_pcpu_index(uint32_t id, bool is_acpiid)
++{
++      unsigned int cpu_num, max_id;
++      xen_platform_op_t op = {
++              .cmd                = XENPF_get_cpuinfo,
++              .interface_version  = XENPF_INTERFACE_VERSION,
++      };
++      struct xenpf_pcpuinfo *info = &op.u.pcpu_info;
++
++      for (max_id = cpu_num = 0; cpu_num <= max_id; ++cpu_num) {
++              int ret;
++
++              info->xen_cpuid = cpu_num;
++              /* Repeat the query for as long as the hypervisor answers -EBUSY. */
++              do {
++                      ret = HYPERVISOR_platform_op(&op);
++              } while (ret == -EBUSY);
++              if (ret)
++                      continue;
++
++              /* The upper bound only ever grows during the walk. */
++              if (info->max_present > max_id)
++                      max_id = info->max_present;
++              if (id == (is_acpiid ? info->acpi_id : info->apic_id))
++                      return cpu_num;
++      }
++
++      return -1;
++}
++EXPORT_SYMBOL_GPL(xen_pcpu_index);
++
++#endif /* CONFIG_ACPI_HOTPLUG_CPU */
++
++/*
++ * Initcall for the initial domain only: register the xen_pcpu sysdev class,
++ * do a first sync with the hypervisor and, if any pcpu was found, bind the
++ * VIRQ_PCPU_STATE handler so that later changes trigger a resync.
++ *
++ * Returns 0 on success or a negative error.
++ */
++static int __init xen_pcpu_init(void)
++{
++      int err;
++
++      if (!is_initial_xendomain())
++              return 0;
++
++      err = sysdev_class_register(&xen_pcpu_sysdev_class);
++      if (err) {
++              pr_warn("xen_pcpu_init: "
++                      "Failed to register sysdev class (%d)\n", err);
++              return err;
++      }
++
++      xen_sync_pcpus();
++
++      if (!list_empty(&xen_pcpus))
++              err = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0,
++                                            xen_pcpu_interrupt, 0,
++                                            "pcpu", NULL);
++      if (err < 0) {
++              pr_warn("xen_pcpu_init: "
++                      "Failed to bind pcpu_state virq (%d)\n", err);
++              return err;
++      }
++
++      /*
++       * Only negative values are errors here; a non-negative result from
++       * the bind call must not leak out as a non-zero initcall status.
++       */
++      return 0;
++}
++subsys_initcall(xen_pcpu_init);
diff --cc drivers/xen/core/reboot.c

index 0000000,0000000..ce7a305

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/reboot.c
@@@ -1,0 -1,0 +1,351 @@@
++#include <linux/version.h>
++#include <linux/kernel.h>
++#include <linux/unistd.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/reboot.h>
++#include <linux/sched.h>
++#include <linux/sysrq.h>
++#include <asm/hypervisor.h>
++#include <xen/xenbus.h>
++#include <xen/evtchn.h>
++#include <linux/kmod.h>
++#include <linux/slab.h>
++#include <linux/workqueue.h>
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#undef handle_sysrq
++#endif
++
++MODULE_LICENSE("Dual BSD/GPL");
++
++#define SHUTDOWN_INVALID  -1
++#define SHUTDOWN_POWEROFF  0
++#define SHUTDOWN_SUSPEND   2
++#define SHUTDOWN_RESUMING  3
++#define SHUTDOWN_HALT      4
++
++/* Ignore multiple shutdown requests. */
++static int shutting_down = SHUTDOWN_INVALID;
++
++/* Can we leave APs online when we suspend? */
++static int fast_suspend;
++
++static void __shutdown_handler(struct work_struct *unused);
++static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
++
++int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
++
++/*
++ * Kernel thread body for poweroff/halt requests: run /sbin/poweroff via the
++ * usermode helper and, if that fails under CONFIG_XEN, power off directly
++ * through sys_reboot(). Resets shutting_down afterwards so that a later
++ * request can be processed. Always returns 0.
++ */
++static int shutdown_process(void *__unused)
++{
++      static char *envp[] = { "HOME=/", "TERM=linux",
++                              "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
++      static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
++
++      extern asmlinkage long sys_reboot(int magic1, int magic2,
++                                        unsigned int cmd, void *arg);
++
++      if ((shutting_down == SHUTDOWN_POWEROFF) ||
++          (shutting_down == SHUTDOWN_HALT)) {
++              if (call_usermodehelper("/sbin/poweroff", poweroff_argv,
++                                      envp, 0) < 0) {
++#ifdef CONFIG_XEN
++                      sys_reboot(LINUX_REBOOT_MAGIC1,
++                                 LINUX_REBOOT_MAGIC2,
++                                 LINUX_REBOOT_CMD_POWER_OFF,
++                                 NULL);
++#endif /* CONFIG_XEN */
++              }
++      }
++
++      shutting_down = SHUTDOWN_INVALID; /* could try again */
++
++      return 0;
++}
++
++#ifdef CONFIG_PM_SLEEP
++
++static int setup_suspend_evtchn(void);
++
++/* Was last suspend request cancelled? */
++static int suspend_cancelled;
++
++/*
++ * Callback passed to __xen_suspend(): moves shutting_down from
++ * SHUTDOWN_SUSPEND to SHUTDOWN_RESUMING (any other previous state is a bug)
++ * and records whether the suspend was cancelled.
++ */
++static void xen_resume_notifier(int _suspend_cancelled)
++{
++      int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
++      BUG_ON(old_state != SHUTDOWN_SUSPEND);
++      suspend_cancelled = _suspend_cancelled;
++}
++
++/*
++ * Kernel thread body for suspend requests. Pins itself to CPU 0 and calls
++ * __xen_suspend(), repeating for as long as a new suspend request arrived
++ * while the previous one was resuming. Always returns 0.
++ */
++static int xen_suspend(void *__unused)
++{
++      int err, old_state;
++
++      daemonize("suspend");
++      err = set_cpus_allowed_ptr(current, cpumask_of(0));
++      if (err) {
++              pr_err("Xen suspend can't run on CPU0 (%d)\n", err);
++              goto fail;
++      }
++
++      do {
++              err = __xen_suspend(fast_suspend, xen_resume_notifier);
++              if (err) {
++                      pr_err("Xen suspend failed (%d)\n", err);
++                      goto fail;
++              }
++              if (!suspend_cancelled)
++                      setup_suspend_evtchn();
++              /*
++               * Leave the RESUMING state. If the state was changed in the
++               * meantime, the cmpxchg fails and old_state holds the newly
++               * requested state.
++               */
++              old_state = cmpxchg(
++                      &shutting_down, SHUTDOWN_RESUMING, SHUTDOWN_INVALID);
++      } while (old_state == SHUTDOWN_SUSPEND);
++
++      switch (old_state) {
++      case SHUTDOWN_INVALID:
++      case SHUTDOWN_SUSPEND:
++              BUG();
++      case SHUTDOWN_RESUMING:
++              break;
++      default:
++              /* Another kind of request came in while resuming: run it now. */
++              schedule_delayed_work(&shutdown_work, 0);
++              break;
++      }
++
++      return 0;
++
++ fail:
++      old_state = xchg(&shutting_down, SHUTDOWN_INVALID);
++      BUG_ON(old_state != SHUTDOWN_SUSPEND);
++      return 0;
++}
++
++#else
++# define xen_suspend NULL
++#endif
++
++/*
++ * Atomically move shutting_down to @new_state, but only from the INVALID or
++ * RESUMING states; a request arriving in any other state is dropped.
++ * Coming from INVALID the shutdown work is scheduled here; coming from
++ * RESUMING it is left to xen_suspend().
++ */
++static void switch_shutdown_state(int new_state)
++{
++      int prev_state, old_state = SHUTDOWN_INVALID;
++
++      /* We only drive shutting_down into an active state. */
++      if (new_state == SHUTDOWN_INVALID)
++              return;
++
++      do {
++              /* We drop this transition if already in an active state. */
++              if ((old_state != SHUTDOWN_INVALID) &&
++                  (old_state != SHUTDOWN_RESUMING))
++                      return;
++              /* Attempt to transition. */
++              prev_state = old_state;
++              old_state = cmpxchg(&shutting_down, old_state, new_state);
++      } while (old_state != prev_state);
++
++      /* Either we kick off the work, or we leave it to xen_suspend(). */
++      if (old_state == SHUTDOWN_INVALID)
++              schedule_delayed_work(&shutdown_work, 0);
++      else
++              BUG_ON(old_state != SHUTDOWN_RESUMING);
++}
++
++/*
++ * Delayed-work body: start a kernel thread running xen_suspend() for a
++ * suspend request or shutdown_process() otherwise. If the thread cannot be
++ * created, the work is rescheduled to try again after HZ/2 jiffies.
++ */
++static void __shutdown_handler(struct work_struct *unused)
++{
++      int err;
++
++      err = kernel_thread((shutting_down == SHUTDOWN_SUSPEND) ?
++                          xen_suspend : shutdown_process,
++                          NULL, CLONE_FS | CLONE_FILES);
++
++      if (err < 0) {
++              pr_warning("Error creating shutdown process (%d): "
++                         "retrying...\n", -err);
++              schedule_delayed_work(&shutdown_work, HZ/2);
++      }
++}
++
++/*
++ * Xenbus watch callback for "control/shutdown": read the requested action,
++ * clear the node within the same transaction and act on it. "poweroff",
++ * "halt" and (with CONFIG_PM_SLEEP) "suspend" go through
++ * switch_shutdown_state(); "reboot" calls ctrl_alt_del(); anything else is
++ * logged and ignored. Does nothing while a request is already active.
++ */
++static void shutdown_handler(struct xenbus_watch *watch,
++                           const char **vec, unsigned int len)
++{
++      extern void ctrl_alt_del(void);
++      char *str;
++      struct xenbus_transaction xbt;
++      int err, new_state = SHUTDOWN_INVALID;
++
++      if ((shutting_down != SHUTDOWN_INVALID) &&
++          (shutting_down != SHUTDOWN_RESUMING))
++              return;
++
++ again:
++      err = xenbus_transaction_start(&xbt);
++      if (err)
++              return;
++
++      str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
++      /* Ignore read errors and empty reads. */
++      if (XENBUS_IS_ERR_READ(str)) {
++              xenbus_transaction_end(xbt, 1);
++              return;
++      }
++
++      /* Clear the node so the same request is not seen again. */
++      xenbus_write(xbt, "control", "shutdown", "");
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err == -EAGAIN) {
++              /* Start over with a fresh read; the old string is released. */
++              kfree(str);
++              goto again;
++      }
++
++      if (strcmp(str, "poweroff") == 0)
++              new_state = SHUTDOWN_POWEROFF;
++      else if (strcmp(str, "reboot") == 0)
++              ctrl_alt_del();
++#ifdef CONFIG_PM_SLEEP
++      else if (strcmp(str, "suspend") == 0)
++              new_state = SHUTDOWN_SUSPEND;
++#endif
++      else if (strcmp(str, "halt") == 0)
++              new_state = SHUTDOWN_HALT;
++      else
++              pr_warning("Ignoring shutdown request: %s\n", str);
++
++      /* A no-op when new_state is still SHUTDOWN_INVALID. */
++      switch_shutdown_state(new_state);
++
++      kfree(str);
++}
++
++/*
++ * Xenbus watch callback for "control/sysrq".
++ *
++ * Reads the requested key and, if one is set, clears the node inside one
++ * transaction (restarted on -EAGAIN).  The key is then passed to
++ * handle_sysrq() when CONFIG_MAGIC_SYSRQ is enabled.
++ */
++static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
++                        unsigned int len)
++{
++      char sysrq_key = '\0';
++      struct xenbus_transaction xbt;
++      int err;
++
++ again:
++      err = xenbus_transaction_start(&xbt);
++      if (err)
++              return;
++
++      /*
++       * xenbus_scanf() reports failure as a negative errno, so testing
++       * its result with '!' never catches a failed read.  -ENOENT (node
++       * absent) and -ERANGE (empty value, e.g. after the acknowledgement
++       * written below) are expected and are not logged.
++       */
++      err = xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key);
++      if (err <= 0) {
++              if (err != -ENOENT && err != -ERANGE)
++                      pr_err("Unable to read sysrq code in control/sysrq\n");
++              xenbus_transaction_end(xbt, 1);
++              return;
++      }
++
++      /* Acknowledge the request by resetting the node. */
++      if (sysrq_key != '\0')
++              xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err == -EAGAIN)
++              goto again;
++
++#ifdef CONFIG_MAGIC_SYSRQ
++      if (sysrq_key != '\0')
++              handle_sysrq(sysrq_key);
++#endif
++}
++
++/* Xenbus watch pairing the "control/shutdown" node with shutdown_handler(). */
++static struct xenbus_watch shutdown_watch = {
++      .node = "control/shutdown",
++      .callback = shutdown_handler
++};
++
++/* Xenbus watch pairing the "control/sysrq" node with sysrq_handler(). */
++static struct xenbus_watch sysrq_watch = {
++      .node = "control/sysrq",
++      .callback = sysrq_handler
++};
++
++#ifdef CONFIG_PM_SLEEP
++/* Handler for the suspend event channel irq: request a suspend. */
++static irqreturn_t suspend_int(int irq, void *dev_id)
++{
++      switch_shutdown_state(SHUTDOWN_SUSPEND);
++
++      return IRQ_HANDLED;
++}
++
++/*
++ * (Re)bind the irq used to request a suspend and write the port number of
++ * its event channel to the "event-channel" node under "device/suspend".
++ * An irq bound by an earlier call is released first.
++ *
++ * Returns 0 on success, -1 if no irq could be bound.
++ */
++static int setup_suspend_evtchn(void)
++{
++      static int irq;
++      char portstr[16];
++      int port;
++
++      if (irq > 0)
++              unbind_from_irqhandler(irq, NULL);
++
++      irq = bind_listening_port_to_irqhandler(0, suspend_int, 0, "suspend",
++                                              NULL);
++      if (irq <= 0)
++              return -1;
++
++      port = irq_to_evtchn_port(irq);
++      pr_info("suspend: event channel %d\n", port);
++
++      snprintf(portstr, sizeof(portstr), "%d", port);
++      xenbus_write(XBT_NIL, "device/suspend", "event-channel", portstr);
++
++      return 0;
++}
++#else
++#define setup_suspend_evtchn() 0
++#endif
++
++/*
++ * Register the xenbus watches used for remote control of this domain.
++ *
++ * The sysrq watch is always registered.  Unless is_initial_xendomain() is
++ * true, fast_suspend is also read from xenstore, the shutdown watch is
++ * registered and the suspend event channel is set up.
++ *
++ * Returns 0 on success or the error of the first step that failed; steps
++ * completed before the failure are not undone.
++ */
++static int setup_shutdown_watcher(void)
++{
++      int err;
++
++      err = register_xenbus_watch(&sysrq_watch);
++      if (err) {
++              pr_err("Failed to set sysrq watcher\n");
++              return err;
++      }
++
++      if (is_initial_xendomain())
++              return 0;
++
++      /* Result deliberately unchecked: fast_suspend keeps its value on failure. */
++      xenbus_scanf(XBT_NIL, "control",
++                   "platform-feature-multiprocessor-suspend",
++                   "%d", &fast_suspend);
++
++      err = register_xenbus_watch(&shutdown_watch);
++      if (err) {
++              pr_err("Failed to set shutdown watcher\n");
++              return err;
++      }
++
++      /* suspend event channel */
++      err = setup_suspend_evtchn();
++      if (err) {
++              pr_err("Failed to register suspend event channel\n");
++              return err;
++      }
++
++      return 0;
++}
++
++#ifdef CONFIG_XEN
++
++/*
++ * Xenstore notifier callback: set up the watches.  The result of
++ * setup_shutdown_watcher() is not propagated; NOTIFY_DONE is always
++ * returned.
++ */
++static int shutdown_event(struct notifier_block *notifier,
++                        unsigned long event,
++                        void *data)
++{
++      setup_shutdown_watcher();
++      return NOTIFY_DONE;
++}
++
++/*
++ * Initcall: hook shutdown_event() into the xenstore notifier chain.
++ * Always returns 0.
++ */
++static int __init setup_shutdown_event(void)
++{
++      static struct notifier_block xenstore_notifier = {
++              .notifier_call = shutdown_event
++      };
++      register_xenstore_notifier(&xenstore_notifier);
++
++      return 0;
++}
++
++subsys_initcall(setup_shutdown_event);
++
++#else /* !defined(CONFIG_XEN) */
++
++/*
++ * Entry point used when CONFIG_XEN is not defined: set up the watches
++ * directly and return the result of setup_shutdown_watcher().
++ */
++int xen_reboot_init(void)
++{
++      return setup_shutdown_watcher();
++}
++
++#endif /* !defined(CONFIG_XEN) */
diff --cc drivers/xen/core/smpboot.c

index 0000000,0000000..13f21e2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/smpboot.c
@@@ -1,0 -1,0 +1,399 @@@
++/*
++ *    Xen SMP booting functions
++ *
++ *    See arch/i386/kernel/smpboot.c for copyright and credits for derived
++ *    portions of this file.
++ */
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/mm.h>
++#include <linux/sched.h>
++#include <linux/kernel_stat.h>
++#include <linux/irq.h>
++#include <linux/bootmem.h>
++#include <linux/notifier.h>
++#include <linux/cpu.h>
++#include <linux/percpu.h>
++#include <asm/desc.h>
++#include <asm/pgalloc.h>
++#include <xen/clock.h>
++#include <xen/evtchn.h>
++#include <xen/interface/vcpu.h>
++#include <xen/cpu_hotplug.h>
++#include <xen/xenbus.h>
++
++extern int local_setup_timer(unsigned int cpu);
++extern void local_teardown_timer(unsigned int cpu);
++
++extern void hypervisor_callback(void);
++extern void failsafe_callback(void);
++extern void system_call(void);
++extern void smp_trap_init(trap_info_t *);
++
++cpumask_var_t vcpu_initialized_mask;
++
++DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
++EXPORT_PER_CPU_SYMBOL(cpu_info);
++
++static int __read_mostly ipi_irq = -1;
++
++/*
++ * Populate the possible-CPU map by probing the hypervisor.
++ *
++ * Does nothing if any CPU other than the current one is already marked
++ * possible.  Otherwise every index for which VCPUOP_is_up returns a
++ * non-negative value is marked possible and nr_cpu_ids is raised to
++ * cover it.  total_cpus additionally counts indices beyond the scanned
++ * range for which the hypercall keeps returning a non-negative value.
++ */
++void __init prefill_possible_map(void)
++{
++      int i, rc;
++
++      /* Bail out if the map was already filled in. */
++      for_each_possible_cpu(i)
++          if (i != smp_processor_id())
++              return;
++
++      for (i = 0; i < NR_CPUS; i++) {
++#ifndef CONFIG_HOTPLUG_CPU
++              if (i >= setup_max_cpus)
++                      break;
++#endif
++              rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
++              if (rc >= 0) {
++                      set_cpu_possible(i, true);
++                      nr_cpu_ids = i + 1;
++              }
++      }
++      total_cpus = num_possible_cpus();
++      /* Continue from where the loop above stopped; i is not reset. */
++      for (; HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL) >= 0; ++i)
++              if (i != smp_processor_id())
++                      ++total_cpus;
++}
++
++/*
++ * Handler for the per-CPU IPI irq.
++ *
++ * Dispatches every bit set in this CPU's ipi_pending bitmap to the
++ * matching entry of handlers[].  Returns IRQ_HANDLED if at least one IPI
++ * was dispatched, IRQ_NONE otherwise.
++ */
++static irqreturn_t ipi_interrupt(int irq, void *dev_id)
++{
++      static void (*const handlers[])(struct pt_regs *) = {
++              [RESCHEDULE_VECTOR] = smp_reschedule_interrupt,
++              [CALL_FUNCTION_VECTOR] = smp_call_function_interrupt,
++              [CALL_FUNC_SINGLE_VECTOR] = smp_call_function_single_interrupt,
++              [REBOOT_VECTOR] = smp_reboot_interrupt,
++#ifdef CONFIG_IRQ_WORK
++              [IRQ_WORK_VECTOR] = smp_irq_work_interrupt,
++#endif
++      };
++      unsigned long *pending = __get_cpu_var(ipi_pending);
++      struct pt_regs *regs = get_irq_regs();
++      irqreturn_t ret = IRQ_NONE;
++
++      for (;;) {
++              unsigned int ipi = find_first_bit(pending, NR_IPIS);
++
++              /*
++               * Nothing pending: clear the event channel, then look once
++               * more in case a bit was set in the meantime.
++               */
++              if (ipi >= NR_IPIS) {
++                      clear_ipi_evtchn();
++                      ipi = find_first_bit(pending, NR_IPIS);
++              }
++              if (ipi >= NR_IPIS)
++                      return ret;
++              ret = IRQ_HANDLED;
++              /* The bit is cleared before its handler runs. */
++              do {
++                      clear_bit(ipi, pending);
++                      handlers[ipi](regs);
++                      ipi = find_next_bit(pending, NR_IPIS, ipi);
++              } while (ipi < NR_IPIS);
++      }
++}
++
++/*
++ * Set up the per-CPU interrupt resources of @cpu: the IPI irq, the
++ * spinlock poll event channel and, for every CPU other than 0, the local
++ * timer.  What was set up before a failing step is torn down again.
++ *
++ * Returns 0 on success or the error value of the failing step.
++ */
++static int __cpuinit xen_smp_intr_init(unsigned int cpu)
++{
++      static struct irqaction ipi_action = {
++              .handler = ipi_interrupt,
++              .flags   = IRQF_DISABLED,
++              .name    = "ipi"
++      };
++      int rc = bind_ipi_to_irqaction(cpu, &ipi_action);
++
++      if (rc < 0)
++              return rc;
++
++      /* The first binding records the irq; later ones must match it. */
++      if (ipi_irq >= 0)
++              BUG_ON(ipi_irq != rc);
++      else
++              ipi_irq = rc;
++
++      rc = xen_spinlock_init(cpu);
++      if (rc < 0)
++              goto unbind_ipi;
++
++      if (cpu != 0) {
++              rc = local_setup_timer(cpu);
++              if (rc != 0)
++                      goto fail;
++      }
++
++      return 0;
++
++ fail:
++      xen_spinlock_cleanup(cpu);
++ unbind_ipi:
++      unbind_from_per_cpu_irq(ipi_irq, cpu, NULL);
++      return rc;
++}
++
++#ifdef CONFIG_HOTPLUG_CPU
++/*
++ * Release what xen_smp_intr_init() set up for @cpu: the local timer
++ * (CPUs other than 0 only), the IPI irq binding and the spinlock poll
++ * event channel.
++ */
++static void __cpuinit xen_smp_intr_exit(unsigned int cpu)
++{
++      if (cpu != 0)
++              local_teardown_timer(cpu);
++
++      unbind_from_per_cpu_irq(ipi_irq, cpu, NULL);
++      xen_spinlock_cleanup(cpu);
++}
++#endif
++
++/*
++ * Per-CPU initialisation run on the CPU being brought up; it ends by
++ * enabling interrupts, with preemption left disabled.
++ */
++static void __cpuinit cpu_bringup(void)
++{
++      cpu_init();
++      identify_secondary_cpu(__this_cpu_ptr(&cpu_info));
++      touch_softlockup_watchdog();
++      preempt_disable();
++      xen_setup_cpu_clockevents();
++      local_irq_enable();
++}
++
++/*
++ * Entry point installed as the initial instruction pointer of a new vCPU
++ * by cpu_initialize_context(): bring the CPU up, then enter cpu_idle().
++ */
++static void __cpuinit cpu_bringup_and_idle(void)
++{
++      cpu_bringup();
++      cpu_idle();
++}
++
++/*
++ * Build the initial register context for @cpu and register it with the
++ * hypervisor through VCPUOP_initialise.
++ *
++ * The work is done only once per CPU: a CPU already present in
++ * vcpu_initialized_mask is skipped.  A failing hypercall is fatal (BUG()).
++ */
++static void __cpuinit cpu_initialize_context(unsigned int cpu)
++{
++      /* vcpu_guest_context_t is too large to allocate on the stack.
++       * Hence we allocate statically and protect it with a lock */
++      static vcpu_guest_context_t ctxt;
++      static DEFINE_SPINLOCK(ctxt_lock);
++
++      struct task_struct *idle = idle_task(cpu);
++
++      if (cpumask_test_and_set_cpu(cpu, vcpu_initialized_mask))
++              return;
++
++      spin_lock(&ctxt_lock);
++
++      memset(&ctxt, 0, sizeof(ctxt));
++
++      ctxt.flags = VGCF_IN_KERNEL;
++      ctxt.user_regs.ds = __USER_DS;
++      ctxt.user_regs.es = __USER_DS;
++      ctxt.user_regs.ss = __KERNEL_DS;
++      /* The new vCPU starts executing in cpu_bringup_and_idle(). */
++      ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle;
++      ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
++
++      smp_trap_init(ctxt.trap_ctxt);
++
++      ctxt.gdt_frames[0] = arbitrary_virt_to_mfn(get_cpu_gdt_table(cpu));
++      ctxt.gdt_ents = GDT_SIZE / 8;
++
++      /* Stack pointers are derived from the idle task of this CPU. */
++      ctxt.user_regs.cs = __KERNEL_CS;
++      ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
++
++      ctxt.kernel_ss = __KERNEL_DS;
++      ctxt.kernel_sp = idle->thread.sp0;
++
++      ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
++      ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
++#ifdef __i386__
++      ctxt.event_callback_cs     = __KERNEL_CS;
++      ctxt.failsafe_callback_cs  = __KERNEL_CS;
++
++      ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
++
++      ctxt.user_regs.fs = __KERNEL_PERCPU;
++      ctxt.user_regs.gs = __KERNEL_STACK_CANARY;
++#else /* __x86_64__ */
++      ctxt.syscall_callback_eip  = (unsigned long)system_call;
++
++      ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
++
++      ctxt.gs_base_kernel = per_cpu_offset(cpu);
++#endif
++
++      if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt))
++              BUG();
++
++      spin_unlock(&ctxt_lock);
++}
++
++/*
++ * Prepare all secondary CPUs for a later __cpu_up().
++ *
++ * Sets up CPU 0's interrupt resources and vcpu_initialized_mask, trims
++ * the possible map down to @max_cpus (never below one CPU), and for each
++ * remaining secondary CPU forks its idle task, makes its GDT page
++ * read-only and initialises its per-CPU data.
++ *
++ * NOTE(review): apicid is assigned here and in the loop below but is
++ * never read within this function.
++ */
++void __init smp_prepare_cpus(unsigned int max_cpus)
++{
++      unsigned int cpu;
++      struct task_struct *idle;
++      int apicid;
++      struct vcpu_get_physid cpu_id;
++      void *gdt_addr;
++
++      apicid = 0;
++      if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, 0, &cpu_id) == 0)
++              apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
++      cpu_data(0) = boot_cpu_data;
++      current_thread_info()->cpu = 0;
++
++      if (xen_smp_intr_init(0))
++              BUG();
++
++      if (!alloc_cpumask_var(&vcpu_initialized_mask, GFP_KERNEL))
++              BUG();
++      cpumask_copy(vcpu_initialized_mask, cpumask_of(0));
++
++      /* Restrict the possible_map according to max_cpus. */
++      while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
++              /* Drop the highest-numbered possible CPU. */
++              for (cpu = nr_cpu_ids-1; !cpu_possible(cpu); cpu--)
++                      continue;
++              set_cpu_possible(cpu, false);
++      }
++
++      for_each_possible_cpu (cpu) {
++              if (cpu == 0)
++                      continue;
++
++              idle = fork_idle(cpu);
++              if (IS_ERR(idle))
++                      panic("failed fork for CPU %d", cpu);
++
++              gdt_addr = get_cpu_gdt_table(cpu);
++              make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
++
++              apicid = cpu;
++              if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0)
++                      apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
++              cpu_data(cpu) = boot_cpu_data;
++              cpu_data(cpu).cpu_index = cpu;
++
++#ifdef __x86_64__
++              clear_tsk_thread_flag(idle, TIF_FORK);
++              per_cpu(kernel_stack, cpu) =
++                      (unsigned long)task_stack_page(idle) -
++                      KERNEL_STACK_OFFSET + THREAD_SIZE;
++#endif
++              per_cpu(current_task, cpu) = idle;
++
++              irq_ctx_init(cpu);
++
++              /*
++               * With CONFIG_HOTPLUG_CPU the CPU is marked present here
++               * only in the initial domain; without it, always.
++               */
++#ifdef CONFIG_HOTPLUG_CPU
++              if (is_initial_xendomain())
++#endif
++                      set_cpu_present(cpu, true);
++      }
++
++      init_xenbus_allowed_cpumask();
++
++#ifdef CONFIG_X86_IO_APIC
++      /*
++       * Here we can be sure that there is an IO-APIC in the system. Let's
++       * go and set it up:
++       */
++      if (cpu_has_apic && !skip_ioapic_setup && nr_ioapics)
++              setup_IO_APIC();
++#endif
++}
++
++/*
++ * Boot-CPU preparation: switch to the new GDT, fill in the possible-CPU
++ * map and call setup_vcpu_info() for every possible CPU other than the
++ * current one.
++ */
++void __init smp_prepare_boot_cpu(void)
++{
++      unsigned int cpu;
++
++      switch_to_new_gdt(smp_processor_id());
++      prefill_possible_map();
++      for_each_possible_cpu(cpu)
++              if (cpu != smp_processor_id())
++                      setup_vcpu_info(cpu);
++}
++
++#ifdef CONFIG_HOTPLUG_CPU
++
++/*
++ * Initialize cpu_present_map late to skip SMP boot code in init/main.c.
++ * But do it early enough to catch critical for_each_present_cpu() loops
++ * in i386-specific code.
++ *
++ * Marks every possible CPU as present.  Always returns 0.
++ */
++static int __init initialize_cpu_present_map(void)
++{
++      unsigned int cpu;
++
++      for_each_possible_cpu(cpu)
++              set_cpu_present(cpu, true);
++
++      return 0;
++}
++core_initcall(initialize_cpu_present_map);
++
++/*
++ * Take the current CPU out of the online map and call fixup_irqs().
++ *
++ * Returns 0 on success, -EBUSY for CPU 0, which cannot be disabled.
++ */
++int __cpuinit __cpu_disable(void)
++{
++      unsigned int cpu = smp_processor_id();
++
++      if (cpu == 0)
++              return -EBUSY;
++
++      set_cpu_online(cpu, false);
++      fixup_irqs();
++
++      return 0;
++}
++
++/*
++ * Wait for @cpu to go down, then release its interrupt resources.
++ *
++ * VCPUOP_is_up is polled every HZ/10 until it returns 0.  When only one
++ * CPU is left online, alternatives_smp_switch(0) is called.
++ *
++ * NOTE(review): any non-zero hypercall result, including a negative
++ * error, keeps the loop waiting.
++ */
++void __cpuinit __cpu_die(unsigned int cpu)
++{
++      while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
++              current->state = TASK_UNINTERRUPTIBLE;
++              schedule_timeout(HZ/10);
++      }
++
++      xen_smp_intr_exit(cpu);
++
++      if (num_online_cpus() == 1)
++              alternatives_smp_switch(0);
++}
++
++#endif /* CONFIG_HOTPLUG_CPU */
++
++/*
++ * Bring @cpu online.
++ *
++ * Registers the vCPU context, sets up the CPU's interrupt resources,
++ * marks it online and starts it with VCPUOP_up.
++ *
++ * Returns 0 on success, or the error from cpu_up_check() or
++ * xen_smp_intr_init().  A failing VCPUOP_up is fatal (BUG_ON).
++ */
++int __cpuinit __cpu_up(unsigned int cpu)
++{
++      int rc;
++
++      rc = cpu_up_check(cpu);
++      if (rc)
++              return rc;
++
++      cpu_initialize_context(cpu);
++
++      /* About to go from one online CPU to two. */
++      if (num_online_cpus() == 1)
++              alternatives_smp_switch(1);
++
++      /* This must be done before setting cpu_online_map */
++      wmb();
++
++      rc = xen_smp_intr_init(cpu);
++      if (rc)
++              return rc;
++
++      set_cpu_online(cpu, true);
++
++      rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
++      BUG_ON(rc);
++
++      return 0;
++}
++
++/*
++ * Take the current CPU down via VCPUOP_down.
++ *
++ * With CONFIG_HOTPLUG_CPU, execution that continues past the hypercall
++ * runs cpu_bringup() again; without it, getting past the hypercall is a
++ * BUG().
++ */
++void __ref play_dead(void)
++{
++      idle_task_exit();
++      local_irq_disable();
++      cpumask_clear_cpu(smp_processor_id(), cpu_initialized_mask);
++      preempt_enable_no_resched();
++      VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
++#ifdef CONFIG_HOTPLUG_CPU
++      cpu_bringup();
++#else
++      BUG();
++#endif
++}
++
++/* Intentionally empty: nothing is done here once the CPUs are up. */
++void __init smp_cpus_done(unsigned int max_cpus)
++{
++}
++
++#ifndef CONFIG_X86_LOCAL_APIC
++/* Stub used without CONFIG_X86_LOCAL_APIC: always fails with -EINVAL. */
++int setup_profiling_timer(unsigned int multiplier)
++{
++      return -EINVAL;
++}
++#endif
diff --cc drivers/xen/core/spinlock.c

index 0000000,0000000..034e5a1

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/spinlock.c
@@@ -1,0 -1,0 +1,313 @@@
++/*
++ *    Xen spinlock functions
++ *
++ *    See arch/x86/xen/smp.c for copyright and credits for derived
++ *    portions of this file.
++ */
++#define XEN_SPINLOCK_SOURCE
++#include <linux/spinlock_types.h>
++
++#ifdef TICKET_SHIFT
++
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <asm/hardirq.h>
++#include <xen/clock.h>
++#include <xen/evtchn.h>
++
++/*
++ * One entry per lock a CPU is currently waiting for in xen_spin_wait().
++ * Entries are chained through the per-CPU _spinning pointer.
++ */
++struct spinning {
++      arch_spinlock_t *lock;          /* lock being waited for */
++      unsigned int ticket;            /* ticket held for it; -1 while given up */
++      struct spinning *prev;          /* entry that was current before this one */
++};
++static DEFINE_PER_CPU(struct spinning *, _spinning);
++static DEFINE_PER_CPU_READ_MOSTLY(evtchn_port_t, poll_evtchn);
++/*
++ * Protect removal of objects: Addition can be done lockless, and even
++ * removal itself doesn't need protection - what needs to be prevented is
++ * removed objects going out of scope (as they're allocated on the stack).
++ */
++struct rm_seq {
++      unsigned int idx;       /* incremented each time an entry is removed */
++      atomic_t ctr[2];        /* remote walkers in progress, by idx parity */
++};
++static DEFINE_PER_CPU(struct rm_seq, rm_seq);
++
++/*
++ * Set up the runstate area of @cpu and bind the IPI event channel used
++ * to wake it while it waits for a spinlock.  Warns if a channel is
++ * already recorded for the CPU.
++ *
++ * Returns 0 on success, otherwise the error from the bind operation.
++ */
++int __cpuinit xen_spinlock_init(unsigned int cpu)
++{
++      struct evtchn_bind_ipi bind_ipi;
++      int rc;
++
++      setup_runstate_area(cpu);
++
++      WARN_ON(per_cpu(poll_evtchn, cpu));
++      bind_ipi.vcpu = cpu;
++      rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi);
++      if (rc) {
++              pr_warning("No spinlock poll event channel for CPU#%u (%d)\n",
++                         cpu, rc);
++              return rc;
++      }
++
++      per_cpu(poll_evtchn, cpu) = bind_ipi.port;
++
++      return 0;
++}
++
++/*
++ * Close the spinlock poll event channel of @cpu.  The per-CPU record is
++ * cleared before the channel is closed; a failing close only warns.
++ */
++void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
++{
++      struct evtchn_close close;
++
++      close.port = per_cpu(poll_evtchn, cpu);
++      per_cpu(poll_evtchn, cpu) = 0;
++      WARN_ON(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close));
++}
++
++#ifdef CONFIG_PM_SLEEP
++#include <linux/syscore_ops.h>
++
++/*
++ * Syscore resume hook: forget the recorded poll event channel of every
++ * online CPU and bind a new one.  The result of xen_spinlock_init() is
++ * not checked here.
++ */
++static void __cpuinit spinlock_resume(void)
++{
++      unsigned int cpu;
++
++      for_each_online_cpu(cpu) {
++              per_cpu(poll_evtchn, cpu) = 0;
++              xen_spinlock_init(cpu);
++      }
++}
++
++/* Syscore operations for the spinlock code: only a resume hook. */
++static struct syscore_ops __cpuinitdata spinlock_syscore_ops = {
++      .resume = spinlock_resume
++};
++
++/*
++ * Initcall: register the resume hook, except when running as the initial
++ * domain.  Always returns 0.
++ */
++static int __init spinlock_register(void)
++{
++      if (is_initial_xendomain())
++              return 0;
++
++      register_syscore_ops(&spinlock_syscore_ops);
++
++      return 0;
++}
++core_initcall(spinlock_register);
++#endif
++
++/*
++ * Swap tickets with the nearest entry for @lock on the @spinning chain.
++ *
++ * If such an entry exists and its ticket has not been given up (-1), the
++ * entry receives the ticket carried in the upper part of @token and the
++ * returned token carries the entry's previous ticket instead.  In every
++ * other case @token is returned unchanged.
++ */
++static unsigned int spin_adjust(struct spinning *spinning,
++                              const arch_spinlock_t *lock,
++                              unsigned int token)
++{
++      unsigned int held;
++
++      /* Only the first entry referring to this lock is considered. */
++      while (spinning && spinning->lock != lock)
++              spinning = spinning->prev;
++      if (!spinning)
++              return token;
++
++      held = spinning->ticket;
++      if (unlikely(!(held + 1)))
++              return token;
++
++      spinning->ticket = token >> TICKET_SHIFT;
++
++      return (token & ((1 << TICKET_SHIFT) - 1)) | (held << TICKET_SHIFT);
++}
++
++/* spin_adjust() applied to the current CPU's chain of spinning entries. */
++unsigned int xen_spin_adjust(const arch_spinlock_t *lock, unsigned int token)
++{
++      return spin_adjust(percpu_read(_spinning), lock, token);
++}
++
++/*
++ * Block in the hypervisor while waiting for @lock instead of spinning.
++ *
++ * @lock:  the lock being acquired
++ * @ptok:  in: token whose upper part (>> TICKET_SHIFT) is our ticket;
++ *         out: lock->cur combined with our ticket
++ * @flags: event upcall mask to apply while polling, unless this wait is
++ *         nested inside another wait for the same lock
++ *
++ * A struct spinning on this function's stack is pushed onto the per-CPU
++ * _spinning chain for the duration of the wait, and the CPU polls its
++ * poll_evtchn until the lock's current ticket equals ours or a kick is
++ * pending.  Locks of interrupted outer waits that become available in
++ * the meantime are released again and re-requested before returning.
++ *
++ * Returns UINT_MAX if the CPU is not online or has no poll event channel
++ * yet, 0 if the lock was acquired, and __ticket_spin_count(lock)
++ * otherwise.
++ *
++ * NOTE(review): __ticket_spin_unlock_body and __ticket_spin_lock_preamble
++ * are macros defined elsewhere; they presumably operate on the locals
++ * lock, token, kick and free declared around their use - confirm.
++ */
++unsigned int xen_spin_wait(arch_spinlock_t *lock, unsigned int *ptok,
++                         unsigned int flags)
++{
++      unsigned int rm_idx, cpu = raw_smp_processor_id();
++      bool rc;
++      typeof(vcpu_info(0)->evtchn_upcall_mask) upcall_mask;
++      struct spinning spinning, *other;
++
++      /* If kicker interrupt not initialized yet, just spin. */
++      if (unlikely(!cpu_online(cpu)) || unlikely(!percpu_read(poll_evtchn)))
++              return UINT_MAX;
++
++      /* announce we're spinning */
++      spinning.ticket = *ptok >> TICKET_SHIFT;
++      spinning.lock = lock;
++      spinning.prev = percpu_read(_spinning);
++      smp_wmb();
++      percpu_write(_spinning, &spinning);
++      upcall_mask = vcpu_info_read(evtchn_upcall_mask);
++
++      do {
++              bool nested = false;
++
++              clear_evtchn(percpu_read(poll_evtchn));
++
++              /*
++               * Check again to make sure it didn't become free while
++               * we weren't looking.
++               */
++              if (lock->cur == spinning.ticket) {
++                      lock->owner = cpu;
++                      /*
++                       * If we interrupted another spinlock while it was
++                       * blocking, make sure it doesn't block (again)
++                       * without rechecking the lock.
++                       */
++                      if (spinning.prev)
++                              set_evtchn(percpu_read(poll_evtchn));
++                      rc = true;
++                      break;
++              }
++
++              for (other = spinning.prev; other; other = other->prev) {
++                      if (other->lock == lock)
++                              nested = true;
++                      else {
++                              /*
++                               * Return the ticket if we now own the lock.
++                               * While just being desirable generally (to
++                               * reduce latency on other CPUs), this is
++                               * essential in the case where interrupts
++                               * get re-enabled below.
++                               * Try to get a new ticket right away (to
++                               * reduce latency after the current lock was
++                               * released), but don't acquire the lock.
++                               */
++                              /* Shadows the function parameter on purpose. */
++                              arch_spinlock_t *lock = other->lock;
++
++                              arch_local_irq_disable();
++                              while (lock->cur == other->ticket) {
++                                      unsigned int token;
++                                      bool kick, free;
++
++                                      other->ticket = -1;
++                                      __ticket_spin_unlock_body;
++                                      if (!kick)
++                                              break;
++                                      xen_spin_kick(lock, token);
++                                      __ticket_spin_lock_preamble;
++                                      if (!free)
++                                              token = spin_adjust(
++                                                      other->prev, lock,
++                                                      token);
++                                      other->ticket = token >> TICKET_SHIFT;
++                                      smp_mb();
++                              }
++                      }
++              }
++
++              /*
++               * No need to use arch_local_irq_restore() here, as the
++               * intended event processing will happen with the poll
++               * call.
++               */
++              vcpu_info_write(evtchn_upcall_mask,
++                              nested ? upcall_mask : flags);
++
++              if (HYPERVISOR_poll_no_timeout(&__get_cpu_var(poll_evtchn), 1))
++                      BUG();
++
++              vcpu_info_write(evtchn_upcall_mask, upcall_mask);
++
++              /* rc stays true while no kick is pending on the channel. */
++              rc = !test_evtchn(percpu_read(poll_evtchn));
++              if (!rc)
++                      inc_irq_stat(irq_lock_count);
++      } while (spinning.prev || rc);
++
++      /*
++       * Leave the irq pending so that any interrupted blocker will
++       * re-check.
++       */
++
++      /* announce we're done */
++      other = spinning.prev;
++      percpu_write(_spinning, other);
++      arch_local_irq_disable();
++      rm_idx = percpu_read(rm_seq.idx);
++      smp_wmb();
++      percpu_write(rm_seq.idx, rm_idx + 1);
++      mb();
++
++      /*
++       * Obtain new tickets for (or acquire) all those locks where
++       * above we avoided acquiring them.
++       */
++      if (other) {
++              do {
++                      unsigned int token;
++                      bool free;
++
++                      /* Only entries whose ticket was given up (-1). */
++                      if (other->ticket + 1)
++                              continue;
++                      lock = other->lock;
++                      __ticket_spin_lock_preamble;
++                      if (!free)
++                              token = spin_adjust(other->prev, lock, token);
++                      other->ticket = token >> TICKET_SHIFT;
++                      if (lock->cur == other->ticket)
++                              lock->owner = cpu;
++              } while ((other = other->prev) != NULL);
++              lock = spinning.lock;
++      }
++
++      /*
++       * Wait until no remote walker that started before the removal can
++       * still be looking at our on-stack entry.
++       */
++      rm_idx &= 1;
++      while (percpu_read(rm_seq.ctr[rm_idx].counter))
++              cpu_relax();
++      arch_local_irq_restore(upcall_mask);
++      *ptok = lock->cur | (spinning.ticket << TICKET_SHIFT);
++
++      return rc ? 0 : __ticket_spin_count(lock);
++}
++
++/*
++ * Wake the CPU that is waiting for @lock with the ticket given by the low
++ * TICKET_SHIFT bits of @token.
++ *
++ * The other online CPUs are visited starting after the current one and
++ * wrapping around.  The first CPU whose chain of spinning entries holds a
++ * matching (lock, ticket) pair is notified through its poll event
++ * channel, and the search stops.  While a remote chain is examined, that
++ * CPU's rm_seq counter is held elevated so the entries cannot go out of
++ * scope.
++ */
++void xen_spin_kick(arch_spinlock_t *lock, unsigned int token)
++{
++      unsigned int cpu = raw_smp_processor_id(), ancor = cpu;
++
++      /* Not online ourselves: scan all online CPUs once, no wrap-around. */
++      if (unlikely(!cpu_online(cpu)))
++              cpu = -1, ancor = nr_cpu_ids;
++
++      token &= (1U << TICKET_SHIFT) - 1;
++      while ((cpu = cpumask_next(cpu, cpu_online_mask)) != ancor) {
++              unsigned int flags;
++              atomic_t *rm_ctr;
++              struct spinning *spinning;
++
++              if (cpu >= nr_cpu_ids) {
++                      if (ancor == nr_cpu_ids)
++                              return;
++                      cpu = cpumask_first(cpu_online_mask);
++                      if (cpu == ancor)
++                              return;
++              }
++
++              flags = arch_local_irq_save();
++              /*
++               * Pin the counter selected by the current removal index and
++               * retry if the index changed while doing so.
++               */
++              for (;;) {
++                      unsigned int rm_idx = per_cpu(rm_seq.idx, cpu);
++
++                      rm_ctr = per_cpu(rm_seq.ctr, cpu) + (rm_idx & 1);
++                      atomic_inc(rm_ctr);
++#ifdef CONFIG_X86 /* atomic ops are full barriers */
++                      barrier();
++#else
++                      smp_mb();
++#endif
++                      spinning = per_cpu(_spinning, cpu);
++                      smp_rmb();
++                      if (rm_idx == per_cpu(rm_seq.idx, cpu))
++                              break;
++                      atomic_dec(rm_ctr);
++              }
++
++              while (spinning) {
++                      if (spinning->lock == lock && spinning->ticket == token)
++                              break;
++                      spinning = spinning->prev;
++              }
++
++              atomic_dec(rm_ctr);
++              arch_local_irq_restore(flags);
++
++              /* From here on spinning is only tested, never dereferenced. */
++              if (unlikely(spinning)) {
++                      notify_remote_via_evtchn(per_cpu(poll_evtchn, cpu));
++                      return;
++              }
++      }
++}
++EXPORT_SYMBOL(xen_spin_kick);
++
++#endif /* TICKET_SHIFT */
diff --cc drivers/xen/core/xen_proc.c

index 0000000,0000000..fde63c1

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/xen_proc.c
@@@ -1,0 -1,0 +1,29 @@@
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/proc_fs.h>
++#include <xen/xen_proc.h>
++
++static struct proc_dir_entry *xen_base;
++
++/*
++ * Create the entry @name with permissions @mode below /proc/xen.  The
++ * /proc/xen directory itself is created on first use; failing to create
++ * it is fatal (panic).
++ *
++ * Returns the result of create_proc_entry().
++ */
++struct proc_dir_entry *
++#ifndef MODULE
++__init
++#endif
++create_xen_proc_entry(const char *name, mode_t mode)
++{
++      if (!xen_base) {
++              xen_base = proc_mkdir("xen", NULL);
++              if (!xen_base)
++                      panic("Couldn't create /proc/xen");
++      }
++
++      return create_proc_entry(name, mode, xen_base);
++}
++
++#ifdef MODULE
++EXPORT_SYMBOL_GPL(create_xen_proc_entry); 
++#elif defined(CONFIG_XEN_PRIVILEGED_GUEST)
++
++/* Remove the entry @name from below xen_base (the /proc/xen directory). */
++void remove_xen_proc_entry(const char *name)
++{
++      remove_proc_entry(name, xen_base);
++}
++
++#endif
diff --cc drivers/xen/core/xen_sysfs.c

index 0000000,0000000..d84d7a4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/core/xen_sysfs.c
@@@ -1,0 -1,0 +1,422 @@@
++/*
++ *  copyright (c) 2006 IBM Corporation
++ *  Authored by: Mike D. Day <ncmike@us.ibm.com>
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License version 2 as
++ *  published by the Free Software Foundation.
++ */
++
++#include <linux/err.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <asm/hypervisor.h>
++#include <xen/features.h>
++#include <xen/hypervisor_sysfs.h>
++#include <xen/xenbus.h>
++#include <xen/interface/kexec.h>
++#include "../xenbus/xenbus_comms.h"
++
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("Mike D. Day <ncmike@us.ibm.com>");
++
++static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      return sprintf(buffer, "xen\n");
++}
++
++HYPERVISOR_ATTR_RO(type);
++
++static int __init xen_sysfs_type_init(void)
++{
++      return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
++}
++
++static void xen_sysfs_type_destroy(void)
++{
++      sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
++}
++
++/* xen version attributes */
++static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      int version = HYPERVISOR_xen_version(XENVER_version, NULL);
++
++      /* Major is everything above the low 16 bits; 0 is reported as -ENODEV. */
++      return version ? sprintf(buffer, "%d\n", version >> 16) : -ENODEV;
++}
++
++HYPERVISOR_ATTR_RO(major);
++
++static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      int version = HYPERVISOR_xen_version(XENVER_version, NULL);
++
++      /* XENVER_version packs major:minor as 16:16; keep all 16 minor bits. */
++      return version ? sprintf(buffer, "%d\n", version & 0xffff) : -ENODEV;
++}
++
++HYPERVISOR_ATTR_RO(minor);
++
++static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      char *extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
++      int ret;
++
++      if (!extra)
++              return -ENOMEM;
++
++      /* A zero return is treated as success; the string is then printed. */
++      ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
++      if (ret == 0)
++              ret = sprintf(buffer, "%s\n", extra);
++      kfree(extra);
++      return ret;
++}
++
++HYPERVISOR_ATTR_RO(extra);
++
++static struct attribute *version_attrs[] = {
++      &major_attr.attr,
++      &minor_attr.attr,
++      &extra_attr.attr,
++      NULL
++};
++
++static struct attribute_group version_group = {
++      .name = "version",
++      .attrs = version_attrs,
++};
++
++static int __init xen_sysfs_version_init(void)
++{
++      return sysfs_create_group(hypervisor_kobj, &version_group);
++}
++
++static void xen_sysfs_version_destroy(void)
++{
++      sysfs_remove_group(hypervisor_kobj, &version_group);
++}
++
++/* UUID */
++
++static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      char *vm_path, *uuid;
++      int len;
++
++      if (!is_xenstored_ready())
++              return -EBUSY;
++      /* The value read from "vm" is the path under which "uuid" is read. */
++      vm_path = xenbus_read(XBT_NIL, "vm", "", NULL);
++      if (IS_ERR(vm_path))
++              return PTR_ERR(vm_path);
++      uuid = xenbus_read(XBT_NIL, vm_path, "uuid", NULL);
++      kfree(vm_path);
++      if (IS_ERR(uuid))
++              return PTR_ERR(uuid);
++      len = sprintf(buffer, "%s\n", uuid);
++      kfree(uuid);
++      return len;
++}
++
++HYPERVISOR_ATTR_RO(uuid);
++
++static int __init xen_sysfs_uuid_init(void)
++{
++      return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
++}
++
++static void xen_sysfs_uuid_destroy(void)
++{
++      sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
++}
++
++/* xen compilation attributes */
++
++static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      struct xen_compile_info *info = kmalloc(sizeof(*info), GFP_KERNEL);
++      int ret;
++
++      if (!info)
++              return -ENOMEM;
++
++      /* Only the compiler field of the compile info is reported here. */
++      ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
++      if (ret == 0)
++              ret = sprintf(buffer, "%s\n", info->compiler);
++      kfree(info);
++      return ret;
++}
++
++HYPERVISOR_ATTR_RO(compiler);
++
++static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      struct xen_compile_info *info = kmalloc(sizeof(*info), GFP_KERNEL);
++      int ret;
++
++      if (!info)
++              return -ENOMEM;
++
++      /* Only the compile_by field of the compile info is reported here. */
++      ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
++      if (ret == 0)
++              ret = sprintf(buffer, "%s\n", info->compile_by);
++      kfree(info);
++      return ret;
++}
++
++HYPERVISOR_ATTR_RO(compiled_by);
++
++static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      struct xen_compile_info *info = kmalloc(sizeof(*info), GFP_KERNEL);
++      int ret;
++
++      if (!info)
++              return -ENOMEM;
++
++      /* Only the compile_date field of the compile info is reported here. */
++      ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
++      if (ret == 0)
++              ret = sprintf(buffer, "%s\n", info->compile_date);
++      kfree(info);
++      return ret;
++}
++
++HYPERVISOR_ATTR_RO(compile_date);
++
++static struct attribute *xen_compile_attrs[] = {
++      &compiler_attr.attr,
++      &compiled_by_attr.attr,
++      &compile_date_attr.attr,
++      NULL
++};
++
++static struct attribute_group xen_compilation_group = {
++      .name = "compilation",
++      .attrs = xen_compile_attrs,
++};
++
++static int __init xen_compilation_init(void)
++{
++      return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
++}
++
++static void xen_compilation_destroy(void)
++{
++      sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
++}
++
++/* xen properties info */
++
++static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      char *caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
++      int ret;
++
++      if (!caps)
++              return -ENOMEM;
++
++      /* A zero return is treated as success; the string is then printed. */
++      ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
++      if (ret == 0)
++              ret = sprintf(buffer, "%s\n", caps);
++      kfree(caps);
++      return ret;
++}
++
++HYPERVISOR_ATTR_RO(capabilities);
++
++static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      char *cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
++      int ret;
++
++      if (!cset)
++              return -ENOMEM;
++
++      /* A zero return is treated as success; the string is then printed. */
++      ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
++      if (ret == 0)
++              ret = sprintf(buffer, "%s\n", cset);
++      kfree(cset);
++      return ret;
++}
++
++HYPERVISOR_ATTR_RO(changeset);
++
++static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      struct xen_platform_parameters *parms;
++      int ret;
++
++      /* Prints the virt_start field of the platform parameters in hex. */
++      parms = kmalloc(sizeof(*parms), GFP_KERNEL);
++      if (!parms)
++              return -ENOMEM;
++      ret = HYPERVISOR_xen_version(XENVER_platform_parameters, parms);
++      if (ret == 0)
++              ret = sprintf(buffer, "%lx\n", parms->virt_start);
++      kfree(parms);
++
++      return ret;
++}
++
++HYPERVISOR_ATTR_RO(virtual_start);
++
++static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      int pagesize = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
++
++      /* Non-positive results are handed back to the caller unchanged. */
++      if (pagesize <= 0)
++              return pagesize;
++
++      return sprintf(buffer, "%x\n", pagesize);
++}
++
++HYPERVISOR_ATTR_RO(pagesize);
++
++/* eventually there will be several more features to export */
++static ssize_t xen_feature_show(int index, char *buffer)
++{
++      struct xen_feature_info *info = kmalloc(sizeof(*info), GFP_KERNEL);
++      int ret;
++
++      if (!info)
++              return -ENOMEM;
++
++      /* @index is passed as the submap index; the whole submap is printed. */
++      info->submap_idx = index;
++      ret = HYPERVISOR_xen_version(XENVER_get_features, info);
++      if (ret == 0)
++              ret = sprintf(buffer, "%d\n", info->submap);
++      kfree(info);
++      return ret;
++}
++
++static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
++{
++      return xen_feature_show(XENFEAT_writable_page_tables, buffer);
++}
++
++HYPERVISOR_ATTR_RO(writable_pt);
++
++static struct attribute *xen_properties_attrs[] = {
++      &capabilities_attr.attr,
++      &changeset_attr.attr,
++      &virtual_start_attr.attr,
++      &pagesize_attr.attr,
++      &writable_pt_attr.attr,
++      NULL
++};
++
++static struct attribute_group xen_properties_group = {
++      .name = "properties",
++      .attrs = xen_properties_attrs,
++};
++
++static int __init xen_properties_init(void)
++{
++      return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
++}
++
++static void xen_properties_destroy(void)
++{
++      sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
++}
++
++#ifdef CONFIG_KEXEC
++
++extern size_t vmcoreinfo_size_xen;
++extern unsigned long paddr_vmcoreinfo_xen;
++
++static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page)
++{
++      return sprintf(page, "%lx %zx\n",
++              paddr_vmcoreinfo_xen, vmcoreinfo_size_xen);
++}
++
++HYPERVISOR_ATTR_RO(vmcoreinfo);
++
++static int __init xen_sysfs_vmcoreinfo_init(void)
++{
++      return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
++}
++
++static void xen_sysfs_vmcoreinfo_destroy(void)
++{
++      sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
++}
++
++#endif
++
++static int __init hyper_sysfs_init(void)
++{
++      int ret;
++
++      if (!is_running_on_xen())
++              return -ENODEV; /* nothing to export when not on Xen */
++
++      ret = xen_sysfs_type_init();
++      if (ret)
++              goto out;
++      ret = xen_sysfs_version_init();
++      if (ret)
++              goto version_out;
++      ret = xen_compilation_init();
++      if (ret)
++              goto comp_out;
++      ret = xen_sysfs_uuid_init();
++      if (ret)
++              goto uuid_out;
++      ret = xen_properties_init();
++      if (ret)
++              goto prop_out;
++#ifdef CONFIG_KEXEC
++      if (vmcoreinfo_size_xen != 0) { /* only exported when non-empty */
++              ret = xen_sysfs_vmcoreinfo_init();
++              if (ret)
++                      goto vmcoreinfo_out;
++      }
++#endif
++
++      goto out; /* all registered, ret == 0: skip the unwind labels below */
++
++#ifdef CONFIG_KEXEC
++vmcoreinfo_out:
++#endif
++      xen_properties_destroy();
++prop_out: /* each label undoes the steps that succeeded before the failure */
++      xen_sysfs_uuid_destroy();
++uuid_out:
++      xen_compilation_destroy();
++comp_out:
++      xen_sysfs_version_destroy();
++version_out:
++      xen_sysfs_type_destroy();
++out:
++      return ret;
++}
++
++static void __exit hyper_sysfs_exit(void)
++{
++      /* Tear down every attribute that hyper_sysfs_init() registered. */
++#ifdef CONFIG_KEXEC
++      if (vmcoreinfo_size_xen != 0)
++              xen_sysfs_vmcoreinfo_destroy();
++#endif
++      xen_properties_destroy();
++      xen_compilation_destroy();
++      xen_sysfs_uuid_destroy();
++      xen_sysfs_version_destroy();
++      xen_sysfs_type_destroy();
++}
++
++module_init(hyper_sysfs_init);
++module_exit(hyper_sysfs_exit);
diff --cc drivers/xen/evtchn.c

index dbc13e9,ef11daf..8d27a3d
--- 1/drivers/xen/evtchn.c
--- 2/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@@ -49,9 -49,9 +49,15 @@@
   #include <linux/cpu.h>
   
   #include <xen/xen.h>
++#ifdef CONFIG_PARAVIRT_XEN
   #include <xen/events.h>
   #include <xen/evtchn.h>
   #include <asm/xen/hypervisor.h>
++#else
++#include <xen/evtchn.h>
++#include <xen/public/evtchn.h>
++#define bind_evtchn_to_irqhandler bind_caller_port_to_irqhandler
++#endif
   
   struct per_user_data {
         struct mutex bind_mutex; /* serialize bind/unbind operations */
@@@ -278,6 -278,6 +284,9 @@@ static void evtchn_unbind_from_user(str
         int irq = irq_from_evtchn(port);
   
         unbind_from_irqhandler(irq, (void *)(unsigned long)port);
++#ifdef CONFIG_XEN
++      WARN_ON(close_evtchn(port));
++#endif
   
         set_port_user(port, NULL);
   }
@@@ -450,7 -450,7 +459,8 @@@ static int evtchn_open(struct inode *in
         if (u == NULL)
                 return -ENOMEM;
   
--      u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
++      u->name = kasprintf(GFP_KERNEL, "evtchn:%s[%d]",
++                          current->comm, current->pid);
         if (u->name == NULL) {
                 kfree(u);
                 return -ENOMEM;
@@@ -518,7 -518,7 +528,12 @@@ static const struct file_operations evt
   
   static struct miscdevice evtchn_miscdev = {
         .minor        = MISC_DYNAMIC_MINOR,
++#ifdef CONFIG_PARAVIRT_XEN
         .name         = "xen/evtchn",
++#else
++      .name         = "evtchn",
++#endif
++      .nodename     = "xen/evtchn",
         .fops         = &evtchn_fops,
   };
   static int __init evtchn_init(void)
@@@ -534,10 -534,10 +549,10 @@@
   
         spin_lock_init(&port_user_lock);
   
--      /* Create '/dev/misc/evtchn'. */
++      /* Create '/dev/xen/evtchn'. */
         err = misc_register(&evtchn_miscdev);
         if (err != 0) {
--              printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
++              pr_alert("Could not register /dev/xen/evtchn\n");
                 return err;
         }
   
@@@ -558,3 -558,3 +573,4 @@@ module_init(evtchn_init)
   module_exit(evtchn_cleanup);
   
   MODULE_LICENSE("GPL");
++MODULE_ALIAS("devname:xen/evtchn");
diff --cc drivers/xen/fbfront/Makefile

index 0000000,0000000..e2b8909

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/fbfront/Makefile
@@@ -1,0 -1,0 +1,2 @@@
++obj-$(CONFIG_XEN_FRAMEBUFFER) := xenfb.o
++obj-$(CONFIG_XEN_KEYBOARD)    += xenkbd.o
diff --cc drivers/xen/fbfront/xenfb.c

index 0000000,0000000..e2b8ca2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/fbfront/xenfb.c
@@@ -1,0 -1,0 +1,912 @@@
++/*
++ * linux/drivers/video/xenfb.c -- Xen para-virtual frame buffer device
++ *
++ * Copyright (C) 2005-2006 Anthony Liguori <aliguori@us.ibm.com>
++ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
++ *
++ *  Based on linux/drivers/video/q40fb.c
++ *
++ *  This file is subject to the terms and conditions of the GNU General Public
++ *  License. See the file COPYING in the main directory of this archive for
++ *  more details.
++ */
++
++/*
++ * TODO:
++ *
++ * Switch to grant tables when they become capable of dealing with the
++ * frame buffer.
++ */
++
++#include <linux/console.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/fb.h>
++#include <linux/module.h>
++#include <linux/vmalloc.h>
++#include <linux/mm.h>
++#include <linux/slab.h>
++#include <linux/mutex.h>
++#include <linux/freezer.h>
++#include <asm/hypervisor.h>
++#include <xen/evtchn.h>
++#include <xen/interface/io/fbif.h>
++#include <xen/interface/io/protocols.h>
++#include <xen/xenbus.h>
++#include <linux/kthread.h>
++
++struct xenfb_mapping
++{
++      struct list_head        link;           /* entry in xenfb_info.mappings */
++      struct vm_area_struct   *vma;           /* the user mapping itself */
++      atomic_t                map_refs;       /* vm_open/vm_close refcount */
++      int                     faults;         /* page faults since last zap */
++      struct xenfb_info       *info;          /* owning frame buffer */
++};
++
++struct xenfb_info
++{
++      struct task_struct      *kthread;
++      wait_queue_head_t       wq;             /* woken by the refresh timer */
++
++      unsigned char           *fb;            /* vzalloc'ed frame buffer */
++      struct fb_info          *fb_info;
++      struct timer_list       refresh;        /* see xenfb_timer() */
++      int                     dirty;          /* dirty rectangle is valid */
++      int                     x1, y1, x2, y2; /* dirty rectangle,
++                                                 protected by dirty_lock */
++      spinlock_t              dirty_lock;
++      struct mutex            mm_lock;
++      int                     nr_pages;       /* pages backing fb */
++      struct page             **pages;
++      struct list_head        mappings; /* protected by mm_lock */
++
++      int                     irq;
++      struct xenfb_page       *page;          /* shared page with the rings */
++      unsigned long           *mfns;
++      int                     feature_resize; /* Backend has resize feature */
++      struct xenfb_resize     resize;         /* next resize event to send */
++      int                     resize_dpy;     /* resize event is pending */
++      spinlock_t              resize_lock;    /* protects resize, resize_dpy */
++
++      struct xenbus_device    *xbdev;
++};
++
++/*
++ * There are three locks:
++ *    spinlock resize_lock protecting resize_dpy and resize
++ *    spinlock dirty_lock protecting the dirty rectangle
++ *    mutex mm_lock protecting mappings.
++ *
++ * How the dirty and mapping locks work together
++ *
++ * The problem is that dirty rectangle and mappings aren't
++ * independent: the dirty rectangle must cover all faulted pages in
++ * mappings.  We need to prove that our locking maintains this
++ * invariant.
++ *
++ * There are several kinds of critical regions:
++ *
++ * 1. Holding only dirty_lock: xenfb_refresh().  May run in
++ *    interrupts.  Extends the dirty rectangle.  Trivially preserves
++ *    invariant.
++ *
++ * 2. Holding only mm_lock: xenfb_mmap() and xenfb_vm_close().  Touch
++ *    only mappings.  The former creates unfaulted pages.  Preserves
++ *    invariant.  The latter removes pages.  Preserves invariant.
++ *
++ * 3. Holding both locks: xenfb_vm_fault().  Extends the dirty
++ *    rectangle and updates mappings consistently.  Preserves
++ *    invariant.
++ *
++ * 4. The ugliest one: xenfb_update_screen().  Clear the dirty
++ *    rectangle and update mappings consistently.
++ *
++ *    We can't simply hold both locks, because zap_page_range() cannot
++ *    be called with a spinlock held.
++ *
++ *    Therefore, we first clear the dirty rectangle with both locks
++ *    held.  Then we unlock dirty_lock and update the mappings.
++ *    Critical regions that hold only dirty_lock may interfere with
++ *    that.  This can only be region 1: xenfb_refresh().  But that
++ *    just extends the dirty rectangle, which can't harm the
++ *    invariant.
++ *
++ * But FIXME: the invariant is too weak.  It misses that the fault
++ * record in mappings must be consistent with the mapping of pages in
++ * the associated address space!  __do_fault() updates the PTE after
++ * xenfb_vm_fault() returns, i.e. outside the critical region.  This
++ * allows the following race:
++ *
++ * X writes to some address in the Xen frame buffer
++ * Fault - call __do_fault()
++ *     call xenfb_vm_fault()
++ *         grab mm_lock
++ *         map->faults++;
++ *         release mm_lock
++ *     return back to __do_fault()
++ * (preempted, or SMP)
++ * Xen worker thread runs.
++ *      grab mm_lock
++ *      look at mappings
++ *          find this mapping, zaps its pages (but page not in pte yet)
++ *          clear map->faults
++ *      releases mm_lock
++ * (back to X process)
++ *     put page in X's pte
++ *
++ * Oh well, we won't be updating the writes to this page anytime soon.
++ */
++#define MB_ (1024*1024)
++#define XENFB_DEFAULT_FB_LEN (XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8)
++
++enum {KPARAM_MEM, KPARAM_WIDTH, KPARAM_HEIGHT, KPARAM_CNT};
++static int video[KPARAM_CNT] = {2, XENFB_WIDTH, XENFB_HEIGHT};
++module_param_array(video, int, NULL, 0);
++MODULE_PARM_DESC(video,
++              "Size of video memory in MB and width,height in pixels, default = (2,800,600)");
++
++static int xenfb_fps = 20;
++
++static int xenfb_remove(struct xenbus_device *);
++static void xenfb_init_shared_page(struct xenfb_info *, struct fb_info *);
++static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *);
++static void xenfb_disconnect_backend(struct xenfb_info *);
++
++static void xenfb_send_event(struct xenfb_info *info,
++              union xenfb_out_event *event)
++{
++      __u32 prod;
++
++      prod = info->page->out_prod;
++      /* caller ensures !xenfb_queue_full() */
++      mb();                   /* ensure ring space available */
++      XENFB_OUT_RING_REF(info->page, prod) = *event;
++      wmb();                  /* ensure ring contents visible */
++      info->page->out_prod = prod + 1;
++
++      notify_remote_via_irq(info->irq);
++}
++
++static void xenfb_do_update(struct xenfb_info *info,
++                          int x, int y, int w, int h)
++{
++      union xenfb_out_event event;
++
++      memset(&event, 0, sizeof(event));
++      event.type = XENFB_TYPE_UPDATE;
++      event.update.x = x;
++      event.update.y = y;
++      event.update.width = w;
++      event.update.height = h;
++
++      /* caller ensures !xenfb_queue_full() */
++      xenfb_send_event(info, &event);
++}
++
++static void xenfb_do_resize(struct xenfb_info *info)
++{
++      union xenfb_out_event event;
++
++      memset(&event, 0, sizeof(event));
++      event.resize = info->resize;
++
++      /* caller ensures !xenfb_queue_full() */
++      xenfb_send_event(info, &event);
++}
++
++static int xenfb_queue_full(struct xenfb_info *info)
++{
++      const struct xenfb_page *page = info->page;
++      __u32 used = page->out_prod - page->out_cons;
++
++      /* The unsigned difference stays correct when the indices wrap. */
++      return used == XENFB_OUT_RING_LEN;
++}
++
++static void xenfb_update_screen(struct xenfb_info *info)
++{
++      unsigned long flags;
++      int y1, y2, x1, x2;
++      struct xenfb_mapping *map;
++
++      if (xenfb_queue_full(info))
++              return;
++
++      mutex_lock(&info->mm_lock);
++
++      spin_lock_irqsave(&info->dirty_lock, flags);
++      if (!info->dirty) {
++              spin_unlock_irqrestore(&info->dirty_lock, flags);
++              mutex_unlock(&info->mm_lock);
++              return;
++      }
++      /* Snapshot the dirty rectangle, then reset it to "empty". */
++      info->dirty = 0;
++      x1 = info->x1;
++      x2 = info->x2;
++      y1 = info->y1;
++      y2 = info->y2;
++      info->x1 = info->y1 = INT_MAX;
++      info->x2 = info->y2 = 0;
++      spin_unlock_irqrestore(&info->dirty_lock, flags);
++
++      list_for_each_entry(map, &info->mappings, link) {
++              if (!map->faults)
++                      continue;
++              zap_page_range(map->vma, map->vma->vm_start,
++                             map->vma->vm_end - map->vma->vm_start, NULL);
++              map->faults = 0;
++      }
++
++      mutex_unlock(&info->mm_lock);
++
++      if (x2 < x1 || y2 < y1) {
++              pr_warning("xenfb_update_screen bogus rect %d %d %d %d\n",
++                         x1, x2, y1, y2);
++              WARN_ON(1);
++      }
++      xenfb_do_update(info, x1, y1, x2 - x1, y2 - y1);
++}
++
++static void xenfb_handle_resize_dpy(struct xenfb_info *info)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&info->resize_lock, flags);
++      /* Send a pending resize only when the out ring has room; otherwise
++       * resize_dpy stays set and the next call tries again. */
++      if (info->resize_dpy && !xenfb_queue_full(info)) {
++              info->resize_dpy = 0;
++              xenfb_do_resize(info);
++      }
++      spin_unlock_irqrestore(&info->resize_lock, flags);
++}
++
++static int xenfb_thread(void *data)
++{
++      struct xenfb_info *info = data;
++
++      while (!kthread_should_stop()) {
++              xenfb_handle_resize_dpy(info);
++              xenfb_update_screen(info);
++              wait_event_interruptible(info->wq,
++                      kthread_should_stop() || info->dirty);
++              try_to_freeze();
++      }
++      return 0;
++}
++
++static int xenfb_setcolreg(unsigned regno, unsigned red, unsigned green,
++                         unsigned blue, unsigned transp,
++                         struct fb_info *info)
++{
++      u32 v;
++
++      if (regno >= info->cmap.len) /* valid indices are 0 .. len - 1 */
++              return 1;
++
++      red   >>= (16 - info->var.red.length);
++      green >>= (16 - info->var.green.length);
++      blue  >>= (16 - info->var.blue.length);
++
++      v = (red << info->var.red.offset) |
++          (green << info->var.green.offset) |
++          (blue << info->var.blue.offset);
++
++      /* FIXME is this sane?  check against xxxfb_setcolreg()!  */
++      switch (info->var.bits_per_pixel) {
++      case 16:
++      case 24:
++      case 32:
++              ((u32 *)info->pseudo_palette)[regno] = v;
++              break;
++      }
++
++      return 0;
++}
++
++static void xenfb_timer(unsigned long data)
++{
++      struct xenfb_info *info = (struct xenfb_info *)data;
++      wake_up(&info->wq);
++}
++
++static void __xenfb_refresh(struct xenfb_info *info,
++                          int x1, int y1, int w, int h)
++{
++      int x2 = x1 + w;
++      int y2 = y1 + h;
++
++      /* Callers hold dirty_lock (xenfb_refresh(), xenfb_vm_fault()). */
++
++      /* Grow the accumulated dirty rectangle to cover the new area. */
++      if (x1 < info->x1)
++              info->x1 = x1;
++      if (x2 > info->x2)
++              info->x2 = x2;
++      if (y1 < info->y1)
++              info->y1 = y1;
++      if (y2 > info->y2)
++              info->y2 = y2;
++      info->dirty = 1;
++
++      /* Arm the refresh timer unless one is already pending. */
++      if (!timer_pending(&info->refresh))
++              mod_timer(&info->refresh, jiffies + HZ/xenfb_fps);
++}
++
++static void xenfb_refresh(struct xenfb_info *info,
++                        int x1, int y1, int w, int h)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&info->dirty_lock, flags);
++      __xenfb_refresh(info, x1, y1, w, h);
++      spin_unlock_irqrestore(&info->dirty_lock, flags);
++}
++
++static void xenfb_fillrect(struct fb_info *p, const struct fb_fillrect *rect)
++{
++      struct xenfb_info *info = p->par;
++
++      cfb_fillrect(p, rect);
++      xenfb_refresh(info, rect->dx, rect->dy, rect->width, rect->height);
++}
++
++static void xenfb_imageblit(struct fb_info *p, const struct fb_image *image)
++{
++      struct xenfb_info *info = p->par;
++
++      cfb_imageblit(p, image);
++      xenfb_refresh(info, image->dx, image->dy, image->width, image->height);
++}
++
++static void xenfb_copyarea(struct fb_info *p, const struct fb_copyarea *area)
++{
++      struct xenfb_info *info = p->par;
++
++      cfb_copyarea(p, area);
++      xenfb_refresh(info, area->dx, area->dy, area->width, area->height);
++}
++
++static void xenfb_vm_open(struct vm_area_struct *vma)
++{
++      struct xenfb_mapping *map = vma->vm_private_data;
++      atomic_inc(&map->map_refs);
++}
++
++static void xenfb_vm_close(struct vm_area_struct *vma)
++{
++      struct xenfb_mapping *map = vma->vm_private_data;
++      struct xenfb_info *info = map->info;
++
++      mutex_lock(&info->mm_lock);
++      if (atomic_dec_and_test(&map->map_refs)) {
++              list_del(&map->link);
++              kfree(map);
++      }
++      mutex_unlock(&info->mm_lock);
++}
++
++static int xenfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++      struct xenfb_mapping *map = vma->vm_private_data;
++      struct xenfb_info *info = map->info;
++      int pgnr = ((long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT;
++      unsigned long flags;
++      struct page *page;
++      int y1, y2;
++
++      if (pgnr >= info->nr_pages)
++              return VM_FAULT_SIGBUS; /* beyond the frame buffer pages */
++
++      mutex_lock(&info->mm_lock); /* mutex first, then the spinlock */
++      spin_lock_irqsave(&info->dirty_lock, flags);
++      page = info->pages[pgnr];
++      get_page(page);
++      map->faults++; /* reset by xenfb_update_screen() after zapping */
++
++      y1 = pgnr * PAGE_SIZE / info->fb_info->fix.line_length;
++      y2 = (pgnr * PAGE_SIZE + PAGE_SIZE - 1) / info->fb_info->fix.line_length;
++      if (y2 > info->fb_info->var.yres)
++              y2 = info->fb_info->var.yres;
++      __xenfb_refresh(info, 0, y1, info->fb_info->var.xres, y2 - y1);
++      spin_unlock_irqrestore(&info->dirty_lock, flags);
++      mutex_unlock(&info->mm_lock);
++
++      vmf->page = page; /* handed back with the reference taken above */
++
++      return VM_FAULT_MINOR;
++}
++
++static struct vm_operations_struct xenfb_vm_ops = {
++      .open   = xenfb_vm_open,
++      .close  = xenfb_vm_close,
++      .fault  = xenfb_vm_fault,
++};
++
++static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
++{
++      struct xenfb_info *info = fb_info->par;
++      struct xenfb_mapping *map;
++      int map_pages;
++
++      if (!(vma->vm_flags & VM_WRITE))
++              return -EINVAL;
++      if (!(vma->vm_flags & VM_SHARED))
++              return -EINVAL;
++      if (vma->vm_pgoff != 0)
++              return -EINVAL;
++
++      map_pages = (vma->vm_end - vma->vm_start + PAGE_SIZE-1) >> PAGE_SHIFT;
++      if (map_pages > info->nr_pages)
++              return -EINVAL;
++
++      map = kzalloc(sizeof(*map), GFP_KERNEL);
++      if (map == NULL)
++              return -ENOMEM;
++
++      map->vma = vma;
++      map->faults = 0;
++      map->info = info;
++      atomic_set(&map->map_refs, 1);
++
++      mutex_lock(&info->mm_lock);
++      list_add(&map->link, &info->mappings);
++      mutex_unlock(&info->mm_lock);
++
++      vma->vm_ops = &xenfb_vm_ops;
++      vma->vm_flags |= (VM_DONTEXPAND | VM_RESERVED);
++      vma->vm_private_data = map;
++
++      return 0;
++}
++
++static int
++xenfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
++{
++      struct xenfb_info *xenfb_info;
++      int required_mem_len;
++
++      xenfb_info = info->par;
++
++      if (!xenfb_info->feature_resize) {
++              if (var->xres == video[KPARAM_WIDTH] &&
++                      var->yres == video[KPARAM_HEIGHT] &&
++                      var->bits_per_pixel == xenfb_info->page->depth) {
++                      return 0;
++              }
++              return -EINVAL;
++      }
++
++      /* Can't resize past initial width and height */
++      if (var->xres > video[KPARAM_WIDTH] || var->yres > video[KPARAM_HEIGHT])
++              return -EINVAL;
++
++      required_mem_len = var->xres * var->yres * (xenfb_info->page->depth / 8);
++      if (var->bits_per_pixel == xenfb_info->page->depth &&
++              var->xres <= info->fix.line_length / (XENFB_DEPTH / 8) &&
++              required_mem_len <= info->fix.smem_len) {
++              var->xres_virtual = var->xres;
++              var->yres_virtual = var->yres;
++              return 0;
++      }
++      return -EINVAL;
++}
++
++static int xenfb_set_par(struct fb_info *info)
++{
++      struct xenfb_info *xenfb_info;
++      unsigned long flags;
++
++      xenfb_info = info->par;
++
++      spin_lock_irqsave(&xenfb_info->resize_lock, flags);
++      xenfb_info->resize.type = XENFB_TYPE_RESIZE;
++      xenfb_info->resize.width = info->var.xres;
++      xenfb_info->resize.height = info->var.yres;
++      xenfb_info->resize.stride = info->fix.line_length;
++      xenfb_info->resize.depth = info->var.bits_per_pixel;
++      xenfb_info->resize.offset = 0;
++      xenfb_info->resize_dpy = 1;
++      spin_unlock_irqrestore(&xenfb_info->resize_lock, flags);
++      return 0;
++}
++
++static struct fb_ops xenfb_fb_ops = {
++      .owner          = THIS_MODULE,
++      .fb_setcolreg   = xenfb_setcolreg,
++      .fb_fillrect    = xenfb_fillrect,
++      .fb_copyarea    = xenfb_copyarea,
++      .fb_imageblit   = xenfb_imageblit,
++      .fb_mmap        = xenfb_mmap,
++      .fb_check_var   = xenfb_check_var,
++      .fb_set_par     = xenfb_set_par,
++};
++
++static irqreturn_t xenfb_event_handler(int rq, void *dev_id)
++{
++      /*
++       * No in events are recognized, so they are all discarded by moving
++       * the consumer index up to the producer index.  If you need to
++       * recognize some, see xenkbd's input_handler() for how to do that.
++       */
++      struct xenfb_info *info = dev_id;
++      struct xenfb_page *page = info->page;
++
++      if (page->in_cons != page->in_prod) {
++              page->in_cons = page->in_prod;
++              notify_remote_via_irq(info->irq);
++      }
++      return IRQ_HANDLED;
++}
++
++static unsigned long vmalloc_to_mfn(void *address)
++{
++      return pfn_to_mfn(vmalloc_to_pfn(address));
++}
++
++static __devinit void
++xenfb_make_preferred_console(void)
++{
++      struct console *c;
++
++      if (console_set_on_cmdline)
++              return;
++
++      console_lock();
++      for_each_console(c) {
++              if (!strcmp(c->name, "tty") && c->index == 0)
++                      break;
++      }
++      console_unlock();
++      if (c) {
++              unregister_console(c);
++              c->flags |= CON_CONSDEV;
++              c->flags &= ~CON_PRINTBUFFER; /* don't print again */
++              register_console(c);
++      }
++}
++
++/*
++ * Probe a "vfb" xenbus device.
++ *
++ * Allocates the per-device state, the framebuffer memory, the page and mfn
++ * tables and the shared page, fills in and registers an fb_info, and then
++ * connects to the backend.
++ *
++ * Returns 0 on success or a negative errno.  Once the info structure has
++ * been attached to the device, every failure is unwound through
++ * xenfb_remove(), which frees whatever has been recorded in *info so far.
++ */
++static int __devinit xenfb_probe(struct xenbus_device *dev,
++                               const struct xenbus_device_id *id)
++{
++      struct xenfb_info *info;
++      struct fb_info *fb_info;
++      int fb_size;
++      int val;
++      int ret;
++
++      info = kzalloc(sizeof(*info), GFP_KERNEL);
++      if (info == NULL) {
++              xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
++              return -ENOMEM;
++      }
++
++      /* Limit kernel param videoram amount to what is in xenstore */
++      if (xenbus_scanf(XBT_NIL, dev->otherend, "videoram", "%d", &val) == 1) {
++              if (val < video[KPARAM_MEM])
++                      video[KPARAM_MEM] = val;
++      }
++
++      /* If requested res does not fit in available memory, use default */
++      fb_size = video[KPARAM_MEM] * MB_;
++      if (video[KPARAM_WIDTH] * video[KPARAM_HEIGHT] * XENFB_DEPTH/8 > fb_size) {
++              video[KPARAM_WIDTH] = XENFB_WIDTH;
++              video[KPARAM_HEIGHT] = XENFB_HEIGHT;
++              fb_size = XENFB_DEFAULT_FB_LEN;
++      }
++
++      /* From here on xenfb_remove() can find and tear down *info. */
++      dev_set_drvdata(&dev->dev, info);
++      info->xbdev = dev;
++      info->irq = -1;         /* no event channel bound yet */
++      info->x1 = info->y1 = INT_MAX;
++      spin_lock_init(&info->dirty_lock);
++      spin_lock_init(&info->resize_lock);
++      mutex_init(&info->mm_lock);
++      init_waitqueue_head(&info->wq);
++      init_timer(&info->refresh);
++      info->refresh.function = xenfb_timer;
++      info->refresh.data = (unsigned long)info;
++      INIT_LIST_HEAD(&info->mappings);
++
++      info->fb = vzalloc(fb_size);
++      if (info->fb == NULL)
++              goto error_nomem;
++
++      /* number of pages needed to cover fb_size, rounded up */
++      info->nr_pages = (fb_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
++
++      info->pages = kmalloc(sizeof(struct page *) * info->nr_pages,
++                            GFP_KERNEL);
++      if (info->pages == NULL)
++              goto error_nomem;
++
++      info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages);
++      if (!info->mfns)
++              goto error_nomem;
++
++      /* set up shared page */
++      info->page = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
++      if (!info->page)
++              goto error_nomem;
++
++      fb_info = framebuffer_alloc(sizeof(u32) * 256, NULL);
++                              /* see fishy hackery below */
++      if (fb_info == NULL)
++              goto error_nomem;
++
++      /* FIXME fishy hackery */
++      fb_info->pseudo_palette = fb_info->par;
++      fb_info->par = info;
++      /* /FIXME */
++      fb_info->screen_base = info->fb;
++
++      fb_info->fbops = &xenfb_fb_ops;
++      fb_info->var.xres_virtual = fb_info->var.xres = video[KPARAM_WIDTH];
++      fb_info->var.yres_virtual = fb_info->var.yres = video[KPARAM_HEIGHT];
++      fb_info->var.bits_per_pixel = XENFB_DEPTH;
++
++      fb_info->var.red = (struct fb_bitfield){16, 8, 0};
++      fb_info->var.green = (struct fb_bitfield){8, 8, 0};
++      fb_info->var.blue = (struct fb_bitfield){0, 8, 0};
++
++      fb_info->var.activate = FB_ACTIVATE_NOW;
++      fb_info->var.height = -1;
++      fb_info->var.width = -1;
++      fb_info->var.vmode = FB_VMODE_NONINTERLACED;
++
++      fb_info->fix.visual = FB_VISUAL_TRUECOLOR;
++      fb_info->fix.line_length = fb_info->var.xres * (XENFB_DEPTH / 8);
++      fb_info->fix.smem_start = 0;
++      fb_info->fix.smem_len = fb_size;
++      strcpy(fb_info->fix.id, "xen");
++      fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
++      fb_info->fix.accel = FB_ACCEL_NONE;
++
++      fb_info->flags = FBINFO_FLAG_DEFAULT;
++
++      /*
++       * Until fb_info is stored in info->fb_info (after successful
++       * registration), xenfb_remove() does not know about it, so the
++       * failure paths below release it themselves.
++       */
++      ret = fb_alloc_cmap(&fb_info->cmap, 256, 0);
++      if (ret < 0) {
++              framebuffer_release(fb_info);
++              xenbus_dev_fatal(dev, ret, "fb_alloc_cmap");
++              goto error;
++      }
++
++      xenfb_init_shared_page(info, fb_info);
++
++      ret = register_framebuffer(fb_info);
++      if (ret) {
++              /* info->fb_info is still NULL here; use the local pointer */
++              fb_dealloc_cmap(&fb_info->cmap);
++              framebuffer_release(fb_info);
++              xenbus_dev_fatal(dev, ret, "register_framebuffer");
++              goto error;
++      }
++      info->fb_info = fb_info;
++
++      ret = xenfb_connect_backend(dev, info);
++      if (ret < 0)
++              goto error;
++
++      xenfb_make_preferred_console();
++      return 0;
++
++ error_nomem:
++      ret = -ENOMEM;
++      xenbus_dev_fatal(dev, ret, "allocating device memory");
++ error:
++      xenfb_remove(dev);
++      return ret;
++}
++
++/*
++ * Resume callback: drop the current event-channel binding, reinitialise
++ * the shared page from the existing fb_info and connect to the backend
++ * again.  Returns the result of xenfb_connect_backend().
++ */
++static int xenfb_resume(struct xenbus_device *dev)
++{
++      struct xenfb_info *fbi = dev_get_drvdata(&dev->dev);
++      int rc;
++
++      xenfb_disconnect_backend(fbi);
++      xenfb_init_shared_page(fbi, fbi->fb_info);
++      rc = xenfb_connect_backend(dev, fbi);
++
++      return rc;
++}
++
++/*
++ * Tear down a framebuffer device: stop the refresh timer and the update
++ * thread, unbind from the backend, unregister and release the fb_info if
++ * one was recorded, and free all memory hanging off the info structure.
++ * Also used by xenfb_probe() to unwind a partially set up device.
++ * Always returns 0.
++ */
++static int xenfb_remove(struct xenbus_device *dev)
++{
++      struct xenfb_info *fbi = dev_get_drvdata(&dev->dev);
++      struct fb_info *fb;
++
++      del_timer(&fbi->refresh);
++      if (fbi->kthread)
++              kthread_stop(fbi->kthread);
++      xenfb_disconnect_backend(fbi);
++
++      fb = fbi->fb_info;
++      if (fb) {
++              unregister_framebuffer(fb);
++              fb_dealloc_cmap(&fb->cmap);
++              framebuffer_release(fb);
++      }
++
++      free_page((unsigned long)fbi->page);
++      vfree(fbi->mfns);
++      kfree(fbi->pages);
++      vfree(fbi->fb);
++      kfree(fbi);
++
++      return 0;
++}
++
++/*
++ * Fill in the page shared with the backend: record the struct page and mfn
++ * of every framebuffer page, store the mfns of the pages holding the mfn
++ * table in the page directory, copy the current geometry from fb_info and
++ * reset both ring index pairs.
++ */
++static void xenfb_init_shared_page(struct xenfb_info *info,
++                                   struct fb_info * fb_info)
++{
++      int i, first;
++      /* number of mfn entries that fit into one page */
++      int epd = PAGE_SIZE / sizeof(info->mfns[0]);
++
++      for (i = 0; i < info->nr_pages; i++) {
++              info->pages[i] = vmalloc_to_page(info->fb + i * PAGE_SIZE);
++              info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE);
++      }
++
++      /* one directory entry per page of the mfn table */
++      for (i = 0, first = 0; first < info->nr_pages; i++, first += epd)
++              info->page->pd[i] = vmalloc_to_mfn(&info->mfns[first]);
++
++      info->page->width = fb_info->var.xres;
++      info->page->height = fb_info->var.yres;
++      info->page->depth = fb_info->var.bits_per_pixel;
++      info->page->line_length = fb_info->fix.line_length;
++      info->page->mem_length = fb_info->fix.smem_len;
++
++      info->page->in_prod = 0;
++      info->page->in_cons = 0;
++      info->page->out_prod = 0;
++      info->page->out_cons = 0;
++}
++
++/*
++ * Bind an event channel for the device and publish "page-ref",
++ * "event-channel", "protocol" and "feature-update" under the device's
++ * xenstore node in a single transaction, then switch the frontend to
++ * XenbusStateInitialised.
++ *
++ * Returns 0 on success.  On failure the irq bound here is unbound again
++ * and a negative error code is returned; info->irq is only set on success.
++ */
++static int xenfb_connect_backend(struct xenbus_device *dev,
++                               struct xenfb_info *info)
++{
++      int ret, irq;
++      struct xenbus_transaction xbt;
++
++      irq = bind_listening_port_to_irqhandler(
++              dev->otherend_id, xenfb_event_handler, 0, "xenfb", info);
++      if (irq < 0) {
++              xenbus_dev_fatal(dev, irq,
++                               "bind_listening_port_to_irqhandler");
++              return irq;
++      }
++
++      /* The whole transaction is restarted from here on -EAGAIN. */
++ again:
++      ret = xenbus_transaction_start(&xbt);
++      if (ret) {
++              xenbus_dev_fatal(dev, ret, "starting transaction");
++              goto unbind_irq;
++      }
++      ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
++                          virt_to_mfn(info->page));
++      if (ret)
++              goto error_xenbus;
++      ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
++                          irq_to_evtchn_port(irq));
++      if (ret)
++              goto error_xenbus;
++      ret = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
++                          XEN_IO_PROTO_ABI_NATIVE);
++      if (ret)
++              goto error_xenbus;
++      ret = xenbus_printf(xbt, dev->nodename, "feature-update", "1");
++      if (ret)
++              goto error_xenbus;
++      ret = xenbus_transaction_end(xbt, 0);
++      if (ret) {
++              if (ret == -EAGAIN)
++                      goto again;
++              xenbus_dev_fatal(dev, ret, "completing transaction");
++              goto unbind_irq;
++      }
++
++      info->irq = irq;
++      xenbus_switch_state(dev, XenbusStateInitialised);
++      return 0;
++
++ error_xenbus:
++      /* second argument 1: presumably aborts the transaction -- TODO confirm */
++      xenbus_transaction_end(xbt, 1);
++      xenbus_dev_fatal(dev, ret, "writing xenstore");
++ unbind_irq:
++      unbind_from_irqhandler(irq, info);
++      return ret;
++}
++
++/* Unbind the event-channel irq, if one is bound, and mark it as unbound (-1). */
++static void xenfb_disconnect_backend(struct xenfb_info *info)
++{
++      int bound_irq = info->irq;
++
++      if (bound_irq >= 0)
++              unbind_from_irqhandler(bound_irq, info);
++
++      info->irq = -1;
++}
++
++/*
++ * xenbus callback invoked when the backend changes state.
++ *
++ * InitWait moves the frontend to Connected.  Once connected, the backend's
++ * "feature-resize" and "request-update" keys are read; if updates are
++ * requested and no update thread exists yet, xenfb_thread is started.
++ * Closing closes the frontend.  All other backend states are ignored.
++ */
++static void xenfb_backend_changed(struct xenbus_device *dev,
++                                enum xenbus_state backend_state)
++{
++      struct xenfb_info *info = dev_get_drvdata(&dev->dev);
++      int val;
++
++      switch (backend_state) {
++      case XenbusStateInitialising:
++      case XenbusStateInitialised:
++      case XenbusStateReconfiguring:
++      case XenbusStateReconfigured:
++      case XenbusStateUnknown:
++      case XenbusStateClosed:
++              break;
++
++      case XenbusStateInitWait:
++      InitWait:
++              xenbus_switch_state(dev, XenbusStateConnected);
++              break;
++
++      case XenbusStateConnected:
++              /*
++               * Work around xenbus race condition: If backend goes
++               * through InitWait to Connected fast enough, we can
++               * get Connected twice here.
++               */
++              if (dev->state != XenbusStateConnected)
++                      goto InitWait; /* no InitWait seen yet, fudge it */
++
++
++              if (xenbus_scanf(XBT_NIL, dev->otherend,
++                                      "feature-resize", "%d", &val) < 0)
++                      val = 0;
++              info->feature_resize = val;
++
++              if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
++                               "request-update", "%d", &val) < 0)
++                      val = 0;
++
++              if (val && !info->kthread) {
++                      info->kthread = kthread_run(xenfb_thread, info,
++                                                  "xenfb thread");
++                      if (IS_ERR(info->kthread)) {
++                              /*
++                               * Extract the error before clearing the
++                               * pointer; PTR_ERR(NULL) would report 0.
++                               */
++                              int err = PTR_ERR(info->kthread);
++
++                              info->kthread = NULL;
++                              xenbus_dev_fatal(dev, err, "xenfb_thread");
++                      }
++              }
++              break;
++
++      case XenbusStateClosing:
++              // FIXME is this safe in any dev->state?
++              xenbus_frontend_closed(dev);
++              break;
++      }
++}
++
++/*
++ * xenbus device types handled by this driver (referenced from
++ * xenfb_driver.ids).  The trailing empty entry presumably terminates the
++ * table -- TODO confirm against struct xenbus_device_id users.
++ */
++static const struct xenbus_device_id xenfb_ids[] = {
++      { "vfb" },
++      { "" }
++};
++MODULE_ALIAS("xen:vfb");
++
++/* xenbus driver description registered by xenfb_init(). */
++static struct xenbus_driver xenfb_driver = {
++      .name             = "vfb",
++      .ids              = xenfb_ids,
++      .probe            = xenfb_probe,
++      .resume           = xenfb_resume,
++      .remove           = xenfb_remove,
++      .otherend_changed = xenfb_backend_changed,
++};
++
++/*
++ * Module init: register the frontend driver.  Returns -ENODEV when not
++ * running on Xen, or when running in dom0 where there is nothing to do.
++ */
++static int __init xenfb_init(void)
++{
++      if (!is_running_on_xen() || is_initial_xendomain())
++              return -ENODEV;
++
++      return xenbus_register_frontend(&xenfb_driver);
++}
++
++/* Module exit: unregister the frontend driver. */
++static void __exit xenfb_cleanup(void)
++{
++      /* no "return <expr>;" here: this function returns void */
++      xenbus_unregister_driver(&xenfb_driver);
++}
++
++module_init(xenfb_init);
++module_exit(xenfb_cleanup);
++
++MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
++MODULE_LICENSE("GPL");
diff --cc drivers/xen/fbfront/xenkbd.c

index 0000000,0000000..359cefe

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/fbfront/xenkbd.c
@@@ -1,0 -1,0 +1,368 @@@
++/*
++ * linux/drivers/input/keyboard/xenkbd.c -- Xen para-virtual input device
++ *
++ * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
++ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
++ *
++ *  Based on linux/drivers/input/mouse/sermouse.c
++ *
++ *  This file is subject to the terms and conditions of the GNU General Public
++ *  License. See the file COPYING in the main directory of this archive for
++ *  more details.
++ */
++
++/*
++ * TODO:
++ *
++ * Switch to grant tables together with xenfb.c.
++ */
++
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/input.h>
++#include <asm/hypervisor.h>
++#include <xen/evtchn.h>
++#include <xen/interface/io/fbif.h>
++#include <xen/interface/io/kbdif.h>
++#include <xen/xenbus.h>
++
++/* Per-device state of the Xen virtual keyboard/pointer frontend. */
++struct xenkbd_info
++{
++      /* keyboard input device; stored by xenkbd_probe() once registered */
++      struct input_dev *kbd;
++      /* pointer input device; stored by xenkbd_probe() once registered */
++      struct input_dev *ptr;
++      /* page whose mfn is published to the backend as "page-ref" */
++      struct xenkbd_page *page;
++      /* irq bound to the event channel; set to -1 on disconnect */
++      int irq;
++      /* the xenbus device this state belongs to */
++      struct xenbus_device *xbdev;
++      /* "xenbus/<nodename>", used as the phys string of both input devices */
++      char phys[32];
++};
++
++static int xenkbd_remove(struct xenbus_device *);
++static int xenkbd_connect_backend(struct xenbus_device *, struct xenkbd_info *);
++static void xenkbd_disconnect_backend(struct xenkbd_info *);
++
++/*
++ * Note: if you need to send out events, see xenfb_do_update() for how
++ * to do that.
++ */
++
++/*
++ * Event-channel interrupt handler: drain the "in" ring of the shared page
++ * and forward each event to the input layer.
++ *
++ * Motion and position events go to the pointer device.  Key events go to
++ * whichever device advertises the keycode (the pointer wins if both do);
++ * unknown keycodes are logged.  After the ring is drained the consumer
++ * index is published and the other end is notified.
++ */
++static irqreturn_t input_handler(int rq, void *dev_id)
++{
++      struct xenkbd_info *info = dev_id;
++      struct xenkbd_page *page = info->page;
++      __u32 cons, prod;
++
++      prod = page->in_prod;
++      if (prod == page->in_cons)
++              return IRQ_HANDLED;     /* ring is empty */
++      rmb();                  /* ensure we see ring contents up to prod */
++      for (cons = page->in_cons; cons != prod; cons++) {
++              union xenkbd_in_event *event;
++              struct input_dev *dev;
++              event = &XENKBD_IN_RING_REF(page, cons);
++
++              /* default target; the key case below may override it */
++              dev = info->ptr;
++              switch (event->type) {
++              case XENKBD_TYPE_MOTION:
++                      if (event->motion.rel_z)
++                              input_report_rel(dev, REL_WHEEL,
++                                               -event->motion.rel_z);
++                      input_report_rel(dev, REL_X, event->motion.rel_x);
++                      input_report_rel(dev, REL_Y, event->motion.rel_y);
++                      break;
++              case XENKBD_TYPE_KEY:
++                      /* pick the device whose keybit contains this keycode */
++                      dev = NULL;
++                      if (test_bit(event->key.keycode, info->kbd->keybit))
++                              dev = info->kbd;
++                      if (test_bit(event->key.keycode, info->ptr->keybit))
++                              dev = info->ptr;
++                      if (dev)
++                              input_report_key(dev, event->key.keycode,
++                                               event->key.pressed);
++                      else
++                              pr_warning("xenkbd: unhandled keycode 0x%x\n",
++                                         event->key.keycode);
++                      break;
++              case XENKBD_TYPE_POS:
++                      if (event->pos.rel_z)
++                              input_report_rel(dev, REL_WHEEL,
++                                               -event->pos.rel_z);
++                      input_report_abs(dev, ABS_X, event->pos.abs_x);
++                      input_report_abs(dev, ABS_Y, event->pos.abs_y);
++                      break;
++              }
++              /* dev is NULL only for an unhandled keycode */
++              if (dev)
++                      input_sync(dev);
++      }
++      mb();                   /* ensure we got ring contents */
++      page->in_cons = cons;
++      notify_remote_via_irq(info->irq);
++
++      return IRQ_HANDLED;
++}
++
++/*
++ * Probe a "vkbd" xenbus device.
++ *
++ * Allocates the per-device state and the shared page, negotiates absolute
++ * pointer support with the backend, registers one input device for the
++ * keyboard and one for the pointer, and then connects to the backend.
++ *
++ * Returns 0 on success or a negative errno.  Failures after the info
++ * structure has been attached to the device are unwound through
++ * xenkbd_remove().
++ */
++int __devinit xenkbd_probe(struct xenbus_device *dev,
++                         const struct xenbus_device_id *id)
++{
++      int ret, i, abs;
++      struct xenkbd_info *info;
++      struct input_dev *kbd, *ptr;
++
++      info = kzalloc(sizeof(*info), GFP_KERNEL);
++      if (!info) {
++              xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
++              return -ENOMEM;
++      }
++      dev_set_drvdata(&dev->dev, info);
++      info->xbdev = dev;
++      /*
++       * kzalloc() leaves irq at 0, which xenkbd_disconnect_backend() would
++       * treat as a bound irq; mark it unbound until the backend connects.
++       */
++      info->irq = -1;
++      snprintf(info->phys, sizeof(info->phys), "xenbus/%s", dev->nodename);
++
++      info->page = (void *)__get_free_page(GFP_KERNEL);
++      if (!info->page)
++              goto error_nomem;
++      info->page->in_cons = info->page->in_prod = 0;
++      info->page->out_cons = info->page->out_prod = 0;
++
++      /* Ask for absolute coordinates only if the backend offers them. */
++      if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-abs-pointer", "%d", &abs) < 0)
++              abs = 0;
++      if (abs)
++              xenbus_printf(XBT_NIL, dev->nodename, "request-abs-pointer", "1");
++
++      /* keyboard */
++      kbd = input_allocate_device();
++      if (!kbd)
++              goto error_nomem;
++      kbd->name = "Xen Virtual Keyboard";
++      kbd->phys = info->phys;
++      kbd->id.bustype = BUS_PCI;
++      kbd->id.vendor = 0x5853;
++      kbd->id.product = 0xffff;
++      kbd->evbit[0] = BIT(EV_KEY);
++      for (i = KEY_ESC; i < KEY_UNKNOWN; i++)
++              set_bit(i, kbd->keybit);
++      for (i = KEY_OK; i < KEY_MAX; i++)
++              set_bit(i, kbd->keybit);
++
++      ret = input_register_device(kbd);
++      if (ret) {
++              /* not registered, so it is freed here rather than unregistered */
++              input_free_device(kbd);
++              xenbus_dev_fatal(dev, ret, "input_register_device(kbd)");
++              goto error;
++      }
++      info->kbd = kbd;
++
++      /* pointing device */
++      ptr = input_allocate_device();
++      if (!ptr)
++              goto error_nomem;
++      ptr->name = "Xen Virtual Pointer";
++      ptr->phys = info->phys;
++      ptr->id.bustype = BUS_PCI;
++      ptr->id.vendor = 0x5853;
++      ptr->id.product = 0xfffe;
++
++      if (abs) {
++              __set_bit(EV_ABS, ptr->evbit);
++              input_set_abs_params(ptr, ABS_X, 0, XENFB_WIDTH, 0, 0);
++              input_set_abs_params(ptr, ABS_Y, 0, XENFB_HEIGHT, 0, 0);
++      } else {
++              __set_bit(REL_X, ptr->relbit);
++              __set_bit(REL_Y, ptr->relbit);
++      }
++      __set_bit(REL_WHEEL, ptr->relbit);
++      /*
++       * The relbit entries above only take effect if the EV_REL event
++       * type itself is advertised; the wheel is relative in both modes.
++       */
++      __set_bit(EV_REL, ptr->evbit);
++
++      __set_bit(EV_KEY, ptr->evbit);
++      for (i = BTN_LEFT; i <= BTN_TASK; i++)
++              __set_bit(i, ptr->keybit);
++
++      ret = input_register_device(ptr);
++      if (ret) {
++              /* not registered, so it is freed here rather than unregistered */
++              input_free_device(ptr);
++              xenbus_dev_fatal(dev, ret, "input_register_device(ptr)");
++              goto error;
++      }
++      info->ptr = ptr;
++
++      ret = xenkbd_connect_backend(dev, info);
++      if (ret < 0)
++              goto error;
++
++      return 0;
++
++ error_nomem:
++      ret = -ENOMEM;
++      xenbus_dev_fatal(dev, ret, "allocating device memory");
++ error:
++      xenkbd_remove(dev);
++      return ret;
++}
++
++/*
++ * Resume callback: drop the current event-channel binding, reset the ring
++ * indices of the shared page and connect to the backend again.  Returns
++ * the result of xenkbd_connect_backend().
++ */
++static int xenkbd_resume(struct xenbus_device *dev)
++{
++      struct xenkbd_info *kbi = dev_get_drvdata(&dev->dev);
++      struct xenkbd_page *ring;
++
++      xenkbd_disconnect_backend(kbi);
++
++      ring = kbi->page;
++      ring->in_prod = 0;
++      ring->in_cons = 0;
++      ring->out_prod = 0;
++      ring->out_cons = 0;
++
++      return xenkbd_connect_backend(dev, kbi);
++}
++
++/*
++ * Tear down a keyboard/pointer device: unbind from the backend, unregister
++ * the input devices that were registered, and free the shared page and the
++ * info structure.  Also used by xenkbd_probe() to unwind a partially set
++ * up device, in which case kbd and/or ptr may still be NULL.
++ * Always returns 0.
++ */
++static int xenkbd_remove(struct xenbus_device *dev)
++{
++      struct xenkbd_info *info = dev_get_drvdata(&dev->dev);
++
++      xenkbd_disconnect_backend(info);
++      /* only devices that probe managed to register are recorded in info */
++      if (info->kbd)
++              input_unregister_device(info->kbd);
++      if (info->ptr)
++              input_unregister_device(info->ptr);
++      free_page((unsigned long)info->page);
++      kfree(info);
++      return 0;
++}
++
++/*
++ * Bind an event channel for the device and publish "page-ref" and
++ * "event-channel" under the device's xenstore node in a single
++ * transaction, then switch the frontend to XenbusStateInitialised.
++ *
++ * Returns 0 on success or a negative error code.
++ * NOTE(review): the error paths after the bind return with the irq still
++ * bound; it has been recorded in info->irq, so a later
++ * xenkbd_disconnect_backend() unbinds it.
++ */
++static int xenkbd_connect_backend(struct xenbus_device *dev,
++                                struct xenkbd_info *info)
++{
++      int ret;
++      struct xenbus_transaction xbt;
++
++      ret = bind_listening_port_to_irqhandler(
++              dev->otherend_id, input_handler, 0, "xenkbd", info);
++      if (ret < 0) {
++              xenbus_dev_fatal(dev, ret,
++                               "bind_listening_port_to_irqhandler");
++              return ret;
++      }
++      info->irq = ret;
++
++      /* The whole transaction is restarted from here on -EAGAIN. */
++ again:
++      ret = xenbus_transaction_start(&xbt);
++      if (ret) {
++              xenbus_dev_fatal(dev, ret, "starting transaction");
++              return ret;
++      }
++      ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
++                          virt_to_mfn(info->page));
++      if (ret)
++              goto error_xenbus;
++      ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
++                          irq_to_evtchn_port(info->irq));
++      if (ret)
++              goto error_xenbus;
++      ret = xenbus_transaction_end(xbt, 0);
++      if (ret) {
++              if (ret == -EAGAIN)
++                      goto again;
++              xenbus_dev_fatal(dev, ret, "completing transaction");
++              return ret;
++      }
++
++      xenbus_switch_state(dev, XenbusStateInitialised);
++      return 0;
++
++ error_xenbus:
++      /* second argument 1: presumably aborts the transaction -- TODO confirm */
++      xenbus_transaction_end(xbt, 1);
++      xenbus_dev_fatal(dev, ret, "writing xenstore");
++      return ret;
++}
++
++/* Unbind the event-channel irq, if one is bound, and mark it as unbound (-1). */
++static void xenkbd_disconnect_backend(struct xenkbd_info *info)
++{
++      int bound_irq = info->irq;
++
++      if (bound_irq >= 0)
++              unbind_from_irqhandler(bound_irq, info);
++
++      info->irq = -1;
++}
++
++/*
++ * xenbus callback invoked when the backend changes state.
++ *
++ * InitWait: request absolute pointer mode if the backend offers it, then
++ * move the frontend to Connected.  Connected: size the absolute axes from
++ * the backend's "width" and "height" keys.  Closing: close the frontend.
++ * All other backend states are ignored.
++ */
++static void xenkbd_backend_changed(struct xenbus_device *dev,
++                                 enum xenbus_state backend_state)
++{
++      struct xenkbd_info *info = dev_get_drvdata(&dev->dev);
++      const char *backend = info->xbdev->otherend;
++      int ret, val;
++
++      switch (backend_state) {
++      case XenbusStateInitWait:
++      InitWait:
++              if (xenbus_scanf(XBT_NIL, backend,
++                               "feature-abs-pointer", "%d", &val) < 0)
++                      val = 0;
++              if (val) {
++                      ret = xenbus_printf(XBT_NIL, info->xbdev->nodename,
++                                          "request-abs-pointer", "1");
++                      (void)ret; /* FIXME: a failed write is ignored */
++              }
++              xenbus_switch_state(dev, XenbusStateConnected);
++              break;
++
++      case XenbusStateConnected:
++              /*
++               * Work around xenbus race condition: If backend goes
++               * through InitWait to Connected fast enough, we can
++               * get Connected twice here.
++               */
++              if (dev->state != XenbusStateConnected)
++                      goto InitWait; /* no InitWait seen yet, fudge it */
++
++              /* Set input abs params to match backend screen res */
++              ret = xenbus_scanf(XBT_NIL, backend, "width", "%d", &val);
++              if (ret > 0)
++                      input_set_abs_params(info->ptr, ABS_X, 0, val, 0, 0);
++
++              ret = xenbus_scanf(XBT_NIL, backend, "height", "%d", &val);
++              if (ret > 0)
++                      input_set_abs_params(info->ptr, ABS_Y, 0, val, 0, 0);
++
++              break;
++
++      case XenbusStateClosing:
++              xenbus_frontend_closed(dev);
++              break;
++
++      case XenbusStateInitialising:
++      case XenbusStateInitialised:
++      case XenbusStateReconfiguring:
++      case XenbusStateReconfigured:
++      case XenbusStateUnknown:
++      case XenbusStateClosed:
++              break;
++      }
++}
++
++/*
++ * xenbus device types handled by this driver (referenced from
++ * xenkbd_driver.ids).  The trailing empty entry presumably terminates the
++ * table -- TODO confirm against struct xenbus_device_id users.
++ */
++static const struct xenbus_device_id xenkbd_ids[] = {
++      { "vkbd" },
++      { "" }
++};
++MODULE_ALIAS("xen:vkbd");
++
++/* xenbus driver description registered by xenkbd_init(). */
++static struct xenbus_driver xenkbd_driver = {
++      .name             = "vkbd",
++      .ids              = xenkbd_ids,
++      .probe            = xenkbd_probe,
++      .resume           = xenkbd_resume,
++      .remove           = xenkbd_remove,
++      .otherend_changed = xenkbd_backend_changed,
++};
++
++/*
++ * Module init: register the frontend driver.  Returns -ENODEV when not
++ * running on Xen, or when running in dom0 where there is nothing to do.
++ */
++static int __init xenkbd_init(void)
++{
++      if (!is_running_on_xen() || is_initial_xendomain())
++              return -ENODEV;
++
++      return xenbus_register_frontend(&xenkbd_driver);
++}
++
++/* Module exit: unregister the frontend driver. */
++static void __exit xenkbd_cleanup(void)
++{
++      /* no "return <expr>;" here: this function returns void */
++      xenbus_unregister_driver(&xenkbd_driver);
++}
++
++module_init(xenkbd_init);
++module_exit(xenkbd_cleanup);
++
++MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
++MODULE_LICENSE("GPL");
diff --cc drivers/xen/features.c

index 99eda16,99eda16..fece0f2
--- 1/drivers/xen/features.c
--- 2/drivers/xen/features.c
+++ b/drivers/xen/features.c
@@@ -9,14 -9,14 +9,21 @@@
   #include <linux/cache.h>
   #include <linux/module.h>
   
++#ifdef CONFIG_PARAVIRT_XEN
   #include <asm/xen/hypercall.h>
++#else
++#include <asm/hypervisor.h>
++#endif
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
   
   #include <xen/interface/xen.h>
   #include <xen/interface/version.h>
   #include <xen/features.h>
   
   u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
--EXPORT_SYMBOL_GPL(xen_features);
++EXPORT_SYMBOL(xen_features);
   
   void xen_setup_features(void)
   {
diff --cc drivers/xen/gntdev/Makefile

index 0000000,0000000..8bd8c62

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/gntdev/Makefile
@@@ -1,0 -1,0 +1,1 @@@
++# Build gntdev.o as selected by CONFIG_XEN_GRANT_DEV.
++obj-$(CONFIG_XEN_GRANT_DEV) := gntdev.o
diff --cc drivers/xen/gntdev/gntdev.c

index 0000000,0000000..cfd063b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/gntdev/gntdev.c
@@@ -1,0 -1,0 +1,1042 @@@
++/******************************************************************************
++ * gntdev.c
++ * 
++ * Device for accessing (in user-space) pages that have been granted by other
++ * domains.
++ *
++ * Copyright (c) 2006-2007, D G Murray.
++ * 
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ * 
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++ */
++
++#include <asm/atomic.h>
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/fs.h>
++#include <linux/device.h>
++#include <linux/mm.h>
++#include <linux/slab.h>
++#include <linux/mman.h>
++#include <asm/uaccess.h>
++#include <asm/io.h>
++#include <xen/gnttab.h>
++#include <asm/hypervisor.h>
++#include <xen/balloon.h>
++#include <xen/evtchn.h>
++#include <xen/driver_util.h>
++
++#include <linux/types.h>
++#include <xen/public/gntdev.h>
++
++
++#define DRIVER_AUTHOR "Derek G. Murray <Derek.Murray@cl.cam.ac.uk>"
++#define DRIVER_DESC   "User-space granted page access driver"
++
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR(DRIVER_AUTHOR);
++MODULE_DESCRIPTION(DRIVER_DESC);
++
++#define GNTDEV_NAME "gntdev"
++MODULE_ALIAS("devname:xen/" GNTDEV_NAME);
++
++#define MAX_GRANTS_LIMIT   1024
++#define DEFAULT_MAX_GRANTS 128
++
++/* A slot can be in one of three states:
++ *
++ * 0. GNTDEV_SLOT_INVALID:
++ *    This slot is not associated with a grant reference, and is therefore free
++ *    to be overwritten by a new grant reference.
++ *
++ * 1. GNTDEV_SLOT_NOT_YET_MAPPED:
++ *    This slot is associated with a grant reference (via the 
++ *    IOCTL_GNTDEV_MAP_GRANT_REF ioctl), but it has not yet been mmap()-ed.
++ *
++ * 2. GNTDEV_SLOT_MAPPED:
++ *    This slot is associated with a grant reference, and has been mmap()-ed.
++ */
++typedef enum gntdev_slot_state {
++      GNTDEV_SLOT_INVALID        = 0, /* free, may be overwritten */
++      GNTDEV_SLOT_NOT_YET_MAPPED = 1, /* has a grant ref, not mmap()-ed */
++      GNTDEV_SLOT_MAPPED         = 2  /* has a grant ref and is mmap()-ed */
++} gntdev_slot_state_t;
++
++/* Sentinel values; parenthesised so they expand safely inside any expression. */
++#define GNTDEV_INVALID_HANDLE    (-1)
++#define GNTDEV_FREE_LIST_INVALID (-1)
++/* Each opened instance of gntdev is associated with a list of grants,
++ * represented by an array of elements of the following type,
++ * gntdev_grant_info_t.
++ */
++typedef struct gntdev_grant_info {
++      /* which of the three slot states this entry is in */
++      gntdev_slot_state_t state;
++      union {
++              /* index of this slot's entry in the file's free_list
++               * (read by add_grant_references() before the slot is taken) */
++              uint32_t free_list_index;
++              struct {
++                      /* domid and ref are copied from the ioctl argument
++                       * when the slot is assigned a grant reference */
++                      domid_t domid;
++                      grant_ref_t ref;
++                      /* presumably the handles and bus address returned
++                       * when the grant is mapped -- TODO confirm */
++                      grant_handle_t kernel_handle;
++                      grant_handle_t user_handle;
++                      uint64_t dev_bus_addr;
++              } valid;
++      } u;
++} gntdev_grant_info_t;
++
++/* Private data structure, which is stored in the file pointer for files
++ * associated with this device.
++ */
++typedef struct gntdev_file_private_data {
++  
++      /* Array of grant information. */
++      gntdev_grant_info_t *grants;
++      /* Presumably the number of entries in the grants array -- TODO
++       * confirm against the code that allocates it.
++       */
++      uint32_t grants_size;
++
++      /* Read/write semaphore used to protect the grants array. */
++      struct rw_semaphore grants_sem;
++
++      /* An array of indices of free slots in the grants array.
++       * N.B. An entry in this list may temporarily have the value
++       * GNTDEV_FREE_LIST_INVALID if the corresponding slot has been removed
++       * from the list by the contiguous allocator, but the list has not yet
++       * been compressed. However, this is not visible across invocations of
++       * the device.
++       */
++      int32_t *free_list;
++      
++      /* The number of free slots in the grants array.
++       * add_grant_reference() takes its slot from the end of free_list
++       * and decrements this count.
++       */
++      uint32_t free_list_size;
++
++      /* Read/write semaphore used to protect the free list. */
++      struct rw_semaphore free_list_sem;
++      
++      /* Index of the next slot after the most recent contiguous allocation, 
++       * for use in a next-fit allocator.
++       */
++      uint32_t next_fit_index;
++
++      /* Used to map grants into the kernel, before mapping them into user
++       * space.  Indexed by slot (see get_kernel_vaddr()).
++       */
++      struct page **foreign_pages;
++
++} gntdev_file_private_data_t;
++
++/* Module lifecycle operations. */
++static int __init gntdev_init(void);
++static void __exit gntdev_exit(void);
++
++module_init(gntdev_init);
++module_exit(gntdev_exit);
++
++/* File operations. */
++static int gntdev_open(struct inode *inode, struct file *flip);
++static int gntdev_release(struct inode *inode, struct file *flip);
++static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma);
++static long gntdev_ioctl(struct file *flip,
++                       unsigned int cmd, unsigned long arg);
++
++/* File operations of the gntdev character device. */
++static const struct file_operations gntdev_fops = {
++      .owner          = THIS_MODULE,
++      .llseek         = no_llseek,
++      .open           = gntdev_open,
++      .release        = gntdev_release,
++      .unlocked_ioctl = gntdev_ioctl,
++      .mmap           = gntdev_mmap,
++};
++
++/* VM operations. */
++static void gntdev_vma_close(struct vm_area_struct *vma);
++static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr,
++                            pte_t *ptep, int is_fullmm);
++
++/* Callbacks for VM areas that map this device. */
++static struct vm_operations_struct gntdev_vmops = {
++      .zap_pte = gntdev_clear_pte,
++      .close   = gntdev_vma_close,
++};
++
++/* Global variables. */
++
++/* The driver major number, for use when unregistering the driver. */
++static int gntdev_major;
++
++/* Memory mapping functions
++ * ------------------------
++ *
++ * Every granted page is mapped into both kernel and user space, and the two
++ * following functions return the respective virtual addresses of these pages.
++ *
++ * When shadow paging is disabled, the granted page is mapped directly into
++ * user space; when it is enabled, it is mapped into the kernel and remapped
++ * into user space using vm_insert_page() (see gntdev_mmap(), below).
++ */
++
++/* Computes the user-space virtual address of page number @page_index
++ * within @vma, i.e. the area's start address plus @page_index pages.
++ */
++static inline unsigned long get_user_vaddr (struct vm_area_struct *vma,
++                                          int page_index)
++{
++      unsigned long area_start = (unsigned long) vma->vm_start;
++
++      return area_start + (page_index << PAGE_SHIFT);
++}
++
++/* Computes the kernel virtual address of the foreign page stored in slot
++ * @slot_index of the gntdev instance that owns @priv.
++ */
++static inline unsigned long get_kernel_vaddr (gntdev_file_private_data_t *priv,
++                                            int slot_index)
++{
++      struct page *foreign = priv->foreign_pages[slot_index];
++
++      return (unsigned long) pfn_to_kaddr(page_to_pfn(foreign));
++}
++
++/* Helper functions. */
++
++/* Records the grant reference described by @op in a free slot of the file's
++ * private data. The slot is taken from the end of the free list. On return,
++ * *offset holds the offset that should be mmap()-ed in order to map the
++ * grant reference. Returns non-zero on failure; this implementation always
++ * returns 0.
++ */
++static int add_grant_reference(gntdev_file_private_data_t *private_data,
++                             struct ioctl_gntdev_grant_ref *op,
++                             uint64_t *offset)
++{
++      uint32_t last, slot_index;
++      gntdev_grant_info_t *slot;
++
++      /* Pop the last entry of the free list and invalidate it. */
++      last = private_data->free_list_size - 1;
++      private_data->free_list_size = last;
++      slot_index = private_data->free_list[last];
++      private_data->free_list[last] = GNTDEV_FREE_LIST_INVALID;
++
++      /* Store the grant details in the chosen slot. */
++      slot = &private_data->grants[slot_index];
++      slot->state = GNTDEV_SLOT_NOT_YET_MAPPED;
++      slot->u.valid.domid = op->domid;
++      slot->u.valid.ref = op->ref;
++
++      /* The slot index, scaled to pages, is the offset into the virtual
++       * "file address space" of the device.
++       */
++      *offset = slot_index << PAGE_SHIFT;
++
++      return 0;
++}
++
++/* Fills the @count contiguous slots starting at @first_slot with the grant
++ * references in @ops, marking each slot GNTDEV_SLOT_NOT_YET_MAPPED. The
++ * free-list entry of every slot used is overwritten with
++ * GNTDEV_FREE_LIST_INVALID; the free-list size is left unchanged. @first_slot
++ * is assumed to come from find_contiguous_free_range() during the same
++ * invocation of the driver. Always returns 0.
++ */
++static int add_grant_references(gntdev_file_private_data_t *private_data,
++                              uint32_t count,
++                              struct ioctl_gntdev_grant_ref *ops,
++                              uint32_t first_slot)
++{
++      uint32_t n;
++
++      for (n = 0; n < count; ++n) {
++              gntdev_grant_info_t *slot =
++                      &private_data->grants[first_slot + n];
++              uint32_t list_pos = slot->u.free_list_index;
++
++              /* First, mark the slot's entry in the free list as invalid. */
++              private_data->free_list[list_pos] = GNTDEV_FREE_LIST_INVALID;
++
++              /* Then record the grant in the slot. */
++              slot->state = GNTDEV_SLOT_NOT_YET_MAPPED;
++              slot->u.valid.domid = ops[n].domid;
++              slot->u.valid.ref = ops[n].ref;
++      }
++
++      return 0;
++}
++
++/* Compacts the free list of @private_data in place: entries equal to
++ * GNTDEV_FREE_LIST_INVALID are dropped, the remaining entries are moved
++ * towards the front (keeping their order), the free_list_index of each moved
++ * slot is updated, and free_list_size is reduced to the number kept.
++ */
++static void compress_free_list(gntdev_file_private_data_t *private_data)
++{
++      uint32_t src, dst = 0, old_size = private_data->free_list_size;
++
++      for (src = 0; src < old_size; ++src) {
++              if (private_data->free_list[src] == GNTDEV_FREE_LIST_INVALID) {
++                      /* Dead entry: it only shrinks the list. */
++                      --private_data->free_list_size;
++                      continue;
++              }
++
++              /* Live entry: slide it down if a gap has opened below it. */
++              if (src > dst) {
++                      int32_t slot_index = private_data->free_list[src];
++                      private_data->free_list[dst] = slot_index;
++                      private_data->grants[slot_index].u.free_list_index =
++                              dst;
++                      private_data->free_list[src] = GNTDEV_FREE_LIST_INVALID;
++              }
++              ++dst;
++      }
++}
++
++/* Looks in slots [@first, @end) of @private_data's grant array for a run of
++ * @num_slots consecutive GNTDEV_SLOT_INVALID slots. Returns the index of the
++ * first slot of the run, or -ENOMEM if there is no such run. */
++static int find_free_run(gntdev_file_private_data_t *private_data,
++                       uint32_t first, uint32_t end, uint32_t num_slots)
++{
++      uint32_t i, range_start = 0, range_length = 0;
++
++      for (i = first; i < end; ++i) {
++              if (private_data->grants[i].state != GNTDEV_SLOT_INVALID) {
++                      /* A used slot breaks the run: start over. */
++                      range_length = 0;
++                      continue;
++              }
++              if (range_length == 0)
++                      range_start = i;
++              if (++range_length == num_slots)
++                      return range_start;
++      }
++
++      return -ENOMEM;
++}
++
++/* Searches the grant array in @private_data for @num_slots contiguous slots
++ * in the GNTDEV_SLOT_INVALID state, first from next_fit_index to the end of
++ * the array and then from the start of the array up to next_fit_index.
++ *
++ * Returns the index of the first slot if a range is found, otherwise -ENOMEM.
++ */
++static int find_contiguous_free_range(gntdev_file_private_data_t *private_data,
++                                    uint32_t num_slots)
++{
++      uint32_t start_index = private_data->next_fit_index;
++      int rc;
++
++      rc = find_free_run(private_data, start_index,
++                         private_data->grants_size, num_slots);
++      if (rc < 0)
++              rc = find_free_run(private_data, 0, start_index, num_slots);
++      return rc;
++}
++
++/* Allocates the state for @max_grants grant slots in @priv: the pages that
++ * back the mappings, the grant array and the free list (initially holding
++ * every slot). Returns 0, or -ENOMEM with everything freed and the pointers
++ * in @priv reset to NULL so that @priv still reads as uninitialised. */
++static int init_private_data(gntdev_file_private_data_t *priv,
++                           uint32_t max_grants)
++{
++      uint32_t i;
++
++      priv->foreign_pages = alloc_empty_pages_and_pagevec(max_grants);
++      if (!priv->foreign_pages)
++              goto nomem_out;
++      priv->grants = kmalloc(max_grants * sizeof(*priv->grants), GFP_KERNEL);
++      if (!priv->grants)
++              goto nomem_out2;
++      priv->free_list = kmalloc(max_grants * sizeof(int32_t), GFP_KERNEL);
++      if (!priv->free_list)
++              goto nomem_out3;
++
++      /* Initialise the free-list, which contains all slots at first. */
++      for (i = 0; i < max_grants; ++i) {
++              priv->free_list[max_grants - i - 1] = i;
++              priv->grants[i].state = GNTDEV_SLOT_INVALID;
++              priv->grants[i].u.free_list_index = max_grants - i - 1;
++      }
++      priv->grants_size = max_grants;
++      priv->free_list_size = max_grants;
++      priv->next_fit_index = 0;
++
++      return 0;
++
++nomem_out3:
++      kfree(priv->grants);
++      priv->grants = NULL;
++nomem_out2:
++      free_empty_pages_and_pagevec(priv->foreign_pages, max_grants);
++      priv->foreign_pages = NULL;
++nomem_out:
++      return -ENOMEM;
++}
++
++/* Interface functions. */
++/* Returns a kstrdup()ed "xen/" GNTDEV_NAME; @dev and @mode are unused. */
++static char *gntdev_devnode(struct device *dev, mode_t *mode)
++{
++      return kstrdup("xen/" GNTDEV_NAME, GFP_KERNEL);
++}
++/* Device type handed to xen_class_device_create() by gntdev_init(). */
++static struct device_type gntdev_type = {
++      .devnode = gntdev_devnode
++};
++
++/* Initialises the driver. Called when the module is loaded. Returns -ENODEV
++ * when not running on Xen, or the error from __register_chrdev(); failure to
++ * create the class device is only logged. */
++static int __init gntdev_init(void)
++{
++      struct device *device;
++
++      if (!is_running_on_xen()) {
++              pr_err("You must be running Xen to use gntdev\n");
++              return -ENODEV;
++      }
++
++      gntdev_major = __register_chrdev(0, 0, 1, GNTDEV_NAME, &gntdev_fops);
++      if (gntdev_major < 0) {
++              pr_err("Could not register gntdev device\n");
++              return gntdev_major;
++      }
++
++      /* Note that if the sysfs code fails, we will still initialise the
++       * device, and output the major number so that the device can be
++       * created manually using mknod.
++       */
++      device = xen_class_device_create(&gntdev_type, NULL,
++                                       MKDEV(gntdev_major, 0),
++                                       NULL, GNTDEV_NAME);
++      if (IS_ERR(device)) {
++              pr_err("Error creating gntdev device in xen_class\n");
++              pr_err("gntdev created, major number = %d\n", gntdev_major);
++      }
++
++      return 0;
++}
++
++/* Cleans up and unregisters the driver. Called when the driver is unloaded.
++ * The device is destroyed only if get_xen_class() returns a class. */
++static void __exit gntdev_exit(void)
++{
++      struct class *xen_class = get_xen_class();
++      if (xen_class != NULL)
++              device_destroy(xen_class, MKDEV(gntdev_major, 0));
++      __unregister_chrdev(gntdev_major, 0, 1, GNTDEV_NAME);
++}
++
++/* Called when the device is opened. Takes a module reference (dropped by
++ * gntdev_release()) and allocates the per-file private data, whose arrays
++ * stay NULL until gntdev_ioctl() initialises them. Returns 0 or an errno. */
++static int gntdev_open(struct inode *inode, struct file *flip)
++{
++      gntdev_file_private_data_t *private_data;
++
++      nonseekable_open(inode, flip);
++
++      if (!try_module_get(THIS_MODULE))
++              return -ENODEV;
++
++      private_data = kmalloc(sizeof(*private_data), GFP_KERNEL);
++      if (!private_data) {
++              /* Undo the module reference taken above. */
++              module_put(THIS_MODULE);
++              return -ENOMEM;
++      }
++
++      /* These will be lazily initialised by init_private_data. */
++      private_data->grants = NULL;
++      private_data->free_list = NULL;
++      private_data->foreign_pages = NULL;
++      init_rwsem(&private_data->grants_sem);
++      init_rwsem(&private_data->free_list_sem);
++
++      flip->private_data = private_data;
++      return 0;
++}
++
++/* Called when the device is closed. Frees the page vector, the grant array,
++ * the free list and the private data itself, then drops the module reference
++ * taken in gntdev_open(). */
++static int gntdev_release(struct inode *inode, struct file *flip)
++{
++      gntdev_file_private_data_t *priv = flip->private_data;
++
++      if (priv) {
++              /* The page vector only exists once initialisation has run. */
++              if (priv->foreign_pages)
++                      free_empty_pages_and_pagevec(priv->foreign_pages,
++                                                   priv->grants_size);
++              /* kfree() ignores NULL, so no checks are needed here. */
++              kfree(priv->grants);
++              kfree(priv->free_list);
++              kfree(priv);
++      }
++      module_put(THIS_MODULE);
++      return 0;
++}
++
++/* Called when an attempt is made to mmap() the device. The private data from
++ * @flip contains the list of grant references that can be mapped. The vm_pgoff
++ * field of @vma contains the index into that list of the first grant
++ * reference to map; one slot is mapped per page of @vma and every slot must
++ * be in the NOT_YET_MAPPED state. Returns 0 on success, -EINVAL or -ENXIO for
++ * a bad request, -EAGAIN when the kernel mapping fails with GNTST_eagain and
++ * -ENOMEM for any other failure.
++ */
++static int gntdev_mmap (struct file *flip, struct vm_area_struct *vma)
++{
++      struct gnttab_map_grant_ref op;
++      unsigned long slot_index = vma->vm_pgoff;
++      unsigned long kernel_vaddr, user_vaddr;
++      uint32_t size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++      uint64_t ptep;
++      int ret, exit_ret;
++      int flags;
++      int i;
++      struct page *page;
++      gntdev_file_private_data_t *private_data = flip->private_data;
++
++      if (unlikely(!private_data)) {
++              pr_err("file's private data is NULL\n");
++              return -EINVAL;
++      }
++
++      /* Test to make sure that the grants array has been initialised. */
++      down_read(&private_data->grants_sem);
++      if (unlikely(!private_data->grants)) {
++              up_read(&private_data->grants_sem);
++              pr_err("attempted to mmap before ioctl\n");
++              return -EINVAL;
++      }
++      up_read(&private_data->grants_sem);
++
++      if (unlikely((size <= 0) ||
++                   (size + slot_index) > private_data->grants_size)) {
++              pr_err("Invalid number of pages or offset"
++                     "(num_pages = %d, first_slot = %ld)\n",
++                     size, slot_index);
++              return -ENXIO;
++      }
++
++      if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) {
++              pr_err("writable mappings must be shared\n");
++              return -EINVAL;
++      }
++
++      /* Slots must be in the NOT_YET_MAPPED state. */
++      down_write(&private_data->grants_sem);
++      for (i = 0; i < size; ++i) {
++              if (private_data->grants[slot_index + i].state !=
++                  GNTDEV_SLOT_NOT_YET_MAPPED) {
++                      pr_err("Slot (index = %ld) is in the wrong "
++                             "state (%d)\n", slot_index + i,
++                             private_data->grants[slot_index + i].state);
++                      up_write(&private_data->grants_sem);
++                      return -EINVAL;
++              }
++      }
++
++      /* Install the hook for unmapping. */
++      vma->vm_ops = &gntdev_vmops;
++
++      /* The VM area contains pages from another VM. */
++      vma->vm_flags |= VM_FOREIGN;
++      vma->vm_private_data = kzalloc(size * sizeof(page), GFP_KERNEL);
++      if (vma->vm_private_data == NULL) {
++              pr_err("couldn't allocate mapping structure for VM area\n");
++              /* grants_sem was taken for the state check above. */
++              up_write(&private_data->grants_sem);
++              return -ENOMEM;
++      }
++
++      /* This flag prevents Bad PTE errors when the memory is unmapped. */
++      vma->vm_flags |= VM_RESERVED;
++
++      /* This flag prevents this VM area being copied on a fork(). A better
++       * behaviour might be to explicitly carry out the appropriate mappings
++       * on fork(), but I don't know if there's a hook for this.
++       */
++      vma->vm_flags |= VM_DONTCOPY;
++
++#ifdef CONFIG_X86
++      /* This flag ensures that the page tables are not unpinned before the
++       * VM area is unmapped. Therefore Xen still recognises the PTE as
++       * belonging to an L1 pagetable, and the grant unmap operation will
++       * succeed, even if the process does not exit cleanly.
++       */
++      vma->vm_mm->context.has_foreign_mappings = 1;
++#endif
++
++      exit_ret = -ENOMEM;
++      for (i = 0; i < size; ++i) {
++
++              flags = GNTMAP_host_map;
++              if (!(vma->vm_flags & VM_WRITE))
++                      flags |= GNTMAP_readonly;
++
++              kernel_vaddr = get_kernel_vaddr(private_data, slot_index + i);
++              user_vaddr = get_user_vaddr(vma, i);
++              page = private_data->foreign_pages[slot_index + i];
++
++              gnttab_set_map_op(&op, kernel_vaddr, flags,
++                                private_data->grants[slot_index+i]
++                                .u.valid.ref,
++                                private_data->grants[slot_index+i]
++                                .u.valid.domid);
++
++              /* Carry out the mapping of the grant reference. */
++              ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++                                              &op, 1);
++              BUG_ON(ret);
++              if (op.status != GNTST_okay) {
++                      if (op.status != GNTST_eagain)
++                              pr_err("Error mapping the grant reference "
++                                     "into the kernel (%d). domid = %d; ref = %d\n",
++                                     op.status,
++                                     private_data->grants[slot_index+i]
++                                     .u.valid.domid,
++                                     private_data->grants[slot_index+i]
++                                     .u.valid.ref);
++                      else
++                              /* Propagate eagain instead of trying to fix it up */
++                              exit_ret = -EAGAIN;
++                      goto undo_map_out;
++              }
++
++              /* Store a reference to the page that will be mapped into user
++               * space.
++               */
++              ((struct page **) vma->vm_private_data)[i] = page;
++
++              /* Mark mapped page as reserved. */
++              SetPageReserved(page);
++
++              /* Record the grant handle, for use in the unmap operation. */
++              private_data->grants[slot_index+i].u.valid.kernel_handle =
++                      op.handle;
++              private_data->grants[slot_index+i].u.valid.dev_bus_addr =
++                      op.dev_bus_addr;
++
++              private_data->grants[slot_index+i].state = GNTDEV_SLOT_MAPPED;
++              private_data->grants[slot_index+i].u.valid.user_handle =
++                      GNTDEV_INVALID_HANDLE;
++
++              /* Now perform the mapping to user space. */
++              if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++
++                      /* NOT USING SHADOW PAGE TABLES. */
++                      /* In this case, we map the grant(s) straight into user
++                       * space. */
++
++                      /* Get the machine address of the PTE for the user page. */
++                      if ((ret = create_lookup_pte_addr(vma->vm_mm,
++                                                        vma->vm_start
++                                                        + (i << PAGE_SHIFT),
++                                                        &ptep)))
++                      {
++                              pr_err("Error obtaining PTE pointer (%d)\n",
++                                     ret);
++                              goto undo_map_out;
++                      }
++
++                      /* Configure the map operation. */
++
++                      /* The reference is to be used by host CPUs. */
++                      flags = GNTMAP_host_map;
++
++                      /* Specifies a user space mapping. */
++                      flags |= GNTMAP_application_map;
++
++                      /* The map request contains the machine address of the
++                       * PTE to update.
++                       */
++                      flags |= GNTMAP_contains_pte;
++
++                      if (!(vma->vm_flags & VM_WRITE))
++                              flags |= GNTMAP_readonly;
++
++                      gnttab_set_map_op(&op, ptep, flags,
++                                        private_data->grants[slot_index+i]
++                                        .u.valid.ref,
++                                        private_data->grants[slot_index+i]
++                                        .u.valid.domid);
++
++                      /* Carry out the mapping of the grant reference. */
++                      ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++                                                      &op, 1);
++                      BUG_ON(ret);
++                      if (op.status != GNTST_okay) {
++                              pr_err("Error mapping the grant "
++                                     "reference into user space (%d). domid "
++                                     "= %d; ref = %d\n", op.status,
++                                     private_data->grants[slot_index+i].u
++                                     .valid.domid,
++                                     private_data->grants[slot_index+i].u
++                                     .valid.ref);
++                              /* This should never happen after we've mapped into
++                              * the kernel space. */
++                              BUG_ON(op.status == GNTST_eagain);
++                              goto undo_map_out;
++                      }
++
++                      /* Record the grant handle, for use in the unmap
++                       * operation.
++                       */
++                      private_data->grants[slot_index+i].u.
++                              valid.user_handle = op.handle;
++
++                      /* Update p2m structure with the new mapping. */
++                      set_phys_to_machine(__pa(kernel_vaddr) >> PAGE_SHIFT,
++                                          FOREIGN_FRAME(private_data->
++                                                        grants[slot_index+i]
++                                                        .u.valid.dev_bus_addr
++                                                        >> PAGE_SHIFT));
++              } else {
++                      /* USING SHADOW PAGE TABLES. */
++                      /* In this case, we simply insert the page into the VM
++                       * area. */
++                      ret = vm_insert_page(vma, user_vaddr, page);
++              }
++
++      }
++      exit_ret = 0;
++
++      up_write(&private_data->grants_sem);
++      return exit_ret;
++
++undo_map_out:
++      /* If we have a mapping failure, the unmapping will be taken care of
++       * by do_mmap_pgoff(), which will eventually call gntdev_clear_pte().
++       * All we need to do here is free the vma_private_data.
++       */
++      kfree(vma->vm_private_data);
++
++      /* THIS IS VERY UNPLEASANT: do_mmap_pgoff() will set the vma->vm_file
++       * to NULL on failure. However, we need this in gntdev_clear_pte() to
++       * unmap the grants. Therefore, we smuggle a reference to the file's
++       * private data in the VM area's private data pointer.
++       */
++      vma->vm_private_data = private_data;
++
++      up_write(&private_data->grants_sem);
++
++      return exit_ret;
++}
++
++/* zap_pte hook (see gntdev_vmops): removes the grant mapping, if any, that
++ * backs the user PTE @ptep at @addr in @vma; returns the PTE value in `copy`. */
++static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr,
++                            pte_t *ptep, int is_fullmm)
++{
++      int slot_index, ret;
++      pte_t copy;
++      struct gnttab_unmap_grant_ref op;
++      gntdev_file_private_data_t *private_data;
++
++      /* Normally the private data is reached through vma->vm_file. After a
++       * failed gntdev_mmap() vm_file may be NULL, in which case the pointer
++       * smuggled into vma->vm_private_data by gntdev_mmap() is used. */
++      if (vma->vm_file) {
++              private_data = (gntdev_file_private_data_t *)
++                      vma->vm_file->private_data;
++      } else if (vma->vm_private_data) {
++              private_data = (gntdev_file_private_data_t *)
++                      vma->vm_private_data;
++      } else {
++              private_data = NULL; /* gcc warning */
++              BUG();
++      }
++
++      /* Calculate the grant relating to this PTE. */
++      slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
++
++      /* Only unmap grants if the slot has been mapped. This could be being
++       * called from a failing mmap().
++       */
++      if (private_data->grants[slot_index].state == GNTDEV_SLOT_MAPPED) {
++
++              /* First, we clear the user space mapping, if it has been made.
++               */
++              if (private_data->grants[slot_index].u.valid.user_handle !=
++                  GNTDEV_INVALID_HANDLE && 
++                  !xen_feature(XENFEAT_auto_translated_physmap)) {
++                      /* NOT USING SHADOW PAGE TABLES. */
++
++                      /* Copy the existing value of the PTE for returning. */
++                      copy = *ptep;
++
++                      gnttab_set_unmap_op(&op, ptep_to_machine(ptep), 
++                                          GNTMAP_contains_pte,
++                                          private_data->grants[slot_index]
++                                          .u.valid.user_handle);
++                      ret = HYPERVISOR_grant_table_op(
++                              GNTTABOP_unmap_grant_ref, &op, 1);
++                      BUG_ON(ret);
++                      if (op.status != GNTST_okay)
++                              pr_warning("User unmap grant status = %d\n",
++                                         op.status);
++              } else {
++                      /* SHADOW PAGE TABLES, or no user grant handle recorded. */
++                      copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
++              }
++
++              /* Finally, we unmap the grant from kernel space. */
++              gnttab_set_unmap_op(&op, 
++                                  get_kernel_vaddr(private_data, slot_index),
++                                  GNTMAP_host_map, 
++                                  private_data->grants[slot_index].u.valid
++                                  .kernel_handle);
++              ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, 
++                                              &op, 1);
++              BUG_ON(ret);
++              if (op.status != GNTST_okay)
++                      pr_warning("Kernel unmap grant status = %d\n",
++                                 op.status);
++
++
++              /* Return slot to the not-yet-mapped state, so that it may be
++               * mapped again, or removed by a subsequent ioctl.
++               */
++              private_data->grants[slot_index].state = 
++                      GNTDEV_SLOT_NOT_YET_MAPPED;
++
++              /* Invalidate the physical to machine mapping for this page. */
++              set_phys_to_machine(
++                      page_to_pfn(private_data->foreign_pages[slot_index]),
++                      INVALID_P2M_ENTRY);
++
++      } else {
++              copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
++      }
++
++      return copy;
++}
++
++/* "Destructor" for a VM area: frees whatever vm_private_data points to.
++ */
++static void gntdev_vma_close(struct vm_area_struct *vma)
++{
++      /* kfree() accepts NULL, so no explicit check is required. */
++      kfree(vma->vm_private_data);
++}
++
++/* Called when an ioctl is made on the device. Unless @cmd is
++ * IOCTL_GNTDEV_SET_MAX_GRANTS, the grant array and free-list are first set up
++ * with DEFAULT_MAX_GRANTS slots if that has not been done yet. Returns 0 on
++ * success, -ENOIOCTLCMD for an unknown @cmd, or another negative errno on
++ * failure.
++ */
++static long gntdev_ioctl(struct file *flip,
++                       unsigned int cmd, unsigned long arg)
++{
++      int rc = 0;
++      gntdev_file_private_data_t *private_data =
++              (gntdev_file_private_data_t *) flip->private_data;
++
++      /* On the first invocation, we will lazily initialise the grant array
++       * and free-list. The test is repeated under grants_sem. */
++      if (unlikely(!private_data->grants)
++          && likely(cmd != IOCTL_GNTDEV_SET_MAX_GRANTS)) {
++              down_write(&private_data->grants_sem);
++
++              if (unlikely(private_data->grants)) {
++                      up_write(&private_data->grants_sem);
++                      goto private_data_initialised;
++              }
++
++              /* Just use the default. Setting to a non-default is handled
++               * in the ioctl switch. */
++              rc = init_private_data(private_data, DEFAULT_MAX_GRANTS);
++
++              up_write(&private_data->grants_sem);
++
++              if (rc) {
++                      pr_err("Initialising gntdev private data failed\n");
++                      return rc;
++              }
++      }
++
++private_data_initialised:
++      switch (cmd) {
++      case IOCTL_GNTDEV_MAP_GRANT_REF:
++      {
++              struct ioctl_gntdev_map_grant_ref op;
++              struct ioctl_gntdev_grant_ref *refs = NULL;
++
++              if (copy_from_user(&op, (void __user *)arg, sizeof(op)))
++                      return -EFAULT;
++              if (unlikely(op.count <= 0))
++                      return -EINVAL;
++
++              if (op.count > 1 && op.count <= private_data->grants_size) {
++                      struct ioctl_gntdev_grant_ref *u;
++
++                      refs = kmalloc(op.count * sizeof(*refs), GFP_KERNEL);
++                      if (!refs)
++                              return -ENOMEM;
++                      u = ((struct ioctl_gntdev_map_grant_ref *)arg)->refs;
++                      if (copy_from_user(refs, (void __user *)u,
++                                         sizeof(*refs) * op.count)) {
++                              kfree(refs);
++                              return -EFAULT;
++                      }
++              }
++
++              down_write(&private_data->grants_sem);
++              down_write(&private_data->free_list_sem);
++
++              if (unlikely(op.count > private_data->free_list_size)) {
++                      rc = -ENOMEM;
++                      goto map_out;
++              }
++
++              if (op.count == 1) {
++                      if ((rc = add_grant_reference(private_data, op.refs,
++                                                    &op.index)) < 0) {
++                              pr_err("Adding grant reference failed (%d)\n",
++                                     rc);
++                              goto map_out;
++                      }
++              } else {
++                      if ((rc = find_contiguous_free_range(private_data,
++                                                           op.count)) < 0) {
++                              pr_err("Finding contiguous range failed"
++                                     " (%d)\n", rc);
++                              goto map_out;
++                      }
++                      op.index = rc << PAGE_SHIFT;
++                      if ((rc = add_grant_references(private_data, op.count,
++                                                     refs, rc))) {
++                              pr_err("Adding grant references failed (%d)\n",
++                                     rc);
++                              goto map_out;
++                      }
++                      compress_free_list(private_data);
++              }
++
++      map_out:
++              up_write(&private_data->free_list_sem);
++              up_write(&private_data->grants_sem);
++
++              kfree(refs);
++
++              if (!rc && copy_to_user((void __user *)arg, &op, sizeof(op)))
++                      rc = -EFAULT;
++              return rc;
++      }
++      case IOCTL_GNTDEV_UNMAP_GRANT_REF:
++      {
++              struct ioctl_gntdev_unmap_grant_ref op;
++              uint32_t i, start_index;
++
++              if (copy_from_user(&op, (void __user *)arg, sizeof(op)))
++                      return -EFAULT;
++
++              start_index = op.index >> PAGE_SHIFT;
++              /* Written so that start_index + op.count cannot wrap. */
++              if (start_index > private_data->grants_size ||
++                  op.count > private_data->grants_size - start_index)
++                      return -EINVAL;
++
++              down_write(&private_data->grants_sem);
++
++              /* All slots must be in the NOT_YET_MAPPED state. */
++              for (i = 0; i < op.count; ++i) {
++                      if (unlikely
++                          (private_data->grants[start_index + i].state
++                           != GNTDEV_SLOT_NOT_YET_MAPPED)) {
++                              if (private_data->grants[start_index + i].state
++                                  == GNTDEV_SLOT_INVALID) {
++                                      pr_err("Tried to remove an invalid "
++                                             "grant at offset 0x%x.",
++                                             (start_index + i)
++                                             << PAGE_SHIFT);
++                                      rc = -EINVAL;
++                              } else {
++                                      pr_err("Tried to remove a grant which "
++                                             "is currently mmap()-ed at "
++                                             "offset 0x%x.",
++                                             (start_index + i)
++                                             << PAGE_SHIFT);
++                                      rc = -EBUSY;
++                              }
++                              goto unmap_out;
++                      }
++              }
++
++              down_write(&private_data->free_list_sem);
++
++              /* Invalidate the slots and append them to the free list. */
++              for (i = 0; i < op.count; ++i) {
++                      private_data->grants[start_index+i].state =
++                              GNTDEV_SLOT_INVALID;
++                      private_data->grants[start_index+i].u.free_list_index =
++                              private_data->free_list_size;
++                      private_data->free_list[private_data->free_list_size] =
++                              start_index + i;
++                      ++private_data->free_list_size;
++              }
++
++              up_write(&private_data->free_list_sem);
++      unmap_out:
++              up_write(&private_data->grants_sem);
++              return rc;
++      }
++      case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
++      {
++              struct ioctl_gntdev_get_offset_for_vaddr op;
++              struct vm_area_struct *vma;
++              unsigned long vaddr;
++
++              if (copy_from_user(&op, (void __user *)arg, sizeof(op)))
++                      return -EFAULT;
++
++              vaddr = (unsigned long)op.vaddr;
++
++              down_read(&current->mm->mmap_sem);
++              vma = find_vma(current->mm, vaddr);
++              if (!vma || vma->vm_ops != &gntdev_vmops) {
++                      rc = -EFAULT;
++                      goto get_offset_out;
++              }
++              if (vma->vm_start != vaddr) {
++                      pr_err("The vaddr specified in an "
++                             "IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR must be at "
++                             "the start of the VM area. vma->vm_start = "
++                             "%#lx; vaddr = %#lx\n",
++                             vma->vm_start, vaddr);
++                      rc = -EFAULT;
++                      goto get_offset_out;
++              }
++              op.offset = vma->vm_pgoff << PAGE_SHIFT;
++              op.count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++      get_offset_out:
++              up_read(&current->mm->mmap_sem);
++              if (!rc && copy_to_user((void __user *)arg, &op, sizeof(op)))
++                      rc = -EFAULT;
++              return rc;
++      }
++      case IOCTL_GNTDEV_SET_MAX_GRANTS:
++      {
++              struct ioctl_gntdev_set_max_grants op;
++
++              if (copy_from_user(&op, (void __user *)arg, sizeof(op)))
++                      return -EFAULT;
++              if (op.count > MAX_GRANTS_LIMIT)
++                      return -EINVAL;
++
++              down_write(&private_data->grants_sem);
++              if (unlikely(private_data->grants))
++                      rc = -EBUSY;
++              else
++                      rc = init_private_data(private_data, op.count);
++              up_write(&private_data->grants_sem);
++              return rc;
++      }
++      default:
++              return -ENOIOCTLCMD;
++      }
++      return 0;
++}
diff --cc drivers/xen/netback/Makefile

index 0000000,0000000..2bb2677

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netback/Makefile
@@@ -1,0 -1,0 +1,5 @@@
++obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
++obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
++
++netbk-y   := netback.o xenbus.o interface.o accel.o
++netloop-y := loopback.o
diff --cc drivers/xen/netback/accel.c

index 0000000,0000000..b8782c4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netback/accel.c
@@@ -1,0 -1,0 +1,269 @@@
++/******************************************************************************
++ * drivers/xen/netback/accel.c
++ *
++ * Interface between backend virtual network device and accelerated plugin. 
++ * 
++ * Copyright (C) 2007 Solarflare Communications, Inc
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/list.h>
++#include <asm/atomic.h>
++#include <xen/xenbus.h>
++#include <linux/mutex.h>
++
++#include "common.h"
++
++#if 0
++#undef DPRINTK
++#define DPRINTK(fmt, args...)                                         \
++      printk("netback/accel (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
++#endif
++
++/* 
++ * A list of available netback accelerator plugin modules (each list
++ * entry is of type struct netback_accelerator) 
++ */ 
++static struct list_head accelerators_list;
++/* Lock used to protect access to accelerators_list */
++DEFINE_MUTEX(accelerators_mutex);
++
++/* 
++ * Compare a backend to an accelerator, and decide if they are
++ * compatible (i.e. if the accelerator should be used by the
++ * backend) 
++ */
++static int match_accelerator(struct xenbus_device *xendev,
++                           struct backend_info *be,
++                           struct netback_accelerator *accelerator)
++{
++      /* Interface name requested through the device's "accel" node. */
++      char *wanted = xenbus_read(XBT_NIL, xendev->nodename, "accel", NULL);
++      int matched;
++
++      if (IS_ERR(wanted)) {
++              /* Probably means not present */
++              DPRINTK("%s: no match due to xenbus_read accel error %ld\n",
++                      __FUNCTION__, PTR_ERR(wanted));
++              return 0;
++      }
++
++      /* 1 when the requested name is the one this accelerator serves. */
++      matched = (strcmp(wanted, accelerator->eth_name) == 0);
++      kfree(wanted);
++      return matched;
++}
++
++
++/*
++ * Bind 'accelerator' to backend 'be' and call the plugin's probe hook for
++ * 'xendev'.  When the hook returns non-zero the binding is undone again:
++ * the use count is dropped, module_put() is called on the hooks' owner and
++ * be->accelerator is reset to NULL.
++ */
++static void do_probe(struct backend_info *be,
++                   struct netback_accelerator *accelerator,
++                   struct xenbus_device *xendev)
++{
++      be->accelerator = accelerator;
++      atomic_inc(&be->accelerator->use_count);
++
++      if (be->accelerator->hooks->probe(xendev) == 0)
++              return;
++
++      /* Probe hook refused the device: roll the binding back. */
++      atomic_dec(&be->accelerator->use_count);
++      module_put(be->accelerator->hooks->owner);
++      be->accelerator = NULL;
++}
++
++
++/*
++ * Notify suitable backends that a new accelerator is available and
++ * connected.  This will also notify the accelerator plugin module
++ * that it is being used for a device through the probe hook.
++ */
++static int netback_accelerator_probe_backend(struct device *dev, void *arg)
++{
++      struct netback_accelerator *accelerator = arg;
++      struct xenbus_device *xendev = to_xenbus_device(dev);
++      struct backend_info *be;
++
++      /* Only "vif" devices can be accelerated. */
++      if (strcmp("vif", xendev->devicetype) != 0)
++              return 0;
++
++      be = dev_get_drvdata(&xendev->dev);
++      if (!match_accelerator(xendev, be, accelerator))
++              return 0;
++
++      /* Take a module reference before handing the device to the plugin. */
++      if (try_module_get(accelerator->hooks->owner))
++              do_probe(be, accelerator, xendev);
++
++      /* Always 0, whether or not the device was handed to the plugin. */
++      return 0;
++}
++
++
++/*
++ * Notify suitable backends that an accelerator is unavailable.
++ */
++static int netback_accelerator_remove_backend(struct device *dev, void *arg)
++{
++      struct netback_accelerator *accelerator = arg;
++      struct xenbus_device *xendev = to_xenbus_device(dev);
++      struct backend_info *be;
++
++      /* Only "vif" devices ever get an accelerator attached. */
++      if (strcmp("vif", xendev->devicetype) != 0)
++              return 0;
++
++      be = dev_get_drvdata(&xendev->dev);
++      if (be->accelerator != accelerator)
++              return 0;
++
++      /* Tell the plugin, then drop the use count and module reference. */
++      be->accelerator->hooks->remove(xendev);
++      atomic_dec(&be->accelerator->use_count);
++      module_put(be->accelerator->hooks->owner);
++      be->accelerator = NULL;
++
++      return 0;
++}
++
++
++
++/*
++ * Entry point for a netback accelerator plugin module.  Called to
++ * advertise its presence, and connect to any suitable backends.
++ */
++/*
++ * Register an accelerator plugin and offer it to every existing backend.
++ *
++ * version:  interface version the plugin was built against.
++ * id:       plugin-chosen identifier, stored in the accelerator record.
++ * eth_name: name of the accelerated interface; a private copy is kept.
++ * hooks:    plugin callbacks; the pointer itself is stored, not copied.
++ *
++ * Returns 0 on success, NETBACK_ACCEL_VERSION (> 0) when the caller's
++ * version is newer than ours, -EPROTO when it is older, and -ENOMEM when
++ * an allocation fails.
++ */
++int netback_connect_accelerator(unsigned version, int id, const char *eth_name,
++                              struct netback_accel_hooks *hooks)
++{
++      struct netback_accelerator *new_accelerator;
++
++      if (version != NETBACK_ACCEL_VERSION) {
++              if (version > NETBACK_ACCEL_VERSION) {
++                      /* Caller has higher version number, leave it
++                         up to them to decide whether to continue.
++                         They can recall with a lower number if
++                         they're happy to be compatible with us */
++                      return NETBACK_ACCEL_VERSION;
++              } else {
++                      /* We have a more recent version than caller.
++                         Currently reject, but may in future be able
++                         to be backwardly compatible */
++                      return -EPROTO;
++              }
++      }
++
++      new_accelerator = kmalloc(sizeof(*new_accelerator), GFP_KERNEL);
++      if (!new_accelerator) {
++              DPRINTK("%s: failed to allocate memory for accelerator\n",
++                      __FUNCTION__);
++              return -ENOMEM;
++      }
++
++      new_accelerator->id = id;
++
++      /*
++       * kstrdup() replaces the open-coded strlen/kmalloc/strlcpy sequence;
++       * it also returns NULL for a NULL eth_name instead of dereferencing
++       * it, so that case now fails with -ENOMEM.
++       */
++      new_accelerator->eth_name = kstrdup(eth_name, GFP_KERNEL);
++      if (!new_accelerator->eth_name) {
++              DPRINTK("%s: failed to allocate memory for eth_name string\n",
++                      __FUNCTION__);
++              kfree(new_accelerator);
++              return -ENOMEM;
++      }
++
++      new_accelerator->hooks = hooks;
++
++      atomic_set(&new_accelerator->use_count, 0);
++
++      mutex_lock(&accelerators_mutex);
++      list_add(&new_accelerator->link, &accelerators_list);
++
++      /* tell existing backends about new plugin */
++      xenbus_for_each_backend(new_accelerator,
++                              netback_accelerator_probe_backend);
++
++      mutex_unlock(&accelerators_mutex);
++
++      return 0;
++
++}
++EXPORT_SYMBOL_GPL(netback_connect_accelerator);
++
++
++/* 
++ * Disconnect an accelerator plugin module that has previously been
++ * connected.
++ */
++void netback_disconnect_accelerator(int id, const char *eth_name)
++{
++      struct netback_accelerator *cur, *tmp;
++
++      mutex_lock(&accelerators_mutex);
++      list_for_each_entry_safe(cur, tmp, &accelerators_list, link) {
++              /* Entries are looked up by interface name; 'id' is unused. */
++              if (strcmp(eth_name, cur->eth_name) != 0)
++                      continue;
++
++              /* Detach every backend that is still using this plugin. */
++              xenbus_for_each_backend(cur,
++                                      netback_accelerator_remove_backend);
++              BUG_ON(atomic_read(&cur->use_count) != 0);
++
++              list_del(&cur->link);
++              kfree(cur->eth_name);
++              kfree(cur);
++              break;
++      }
++      mutex_unlock(&accelerators_mutex);
++}
++EXPORT_SYMBOL_GPL(netback_disconnect_accelerator);
++
++
++/*
++ * Walk the registered accelerators and attach the first one that matches
++ * backend 'be' / device 'dev' and whose module reference can be taken.
++ */
++void netback_probe_accelerators(struct backend_info *be,
++                              struct xenbus_device *dev)
++{
++      struct netback_accelerator *candidate;
++
++      mutex_lock(&accelerators_mutex);
++      list_for_each_entry(candidate, &accelerators_list, link) {
++              if (!match_accelerator(dev, be, candidate))
++                      continue;
++              if (!try_module_get(candidate->hooks->owner))
++                      continue;
++
++              /* Stop at the first candidate, whether or not probe works. */
++              do_probe(be, candidate, dev);
++              break;
++      }
++      mutex_unlock(&accelerators_mutex);
++}
++
++
++/*
++ * Detach the accelerator (if any) from backend 'be': call the plugin's
++ * remove hook for 'dev', then drop the use count and module reference.
++ */
++void netback_remove_accelerators(struct backend_info *be,
++                               struct xenbus_device *dev)
++{
++      mutex_lock(&accelerators_mutex);
++
++      if (be->accelerator == NULL)
++              goto unlock;
++
++      be->accelerator->hooks->remove(dev);
++      atomic_dec(&be->accelerator->use_count);
++      module_put(be->accelerator->hooks->owner);
++      be->accelerator = NULL;
++
++unlock:
++      mutex_unlock(&accelerators_mutex);
++}
++
++
++/*
++ * Initialise the (initially empty) list of accelerator plugins.  Must run
++ * before any of the functions above touch accelerators_list.
++ */
++void netif_accel_init(void)
++{
++      INIT_LIST_HEAD(&accelerators_list);
++}
diff --cc drivers/xen/netback/common.h

index 0000000,0000000..e787550

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netback/common.h
@@@ -1,0 -1,0 +1,297 @@@
++/******************************************************************************
++ * arch/xen/drivers/netif/backend/common.h
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __NETIF__BACKEND__COMMON_H__
++#define __NETIF__BACKEND__COMMON_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/ip.h>
++#include <linux/in.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++#include <linux/wait.h>
++#include <xen/interface/io/netif.h>
++#include <xen/xenbus.h>
++#include <xen/interface/event_channel.h>
++
++#define DPRINTK(_f, _a...)                    \
++      pr_debug("(file=%s, line=%d) " _f,      \
++               __FILE__ , __LINE__ , ## _a )
++#define IPRINTK(fmt, args...) pr_info("xen_net: " fmt, ##args)
++#define WPRINTK(fmt, args...) pr_warning("xen_net: " fmt, ##args)
++
++/*
++ * Per-interface backend state.  An instance lives in the private area of
++ * the interface's net_device (it is what netdev_priv() returns for it).
++ */
++typedef struct netif_st {
++      /* Unique identifier for this interface. */
++      domid_t          domid;
++      /* Index into xen_netbk[] while the interface is up, else UINT_MAX. */
++      unsigned int     group;
++      unsigned int     handle;
++
++      u8               fe_dev_addr[6];
++
++      /* IRQ bound to the event channel; 0 means not connected yet. */
++      unsigned int     irq;
++
++      /* The shared rings and indexes. */
++      netif_tx_back_ring_t tx;
++      netif_rx_back_ring_t rx;
++      struct vm_struct *tx_comms_area;
++      struct vm_struct *rx_comms_area;
++
++      /* Flags that must not be set in dev->features */
++      int features_disabled;
++
++      /* Frontend feature information. */
++      u8 can_sg:1;
++      u8 gso:1;
++      u8 csum:1;
++
++      /* Internal feature information. */
++      u8 can_queue:1; /* can queue packets for receiver? */
++      u8 copying_receiver:1;  /* copy packets to receiver?       */
++
++      /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
++      RING_IDX rx_req_cons_peek;
++
++      /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
++      unsigned long   credit_bytes;
++      unsigned long   credit_usec;
++      unsigned long   remaining_credit;
++      struct timer_list credit_timeout;
++
++      /* Enforce draining of the transmit queue. */
++      struct timer_list tx_queue_timeout;
++
++      /*
++       * Statistics.  These are reported through ethtool by indexing this
++       * structure as an array of unsigned long, so they must stay
++       * unsigned long.
++       */
++      unsigned long nr_copied_skbs;
++      unsigned long rx_gso_csum_fixups;
++
++      /* Miscellaneous private stuff. */
++      struct list_head list;  /* scheduling list */
++      atomic_t         refcnt;
++      struct net_device *dev;
++
++      /* Driver-private carrier flag, see netback_carrier_*() below. */
++      unsigned int carrier;
++
++      /* Woken by netif_put() when refcnt drops to zero. */
++      wait_queue_head_t waiting_to_free;
++} netif_t;
++
++/*
++ * Implement our own carrier flag: the network stack's version causes delays
++ * when the carrier is re-enabled (in particular, dev_activate() may not
++ * immediately be called, which can cause packet loss; also the etherbridge
++ * can be rather lazy in activating its port).
++ */
++#define netback_carrier_on(netif)     ((netif)->carrier = 1)
++#define netback_carrier_off(netif)    ((netif)->carrier = 0)
++#define netback_carrier_ok(netif)     ((netif)->carrier)
++
++enum {
++      NETBK_DONT_COPY_SKB,
++      NETBK_DELAYED_COPY_SKB,
++      NETBK_ALWAYS_COPY_SKB,
++};
++
++extern int netbk_copy_skb_mode;
++
++/* Function pointers into netback accelerator plugin modules */
++struct netback_accel_hooks {
++      /* Module providing the hooks; pinned while a backend uses them. */
++      struct module *owner;
++      /* Called when a backend is offered to the plugin; 0 means accepted. */
++      int  (*probe)(struct xenbus_device *dev);
++      /* Called when a backend is taken away from the plugin. */
++      int (*remove)(struct xenbus_device *dev);
++};
++
++/* Structure to track the state of a netback accelerator plugin */
++struct netback_accelerator {
++      /* Entry in the list of registered accelerators. */
++      struct list_head link;
++      /* Identifier supplied by the plugin at connect time. */
++      int id;
++      /* Heap-allocated copy of the accelerated interface's name. */
++      char *eth_name;
++      /* Number of backends currently attached to this accelerator. */
++      atomic_t use_count;
++      /* Callbacks supplied by the plugin (pointer is not copied). */
++      struct netback_accel_hooks *hooks;
++};
++
++/* Per-xenbus-device backend state (the "vif" device's driver data). */
++struct backend_info {
++      struct xenbus_device *dev;
++      netif_t *netif;
++      enum xenbus_state frontend_state;
++
++      /* State relating to the netback accelerator */
++      void *netback_accel_priv;
++      /* The accelerator that this backend is currently using */
++      struct netback_accelerator *accelerator;
++};
++
++#define NETBACK_ACCEL_VERSION 0x00010001
++
++/* 
++ * Connect an accelerator plugin module to netback.  Returns zero on
++ * success, < 0 on error, > 0 (with highest version number supported)
++ * if version mismatch.
++ */
++extern int netback_connect_accelerator(unsigned version,
++                                     int id, const char *eth_name, 
++                                     struct netback_accel_hooks *hooks);
++/* Disconnect a previously connected accelerator plugin module */
++extern void netback_disconnect_accelerator(int id, const char *eth_name);
++
++
++extern
++void netback_probe_accelerators(struct backend_info *be,
++                              struct xenbus_device *dev);
++extern
++void netback_remove_accelerators(struct backend_info *be,
++                               struct xenbus_device *dev);
++extern
++void netif_accel_init(void);
++
++
++#define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGE_SIZE)
++#define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGE_SIZE)
++
++void netif_disconnect(struct backend_info *be);
++
++void netif_set_features(netif_t *netif);
++netif_t *netif_alloc(struct device *parent, domid_t domid, unsigned int handle);
++int netif_map(struct backend_info *be, grant_ref_t tx_ring_ref,
++            grant_ref_t rx_ring_ref, evtchn_port_t evtchn);
++
++/*
++ * Reference counting on a netif_t.  netif_get() takes a reference;
++ * netif_put() drops one and, when the count reaches zero, wakes anyone
++ * sleeping on waiting_to_free.
++ */
++#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
++#define netif_put(_b)                                         \
++      do {                                                    \
++              if ( atomic_dec_and_test(&(_b)->refcnt) )       \
++                      wake_up(&(_b)->waiting_to_free);        \
++      } while (0)
++
++void netif_xenbus_init(void);
++
++#define netif_schedulable(netif)                              \
++      (netif_running((netif)->dev) && netback_carrier_ok(netif))
++
++void netif_schedule_work(netif_t *netif);
++void netif_deschedule_work(netif_t *netif);
++
++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
++irqreturn_t netif_be_int(int irq, void *dev_id);
++
++/* Return the can_queue flag of the interface behind 'dev'. */
++static inline int netbk_can_queue(struct net_device *dev)
++{
++      return ((netif_t *)netdev_priv(dev))->can_queue;
++}
++
++/* Return the can_sg flag of the interface behind 'dev'. */
++static inline int netbk_can_sg(struct net_device *dev)
++{
++      return ((netif_t *)netdev_priv(dev))->can_sg;
++}
++
++/* A transmit request together with the interface it belongs to. */
++struct pending_tx_info {
++      netif_tx_request_t req;
++      netif_t *netif;
++};
++typedef unsigned int pending_ring_idx_t;
++
++/*
++ * Per-slot receive bookkeeping (one entry per rx ring slot in
++ * struct xen_netbk).
++ */
++struct netbk_rx_meta {
++      skb_frag_t frag;
++      int id;
++      /* NOTE(review): presumably set when the slot is grant-copied - confirm. */
++      u8 copy:1;
++};
++
++/* List node plus allocation timestamp for an in-use pending tx slot. */
++struct netbk_tx_pending_inuse {
++      struct list_head list;
++      /* NOTE(review): presumably a jiffies value - confirm against users. */
++      unsigned long alloc_time;
++};
++
++#define MAX_PENDING_REQS (1U << CONFIG_XEN_NETDEV_TX_SHIFT)
++#define MAX_MFN_ALLOC 64
++
++/*
++ * State of one netback processing group.  'xen_netbk' (declared below)
++ * points at an array of these with 'netbk_nr_groups' elements; interfaces
++ * are assigned to a group when they are brought up.
++ */
++struct xen_netbk {
++      /*
++       * NOTE(review): the two arms look like alternative execution
++       * contexts (tasklets vs. a kernel thread); only one can be in use
++       * at a time since they share storage - confirm against netback.c.
++       */
++      union {
++              struct {
++                      struct tasklet_struct net_tx_tasklet;
++                      struct tasklet_struct net_rx_tasklet;
++              };
++              struct {
++                      wait_queue_head_t netbk_action_wq;
++                      struct task_struct *task;
++              };
++      };
++
++      struct sk_buff_head rx_queue;
++      struct sk_buff_head tx_queue;
++
++      struct timer_list net_timer;
++      struct timer_list tx_pending_timer;
++
++      pending_ring_idx_t pending_prod;
++      pending_ring_idx_t pending_cons;
++      pending_ring_idx_t dealloc_prod;
++      pending_ring_idx_t dealloc_cons;
++
++      struct list_head pending_inuse_head;
++      struct list_head schedule_list;
++
++      spinlock_t schedule_list_lock;
++      spinlock_t release_lock;
++
++      struct page **mmap_pages;
++
++      /* Number of interfaces currently assigned to this group. */
++      atomic_t nr_groups;
++      unsigned int alloc_index;
++
++      /* Per-pending-request tables, MAX_PENDING_REQS entries each. */
++      struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
++      struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
++      struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
++      struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
++
++      grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
++      u16 pending_ring[MAX_PENDING_REQS];
++      u16 dealloc_ring[MAX_PENDING_REQS];
++
++      /* Receive-side scratch arrays, sized by the rx ring. */
++      struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
++      struct mmu_update rx_mmu[NET_RX_RING_SIZE];
++      struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
++      struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
++      DECLARE_BITMAP(rx_notify, NR_DYNIRQS);
++      /* Element type is chosen so that it can hold any value below NR_DYNIRQS. */
++#if !defined(NR_DYNIRQS)
++# error
++#elif NR_DYNIRQS <= 0x10000
++      u16 notify_list[NET_RX_RING_SIZE];
++#else
++      int notify_list[NET_RX_RING_SIZE];
++#endif
++      struct netbk_rx_meta meta[NET_RX_RING_SIZE];
++
++      unsigned long mfn_list[MAX_MFN_ALLOC];
++};
++
++extern struct xen_netbk *xen_netbk;
++extern unsigned int netbk_nr_groups;
++
++#endif /* __NETIF__BACKEND__COMMON_H__ */
diff --cc drivers/xen/netback/interface.c

index 0000000,0000000..a58454e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netback/interface.c
@@@ -1,0 -1,0 +1,414 @@@
++/******************************************************************************
++ * arch/xen/drivers/netif/backend/interface.c
++ * 
++ * Network-device interface management.
++ * 
++ * Copyright (c) 2004-2005, Keir Fraser
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++#include <linux/ethtool.h>
++#include <linux/rtnetlink.h>
++#include <linux/delay.h>
++#include <linux/vmalloc.h>
++#include <xen/evtchn.h>
++
++/*
++ * Module parameter 'queue_length':
++ * 
++ * Enables queuing in the network stack when a client has run out of receive
++ * descriptors. Although this feature can improve receive bandwidth by avoiding
++ * packet loss, it can also result in packets sitting in the 'tx_queue' for
++ * unbounded time. This is bad if those packets hold onto foreign resources.
++ * For example, consider a packet that holds onto resources belonging to the
++ * guest for which it is queued (e.g., packet received on vif1.0, destined for
++ * vif1.1 which is not activated in the guest): in this situation the guest
++ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
++ * run a timer (tx_queue_timeout) to drain the queue when the interface is
++ * blocked.
++ */
++static unsigned long netbk_queue_length = 32;
++module_param_named(queue_length, netbk_queue_length, ulong, 0644);
++
++/*
++ * Assign the interface to the least loaded netback group, then enable its
++ * interrupt and schedule it for processing.
++ */
++static void __netif_up(netif_t *netif)
++{
++      unsigned int best = 0;
++      unsigned int best_load = atomic_read(&xen_netbk[0].nr_groups);
++      unsigned int idx;
++
++      /* Ties keep the lowest-numbered group: only a strictly smaller
++       * count replaces the current choice. */
++      for (idx = 1; idx < netbk_nr_groups; idx++) {
++              unsigned int load = atomic_read(&xen_netbk[idx].nr_groups);
++
++              if (load >= best_load)
++                      continue;
++
++              best = idx;
++              best_load = load;
++      }
++
++      atomic_inc(&xen_netbk[best].nr_groups);
++      netif->group = best;
++
++      enable_irq(netif->irq);
++      netif_schedule_work(netif);
++}
++
++/*
++ * Undo __netif_up(): disable the interrupt, deschedule the interface and
++ * release its slot in the group it was assigned to.
++ */
++static void __netif_down(netif_t *netif)
++{
++      /* Resolve the group first; netif->group is invalidated below. */
++      struct xen_netbk *owner = &xen_netbk[netif->group];
++
++      disable_irq(netif->irq);
++      netif_deschedule_work(netif);
++
++      netif->group = UINT_MAX;
++      atomic_dec(&owner->nr_groups);
++}
++
++/*
++ * ndo_open: if the private carrier flag is set, bring the interface up and
++ * start its queue.  Otherwise do nothing.  Always returns 0.
++ */
++static int net_open(struct net_device *dev)
++{
++      netif_t *netif = netdev_priv(dev);
++
++      if (!netback_carrier_ok(netif))
++              return 0;
++
++      __netif_up(netif);
++      netif_start_queue(dev);
++      return 0;
++}
++
++/*
++ * ndo_stop: take the interface down if the private carrier flag is set,
++ * and stop the queue in either case.  Always returns 0.
++ */
++static int net_close(struct net_device *dev)
++{
++      netif_t *netif = netdev_priv(dev);
++      const int connected = netback_carrier_ok(netif);
++
++      if (connected)
++              __netif_down(netif);
++
++      netif_stop_queue(dev);
++      return 0;
++}
++
++/*
++ * ndo_change_mtu: set a new MTU on the interface.
++ *
++ * The upper limit is 65535 - ETH_HLEN when scatter-gather is available
++ * and ETH_DATA_LEN otherwise.  Values below 68 (the smallest MTU an IPv4
++ * link must support, and the limit eth_change_mtu() enforces) are
++ * rejected too, so zero or negative values can no longer be installed.
++ *
++ * Returns 0 on success or -EINVAL when 'mtu' is out of range.
++ */
++static int netbk_change_mtu(struct net_device *dev, int mtu)
++{
++      int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
++
++      if (mtu < 68 || mtu > max)
++              return -EINVAL;
++      dev->mtu = mtu;
++      return 0;
++}
++
++/*
++ * Recompute dev->features: add what the frontend supports, remove what has
++ * been disabled, and clamp the MTU when scatter-gather ends up off.
++ */
++void netif_set_features(netif_t *netif)
++{
++      struct net_device *dev = netif->dev;
++      int wanted = dev->features;
++
++      /* Offloads the frontend advertised. */
++      wanted |= netif->can_sg ? NETIF_F_SG : 0;
++      wanted |= netif->gso ? NETIF_F_TSO : 0;
++      wanted |= netif->csum ? NETIF_F_IP_CSUM : 0;
++
++      /* Disabled features always win. */
++      wanted &= ~(netif->features_disabled);
++
++      /* Without SG the MTU may not exceed a standard Ethernet payload. */
++      if (dev->mtu > ETH_DATA_LEN && !(wanted & NETIF_F_SG))
++              dev->mtu = ETH_DATA_LEN;
++
++      dev->features = wanted;
++}
++
++/*
++ * ethtool set_tx_csum: enable (data != 0) or disable TX checksum offload.
++ * Enabling fails with -ENOSYS when the frontend lacks checksum support.
++ */
++static int netbk_set_tx_csum(struct net_device *dev, u32 data)
++{
++      netif_t *netif = netdev_priv(dev);
++
++      if (!data) {
++              netif->features_disabled |= NETIF_F_IP_CSUM;
++      } else if (netif->csum) {
++              netif->features_disabled &= ~NETIF_F_IP_CSUM;
++      } else {
++              return -ENOSYS;
++      }
++
++      netif_set_features(netif);
++      return 0;
++}
++
++/*
++ * ethtool set_sg: enable (data != 0) or disable scatter-gather.
++ * Enabling fails with -ENOSYS when the frontend cannot do SG.
++ */
++static int netbk_set_sg(struct net_device *dev, u32 data)
++{
++      netif_t *netif = netdev_priv(dev);
++
++      if (!data) {
++              netif->features_disabled |= NETIF_F_SG;
++      } else if (netif->can_sg) {
++              netif->features_disabled &= ~NETIF_F_SG;
++      } else {
++              return -ENOSYS;
++      }
++
++      netif_set_features(netif);
++      return 0;
++}
++
++/*
++ * ethtool set_tso: enable (data != 0) or disable TSO.
++ * Enabling fails with -ENOSYS when the frontend lacks GSO support.
++ */
++static int netbk_set_tso(struct net_device *dev, u32 data)
++{
++      netif_t *netif = netdev_priv(dev);
++
++      if (!data) {
++              netif->features_disabled |= NETIF_F_TSO;
++      } else if (netif->gso) {
++              netif->features_disabled &= ~NETIF_F_TSO;
++      } else {
++              return -ENOSYS;
++      }
++
++      netif_set_features(netif);
++      return 0;
++}
++
++/*
++ * ethtool get_drvinfo: report the driver name and the parent device's
++ * name as bus info.  Both destinations are fixed-size arrays in
++ * struct ethtool_drvinfo, so the copies are bounded with strlcpy() to
++ * truncate (and still NUL-terminate) rather than overflow on a long name.
++ */
++static void netbk_get_drvinfo(struct net_device *dev,
++                            struct ethtool_drvinfo *info)
++{
++      strlcpy(info->driver, "netbk", sizeof(info->driver));
++      strlcpy(info->bus_info, dev_name(dev->dev.parent),
++              sizeof(info->bus_info));
++}
++
++/*
++ * Table of driver-specific ethtool statistics.  'offset' is the position
++ * of the counter inside netif_t expressed in units of sizeof(long), i.e.
++ * an index usable when the structure is viewed as an array of long.
++ */
++static const struct netif_stat {
++      char name[ETH_GSTRING_LEN];
++      u16 offset;
++} netbk_stats[] = {
++      { "copied_skbs", offsetof(netif_t, nr_copied_skbs) / sizeof(long) },
++      { "rx_gso_csum_fixups", offsetof(netif_t, rx_gso_csum_fixups) / sizeof(long) },
++};
++
++/*
++ * ethtool get_sset_count: number of entries in string set 'sset'.
++ * Only ETH_SS_STATS is supported; anything else yields -EOPNOTSUPP.
++ */
++static int netbk_get_sset_count(struct net_device *dev, int sset)
++{
++      if (sset == ETH_SS_STATS)
++              return ARRAY_SIZE(netbk_stats);
++
++      return -EOPNOTSUPP;
++}
++
++/*
++ * ethtool get_ethtool_stats: copy every counter listed in netbk_stats[]
++ * into 'data', in table order.
++ */
++static void netbk_get_ethtool_stats(struct net_device *dev,
++                                 struct ethtool_stats *stats, u64 * data)
++{
++      /* View the private area as an array of longs, as the table expects. */
++      unsigned long *words = netdev_priv(dev);
++      const struct netif_stat *stat = netbk_stats;
++      const struct netif_stat *end = netbk_stats + ARRAY_SIZE(netbk_stats);
++
++      for (; stat != end; stat++)
++              *data++ = words[stat->offset];
++}
++
++/*
++ * ethtool get_strings: for ETH_SS_STATS, emit the statistic names as
++ * consecutive ETH_GSTRING_LEN-sized records.  Other sets are ignored.
++ */
++static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data)
++{
++      int i;
++
++      if (stringset != ETH_SS_STATS)
++              return;
++
++      for (i = 0; i < ARRAY_SIZE(netbk_stats); i++) {
++              memcpy(data, netbk_stats[i].name, ETH_GSTRING_LEN);
++              data += ETH_GSTRING_LEN;
++      }
++}
++
++/*
++ * ethtool operations for backend interfaces.  Getters use the generic
++ * ethtool_op_* helpers; the setters are driver-specific because enabling
++ * an offload depends on what the frontend supports.
++ */
++static const struct ethtool_ops network_ethtool_ops =
++{
++      .get_drvinfo = netbk_get_drvinfo,
++
++      .get_tx_csum = ethtool_op_get_tx_csum,
++      .set_tx_csum = netbk_set_tx_csum,
++      .get_sg = ethtool_op_get_sg,
++      .set_sg = netbk_set_sg,
++      .get_tso = ethtool_op_get_tso,
++      .set_tso = netbk_set_tso,
++      .get_link = ethtool_op_get_link,
++
++      .get_sset_count = netbk_get_sset_count,
++      .get_ethtool_stats = netbk_get_ethtool_stats,
++      .get_strings = netbk_get_strings,
++};
++
++/* net_device callbacks installed on every backend interface. */
++static const struct net_device_ops netif_be_netdev_ops = {
++      .ndo_open               = net_open,
++      .ndo_stop               = net_close,
++      .ndo_start_xmit         = netif_be_start_xmit,
++      .ndo_change_mtu         = netbk_change_mtu,
++      .ndo_set_mac_address    = eth_mac_addr,
++      .ndo_validate_addr      = eth_validate_addr,
++};
++
++/*
++ * Allocate and register the net_device for a backend interface.
++ *
++ * parent: device the new net_device is parented to.
++ * domid:  frontend domain id; used in the interface name.
++ * handle: interface handle within that domain; used in the name.
++ *
++ * The device is named "vif<domid>.<handle>" and its private area holds the
++ * netif_t.  Returns the netif_t on success, or an ERR_PTR() carrying
++ * -ENOMEM or the register_netdevice() error.
++ */
++netif_t *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
++{
++      int err = 0;
++      struct net_device *dev;
++      netif_t *netif;
++      char name[IFNAMSIZ] = {};
++
++      snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
++      dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
++      if (dev == NULL) {
++              DPRINTK("Could not create netif: out of memory\n");
++              return ERR_PTR(-ENOMEM);
++      }
++
++      SET_NETDEV_DEV(dev, parent);
++
++      netif = netdev_priv(dev);
++      netif->domid  = domid;
++      /* Not assigned to any netback group until brought up. */
++      netif->group = UINT_MAX;
++      netif->handle = handle;
++      netif->can_sg = 1;
++      netif->csum = 1;
++      /* Initial reference; netif_disconnect() drops it. */
++      atomic_set(&netif->refcnt, 1);
++      init_waitqueue_head(&netif->waiting_to_free);
++      netif->dev = dev;
++
++      netback_carrier_off(netif);
++
++      /* Start with the largest possible credit and a zero-length window. */
++      netif->credit_bytes = netif->remaining_credit = ~0UL;
++      netif->credit_usec  = 0UL;
++      init_timer(&netif->credit_timeout);
++      /* Initialize 'expires' now: it's used to track the credit window. */
++      netif->credit_timeout.expires = jiffies;
++
++      init_timer(&netif->tx_queue_timeout);
++
++      dev->netdev_ops = &netif_be_netdev_ops;
++
++      /* Needs netif->dev and the can_sg/csum flags set above. */
++      netif_set_features(netif);
++
++      SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
++
++      dev->tx_queue_len = netbk_queue_length;
++
++      /*
++       * Initialise a dummy MAC address. We choose the numerically
++       * largest non-broadcast address to prevent the address getting
++       * stolen by an Ethernet bridge for STP purposes.
++       * (FE:FF:FF:FF:FF:FF)
++       */ 
++      memset(dev->dev_addr, 0xFF, ETH_ALEN);
++      dev->dev_addr[0] &= ~0x01;
++
++      rtnl_lock();
++      err = register_netdevice(dev);
++      rtnl_unlock();
++      if (err) {
++              DPRINTK("Could not register new net device %s: err=%d\n",
++                      dev->name, err);
++              free_netdev(dev);
++              return ERR_PTR(err);
++      }
++
++      DPRINTK("Successfully created netif\n");
++      return netif;
++}
++
++/*
++ * Connect the backend to its frontend: map the shared tx and rx rings,
++ * bind the event channel to an interrupt handler and mark the interface
++ * as having carrier.
++ *
++ * be:          backend whose netif is being connected.
++ * tx_ring_ref: grant reference of the frontend's tx ring page.
++ * rx_ring_ref: grant reference of the frontend's rx ring page.
++ * evtchn:      frontend event channel port to bind to.
++ *
++ * Returns 0 on success (or if already connected), otherwise the negative
++ * error from the failed mapping or binding step; anything mapped so far
++ * is unmapped again before returning an error.
++ */
++int netif_map(struct backend_info *be, grant_ref_t tx_ring_ref,
++            grant_ref_t rx_ring_ref, evtchn_port_t evtchn)
++{
++      netif_t *netif = be->netif;
++      struct vm_struct *area;
++      int err = -ENOMEM;
++      netif_tx_sring_t *txs;
++      netif_rx_sring_t *rxs;
++
++      /* Already connected through? */
++      if (netif->irq)
++              return 0;
++
++      area = xenbus_map_ring_valloc(be->dev, tx_ring_ref);
++      if (IS_ERR(area))
++              return PTR_ERR(area);
++      netif->tx_comms_area = area;
++      area = xenbus_map_ring_valloc(be->dev, rx_ring_ref);
++      if (IS_ERR(area)) {
++              err = PTR_ERR(area);
++              goto err_rx;
++      }
++      netif->rx_comms_area = area;
++
++      /* On success the return value is the IRQ number. */
++      err = bind_interdomain_evtchn_to_irqhandler(
++              netif->domid, evtchn, netif_be_int, 0,
++              netif->dev->name, netif);
++      if (err < 0)
++              goto err_hypervisor;
++      BUG_ON(err < DYNIRQ_BASE || err >= DYNIRQ_BASE + NR_DYNIRQS);
++      netif->irq = err;
++      /* Keep the IRQ off until __netif_up() enables it. */
++      disable_irq(netif->irq);
++
++      txs = (netif_tx_sring_t *)netif->tx_comms_area->addr;
++      BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
++
++      rxs = (netif_rx_sring_t *)
++              ((char *)netif->rx_comms_area->addr);
++      BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
++
++      netif->rx_req_cons_peek = 0;
++
++      /* Reference held while connected; netif_disconnect() puts it. */
++      netif_get(netif);
++
++      rtnl_lock();
++      netback_carrier_on(netif);
++      if (netif_running(netif->dev))
++              __netif_up(netif);
++      rtnl_unlock();
++
++      return 0;
++      /* Unwind in reverse order of acquisition. */
++err_hypervisor:
++      xenbus_unmap_ring_vfree(be->dev, netif->rx_comms_area);
++err_rx:
++      xenbus_unmap_ring_vfree(be->dev, netif->tx_comms_area);
++      return err;
++}
++
++/*
++ * Tear down a backend interface completely: take it off-line, wait for
++ * all references to go away, release the IRQ and shared rings, and
++ * unregister and free the net_device (which also frees the netif_t).
++ */
++void netif_disconnect(struct backend_info *be)
++{
++      netif_t *netif = be->netif;
++
++      if (netback_carrier_ok(netif)) {
++              rtnl_lock();
++              netback_carrier_off(netif);
++              netif_carrier_off(netif->dev); /* discard queued packets */
++              if (netif_running(netif->dev))
++                      __netif_down(netif);
++              rtnl_unlock();
++              /* Drop the reference taken when the interface was connected. */
++              netif_put(netif);
++      }
++
++      /* Drop the initial reference, then wait for the remaining users. */
++      atomic_dec(&netif->refcnt);
++      wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
++
++      del_timer_sync(&netif->credit_timeout);
++      del_timer_sync(&netif->tx_queue_timeout);
++
++      if (netif->irq)
++              unbind_from_irqhandler(netif->irq, netif);
++      
++      unregister_netdev(netif->dev);
++
++      /* tx.sring is only set once both rings were mapped successfully. */
++      if (netif->tx.sring) {
++              xenbus_unmap_ring_vfree(be->dev, netif->tx_comms_area);
++              xenbus_unmap_ring_vfree(be->dev, netif->rx_comms_area);
++      }
++
++      free_netdev(netif->dev);
++}
diff --cc drivers/xen/netback/loopback.c

index 0000000,0000000..8d8ef00

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netback/loopback.c
@@@ -1,0 -1,0 +1,294 @@@
++/******************************************************************************
++ * netback/loopback.c
++ * 
++ * A two-interface loopback device to emulate a local netfront-netback
++ * connection. This ensures that local packet delivery looks identical
++ * to inter-domain delivery. Most importantly, packets delivered locally
++ * originating from other domains will get *copied* when they traverse this
++ * driver. This prevents unbounded delays in socket-buffer queues from
++ * causing the netback driver to "seize up".
++ * 
++ * This driver creates a symmetric pair of loopback interfaces with names
++ * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet
++ * bridge, just like a proper netback interface, while a local IP interface
++ * is configured on 'veth0'.
++ * 
++ * As with a real netback interface, vif0.0 is configured with a suitable
++ * dummy MAC address. No default is provided for veth0: a reasonable strategy
++ * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address
++ * (to avoid confusing the Etherbridge).
++ * 
++ * Copyright (c) 2005 K A Fraser
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/module.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/etherdevice.h>
++#include <linux/skbuff.h>
++#include <linux/slab.h>       /* kcalloc(), kfree() */
++#include <linux/ethtool.h>
++#include <net/dst.h>
++#include <net/xfrm.h>         /* secpath_reset() */
++#include <asm/hypervisor.h>   /* is_initial_xendomain() */
++#include <../net/core/kmap_skb.h> /* k{,un}map_skb_frag() */
++
++static int nloopbacks = -1; /* -1 means "not set": loopback_init() then picks 4 in the initial Xen domain and 0 elsewhere */
++module_param(nloopbacks, int, 0);
++MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create");
++
++struct net_private { /* private data of one end of a loopback pair */
++      struct net_device *loopback_dev; /* the other end of the pair; packets sent here are received there */
++      int loop_idx; /* index of the pair, as passed to loopback_construct() */
++};
++
++/* Return the driver-private area that belongs to @dev. */
++static inline struct net_private *loopback_priv(struct net_device *dev)
++{
++      struct net_private *np = netdev_priv(dev);
++
++      return np;
++}
++
++/*
++ * Open hook: reset the device statistics and allow the stack to start
++ * handing packets to the transmit routine. Always returns 0.
++ */
++static int loopback_open(struct net_device *dev)
++{
++      struct net_device_stats *stats = &dev->stats;
++
++      memset(stats, 0, sizeof(*stats));
++      netif_start_queue(dev);
++
++      return 0;
++}
++
++/* Stop hook: stop the transmit queue. Always returns 0. */
++static int loopback_close(struct net_device *dev)
++{
++      netif_stop_queue(dev);
++
++      return 0;
++}
++
++#ifdef CONFIG_X86
++/*
++ * Return 1 if the frame behind @pfn has to be treated as foreign, 0
++ * otherwise. With an auto-translated physmap every frame is reported as
++ * foreign; otherwise the FOREIGN_FRAME_BIT of its phys-to-machine entry
++ * decides.
++ */
++static int is_foreign(unsigned long pfn)
++{
++      /* NB. Play it safe for auto-translation mode. */
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              return 1;
++
++      return (phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT) != 0;
++}
++#else
++/* How to detect a foreign mapping? Play it safe. */
++#define is_foreign(pfn)       (1)
++#endif
++
++/*
++ * Replace every page fragment of @skb that is_foreign() reports as foreign
++ * with a freshly allocated page holding a copy of the fragment data (at the
++ * same page offset).
++ *
++ * Returns 1 on success. Returns 0 when un-cloning the skb or allocating a
++ * replacement page fails; fragments replaced before the failure stay
++ * replaced. The skb must not carry a frag_list.
++ */
++static int skb_remove_foreign_references(struct sk_buff *skb)
++{
++      struct skb_shared_info *shinfo;
++      int idx;
++
++      BUG_ON(skb_shinfo(skb)->frag_list);
++
++      if (skb_cloned(skb) &&
++          unlikely(pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
++              return 0;
++
++      /* Look the shared info up only now: it is fetched after any un-clone. */
++      shinfo = skb_shinfo(skb);
++
++      for (idx = 0; idx < shinfo->nr_frags; idx++) {
++              skb_frag_t *frag = &shinfo->frags[idx];
++              struct page *copy;
++              char *src;
++              int off;
++
++              if (!is_foreign(page_to_pfn(frag->page)))
++                      continue;
++
++              copy = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
++              if (unlikely(!copy))
++                      return 0;
++
++              src = kmap_skb_frag(frag);
++              off = frag->page_offset;
++              memcpy(page_address(copy) + off, src + off, frag->size);
++              kunmap_skb_frag(src);
++
++              /* Swap the pages: drop the old reference, install the copy. */
++              put_page(frag->page);
++              frag->page = copy;
++      }
++
++      return 1;
++}
++
++/*
++ * Transmit hook: deliver @skb to the other end of the loopback pair as if
++ * it had just been received there.
++ *
++ * Foreign page fragments are replaced by local copies first; if that fails
++ * the packet is freed and counted in tx_dropped. Otherwise the skb is
++ * stripped of its route, owner, netfilter and secpath state, accounted as
++ * transmitted on @dev and received on the peer, and passed to netif_rx().
++ *
++ * Always returns NETDEV_TX_OK.
++ */
++static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++      struct net_device *peer;
++
++      if (!skb_remove_foreign_references(skb)) {
++              dev->stats.tx_dropped++;
++              dev_kfree_skb(skb);
++              return NETDEV_TX_OK;
++      }
++
++      /*
++       * Use the helper instead of an open-coded dst_release(): it clears
++       * the skb's dst and only drops a reference when one is actually
++       * held, so a dst attached without a reference is handled correctly.
++       */
++      skb_dst_drop(skb);
++
++      skb_orphan(skb);
++
++      dev->stats.tx_bytes += skb->len;
++      dev->stats.tx_packets++;
++
++      /* Switch to loopback context. */
++      peer = loopback_priv(dev)->loopback_dev;
++
++      /* Account the length before eth_type_trans() gets to touch the skb. */
++      peer->stats.rx_bytes += skb->len;
++      peer->stats.rx_packets++;
++
++      skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
++      skb->protocol = eth_type_trans(skb, peer);
++
++      /* Flush netfilter context: rx'ed skbuffs not expected to have any. */
++      nf_reset(skb);
++      secpath_reset(skb);
++
++      netif_rx(skb);
++
++      return NETDEV_TX_OK;
++}
++
++/*
++ * ethtool get_drvinfo hook: report the driver name "netloop" and a bus
++ * string of the form "vif-0-<pair index>".
++ */
++static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
++{
++      int pair = loopback_priv(dev)->loop_idx;
++
++      strlcpy(info->driver, "netloop", sizeof(info->driver));
++      snprintf(info->bus_info, ETHTOOL_BUSINFO_LEN, "vif-0-%d", pair);
++}
++
++static const struct ethtool_ops network_ethtool_ops = /* installed on both ends of every pair by loopback_construct() */
++{
++      .get_drvinfo = get_drvinfo,
++
++      .get_tx_csum = ethtool_op_get_tx_csum, /* everything from here on is a generic ethtool_op_* helper */
++      .set_tx_csum = ethtool_op_set_tx_csum,
++      .get_sg = ethtool_op_get_sg,
++      .set_sg = ethtool_op_set_sg,
++      .get_tso = ethtool_op_get_tso,
++      .set_tso = ethtool_op_set_tso,
++      .get_link = ethtool_op_get_link,
++};
++
++/*
++ * Multicast-list hook: deliberately a no-op. The virtual interface is
++ * point-to-point and the physical interface is probably promiscuous
++ * anyway, so there is no filter to program.
++ */
++static void loopback_set_multicast_list(struct net_device *dev)
++{
++      /* Nothing to do. */
++}
++
++static const struct net_device_ops loopback_netdev_ops = { /* installed on both ends of every pair by loopback_construct() */
++      .ndo_open               = loopback_open,
++      .ndo_stop               = loopback_close,
++      .ndo_start_xmit         = loopback_start_xmit,
++      .ndo_set_multicast_list = loopback_set_multicast_list,
++      .ndo_change_mtu         = NULL, /* allow arbitrary mtu */
++};
++
++/*
++ * Initialise one end of a loopback pair.
++ *
++ * @dev:      the device being set up
++ * @lo:       its peer, i.e. where packets transmitted on @dev are received
++ * @loop_idx: index of the pair
++ *
++ * Installs the netdev and ethtool operations, disables the transmit queue
++ * and advertises the offload features of the virtual link.
++ */
++static void loopback_construct(struct net_device *dev, struct net_device *lo,
++                             int loop_idx)
++{
++      struct net_private *priv = loopback_priv(dev);
++
++      priv->loop_idx     = loop_idx;
++      priv->loopback_dev = lo;
++
++      dev->tx_queue_len = 0;
++      dev->netdev_ops   = &loopback_netdev_ops;
++      SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
++
++      dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO |
++                      NETIF_F_HIGHDMA | NETIF_F_LLTX;
++
++      /*
++       * The MTU is intentionally left at its default rather than being
++       * raised to a jumbo value (16*1024 was once considered): otherwise
++       * the network stack would send large packets that the Ethernet
++       * bridge drops unless the physical Ethernet interface is configured
++       * for jumbo packets. An administrator who wants a larger MTU can
++       * set it with the 'ifconfig' command.
++       */
++}
++
++/*
++ * Allocate, configure and register loopback pair @i: "vif0.<i>" (the dummy
++ * backend end) and "veth<i>" (the local end).
++ *
++ * On success returns 0 and stores the vif0.<i> device in *@pdev; the veth
++ * end is reachable from it through loopback_priv()->loopback_dev. On
++ * failure returns a negative errno and leaves nothing allocated or
++ * registered.
++ */
++static int __init make_loopback(int i, struct net_device **pdev)
++{
++      struct net_device *dev1, *dev2;
++      char dev_name[IFNAMSIZ];
++      int err = -ENOMEM;
++
++      snprintf(dev_name, sizeof(dev_name), "vif0.%d", i);
++      dev1 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
++      if (!dev1)
++              return err;
++
++      snprintf(dev_name, sizeof(dev_name), "veth%d", i);
++      dev2 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
++      if (!dev2)
++              goto fail_netdev2;
++
++      loopback_construct(dev1, dev2, i);
++      loopback_construct(dev2, dev1, i);
++
++      /*
++       * Initialise a dummy MAC address for the 'dummy backend' interface. We
++       * choose the numerically largest non-broadcast address to prevent the
++       * address getting stolen by an Ethernet bridge for STP purposes.
++       */
++      memset(dev1->dev_addr, 0xFF, ETH_ALEN);
++      dev1->dev_addr[0] &= ~0x01;
++
++      err = register_netdev(dev1);
++      if (err)
++              goto fail;
++
++      err = register_netdev(dev2);
++      if (err) {
++              unregister_netdev(dev1);
++              goto fail;
++      }
++
++      *pdev = dev1;
++      return 0;
++
++ fail:
++      free_netdev(dev2);
++ fail_netdev2:
++      free_netdev(dev1);
++      return err;
++}
++
++/* Unregister and free both ends of a pair set up by make_loopback(). */
++static void __init destroy_loopback(struct net_device *dev1)
++{
++      /* Fetch the peer first: the pointer lives in dev1's private area. */
++      struct net_device *dev2 = loopback_priv(dev1)->loopback_dev;
++
++      unregister_netdev(dev2);
++      unregister_netdev(dev1);
++      free_netdev(dev2);
++      free_netdev(dev1);
++}
++
++/*
++ * Module initialisation: create 'nloopbacks' loopback pairs. When the
++ * parameter was not given, 4 pairs are created in the initial Xen domain
++ * and none elsewhere.
++ *
++ * Creation is all-or-nothing: if any pair fails, the pairs created before
++ * it are unregistered and freed again, so that a failed initialisation
++ * never leaves registered devices behind. Returns 0 on success or a
++ * negative errno.
++ */
++static int __init loopback_init(void)
++{
++      struct net_device **devs;
++      int i, err = 0;
++
++      if (nloopbacks == -1)
++              nloopbacks = is_initial_xendomain() ? 4 : 0;
++
++      if (nloopbacks <= 0)
++              return 0;
++
++      /* Only needed during init, to be able to unwind on failure. */
++      devs = kcalloc(nloopbacks, sizeof(*devs), GFP_KERNEL);
++      if (!devs)
++              return -ENOMEM;
++
++      for (i = 0; i < nloopbacks; i++) {
++              err = make_loopback(i, &devs[i]);
++              if (err)
++                      break;
++      }
++
++      if (err) {
++              /* Pair i itself cleaned up after itself; undo 0..i-1. */
++              while (--i >= 0)
++                      destroy_loopback(devs[i]);
++      }
++
++      kfree(devs);
++      return err;
++}
++
++module_init(loopback_init); /* note: this file registers no module_exit() handler */
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/netback/netback.c

index 0000000,0000000..6627c35

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netback/netback.c
@@@ -1,0 -1,0 +1,1873 @@@
++/******************************************************************************
++ * drivers/xen/netback/netback.c
++ * 
++ * Back-end of the driver for virtual network devices. This portion of the
++ * driver exports a 'unified' network-device interface that can be accessed
++ * by any operating system that implements a compatible front end. A 
++ * reference front-end implementation can be found in:
++ *  drivers/xen/netfront/netfront.c
++ * 
++ * Copyright (c) 2002-2005, K A Fraser
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++#include <linux/if_vlan.h>
++#include <linux/kthread.h>
++#include <linux/vmalloc.h>
++#include <net/tcp.h>
++#include <xen/balloon.h>
++#include <xen/evtchn.h>
++#include <xen/gnttab.h>
++#include <xen/interface/memory.h>
++#include <xen/net-util.h>
++
++/*define NETBE_DEBUG_INTERRUPT*/
++
++struct xen_netbk *__read_mostly xen_netbk; /* per-group state, indexed by group number */
++unsigned int __read_mostly netbk_nr_groups; /* number of groups; settable through the "groups" module parameter */
++static bool __read_mostly use_kthreads = true; /* cleared when the "tasklets" module parameter is set */
++static bool __initdata bind_threads; /* value of the "bind" module parameter */
++
++#define GET_GROUP_INDEX(netif) ((netif)->group) /* index into xen_netbk[] for the group an interface belongs to */
++
++static void netif_idx_release(struct xen_netbk *, u16 pending_idx); /* prototypes only; the definitions come later in the file */
++static void make_tx_response(netif_t *netif, 
++                           netif_tx_request_t *txp,
++                           s8       st);
++static netif_rx_response_t *make_rx_response(netif_t *netif, 
++                                           u16      id, 
++                                           s8       st,
++                                           u16      offset,
++                                           u16      size,
++                                           u16      flags);
++
++static void net_tx_action(unsigned long group); /* 'group' is an index into xen_netbk[] -- see net_rx_action() */
++static void net_rx_action(unsigned long group);
++
++/* Return the page frame number of the page stored in slot @idx of @netbk->mmap_pages. */
++static inline unsigned long idx_to_pfn(struct xen_netbk *netbk, unsigned int idx)
++{
++      struct page *pg = netbk->mmap_pages[idx];
++
++      return page_to_pfn(pg);
++}
++
++/* Return the kernel virtual address (as an integer) of the page in slot @idx of @netbk->mmap_pages. */
++static inline unsigned long idx_to_kaddr(struct xen_netbk *netbk, unsigned int idx)
++{
++      unsigned long pfn = idx_to_pfn(netbk, idx);
++
++      return (unsigned long)pfn_to_kaddr(pfn);
++}
++
++/*
++ * Extra field used in struct page: a (group, index) pair that is stored
++ * in place of page->mapping (see netif_set_page_ext()). On builds where
++ * BITS_PER_LONG < 64 the two values share one word as bitfields, with
++ * CONFIG_XEN_NETDEV_TX_SHIFT bits for the index and the rest for the
++ * group; otherwise each gets a full unsigned int. MAX_GROUPS is defined
++ * to match the width available for the group.
++ */
++union page_ext {
++      struct {
++#if BITS_PER_LONG < 64
++#define GROUP_WIDTH (BITS_PER_LONG - CONFIG_XEN_NETDEV_TX_SHIFT)
++#define MAX_GROUPS ((1U << GROUP_WIDTH) - 1)
++              unsigned int grp:GROUP_WIDTH;
++              unsigned int idx:CONFIG_XEN_NETDEV_TX_SHIFT;
++#else
++#define MAX_GROUPS UINT_MAX
++              unsigned int grp, idx;
++#endif
++      } e;
++      void *mapping; /* the same bits viewed as the value kept in page->mapping */
++};
++
++/*
++ * Record (@group, @idx) in @pg by overwriting pg->mapping with the packed
++ * pair. The group is stored incremented by one; netif_page_group() undoes
++ * that again.
++ */
++static inline void netif_set_page_ext(struct page *pg, unsigned int group,
++                                    unsigned int idx)
++{
++      union page_ext packed = { .e = { .grp = group + 1, .idx = idx } };
++
++      /* The packed pair must fit into the pointer it is stored in. */
++      BUILD_BUG_ON(sizeof(packed) > sizeof(packed.mapping));
++      pg->mapping = packed.mapping;
++}
++
++/*
++ * Return the group number stored in @pg by netif_set_page_ext(), i.e. the
++ * stored value minus the bias of one applied there.
++ */
++static inline unsigned int netif_page_group(const struct page *pg)
++{
++      union page_ext packed = { .mapping = pg->mapping };
++      unsigned int biased = packed.e.grp;
++
++      return biased - 1;
++}
++
++/* Return the index stored in @pg by netif_set_page_ext(). */
++static inline unsigned int netif_page_index(const struct page *pg)
++{
++      union page_ext packed = { .mapping = pg->mapping };
++
++      return packed.e.idx;
++}
++
++/*
++ * This is the amount of packet data we copy rather than map, so that the
++ * guest can't fiddle with the contents of the headers while we do
++ * packet processing on them (netfilter, routing, etc). It is sized for
++ * an Ethernet header with a VLAN tag plus IP and TCP headers carrying
++ * the maximum amount of options.
++ */
++#define PKT_PROT_LEN    (ETH_HLEN + VLAN_HLEN + \
++                       sizeof(struct iphdr) + MAX_IPOPTLEN + \
++                       sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
++
++#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) /* NOTE(review): wraps correctly only if MAX_PENDING_REQS is a power of two -- confirm */
++
++/*
++ * Return MAX_PENDING_REQS minus the distance between the pending-ring
++ * producer and consumer indices of @netbk.
++ */
++static inline pending_ring_idx_t nr_pending_reqs(const struct xen_netbk *netbk)
++{
++      return MAX_PENDING_REQS + netbk->pending_cons - netbk->pending_prod;
++}
++
++/* Setting this allows the safe use of this driver without netloop. */
++static int MODPARM_copy_skb = 1; /* "copy_skb" parameter, on by default */
++module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
++MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
++static int MODPARM_permute_returns = 0; /* "permute_returns" parameter, off by default; the only one here with non-zero sysfs permissions */
++module_param_named(permute_returns, MODPARM_permute_returns, bool, S_IRUSR|S_IWUSR);
++MODULE_PARM_DESC(permute_returns, "Randomly permute the order in which TX responses are sent to the frontend");
++module_param_named(groups, netbk_nr_groups, uint, 0);
++MODULE_PARM_DESC(groups, "Specify the number of tasklet pairs/threads to use");
++module_param_named(tasklets, use_kthreads, invbool, 0); /* invbool: setting "tasklets" clears use_kthreads */
++MODULE_PARM_DESC(tasklets, "Use tasklets instead of kernel threads");
++module_param_named(bind, bind_threads, bool, 0);
++MODULE_PARM_DESC(bind, "Bind kernel threads to (v)CPUs");
++
++int netbk_copy_skb_mode; /* not static; NOTE(review): set and consumed outside this part of the file */
++
++/*
++ * Take the most recently added frame number off @netbk->mfn_list. The
++ * list must not be empty; callers are expected to have refilled it
++ * through check_mfn() beforehand.
++ */
++static inline unsigned long alloc_mfn(struct xen_netbk *netbk)
++{
++      BUG_ON(netbk->alloc_index == 0);
++
++      netbk->alloc_index--;
++      return netbk->mfn_list[netbk->alloc_index];
++}
++
++/*
++ * Make sure at least @nr frame numbers are available in @netbk->mfn_list.
++ *
++ * If fewer are present, ask the hypervisor to top the list up to
++ * MAX_MFN_ALLOC entries; whatever it grants is added. Returns 0 when @nr
++ * entries are available afterwards, -ENOMEM otherwise.
++ */
++static int check_mfn(struct xen_netbk *netbk, unsigned int nr)
++{
++      struct xen_memory_reservation refill = {
++              .domid        = DOMID_SELF,
++              .extent_order = 0,
++      };
++      int granted;
++
++      if (likely(netbk->alloc_index >= nr))
++              return 0;
++
++      /* Request enough extents to fill the unused tail of the list. */
++      refill.nr_extents = MAX_MFN_ALLOC - netbk->alloc_index;
++      set_xen_guest_handle(refill.extent_start,
++                           netbk->mfn_list + netbk->alloc_index);
++
++      granted = HYPERVISOR_memory_op(XENMEM_increase_reservation, &refill);
++      if (likely(granted > 0))
++              netbk->alloc_index += granted;
++
++      if (netbk->alloc_index < nr)
++              return -ENOMEM;
++
++      return 0;
++}
++
++/*
++ * Kick the worker of @netbk: wake its kernel thread when threads are in
++ * use, otherwise schedule its TX tasklet.
++ */
++static void netbk_schedule(struct xen_netbk *netbk)
++{
++      if (!use_kthreads) {
++              tasklet_schedule(&netbk->net_tx_tasklet);
++              return;
++      }
++
++      wake_up(&netbk->netbk_action_wq);
++}
++
++/* Same as netbk_schedule(), but takes the group number instead of a pointer. */
++static void netbk_schedule_group(unsigned long group)
++{
++      struct xen_netbk *netbk = &xen_netbk[group];
++
++      netbk_schedule(netbk);
++}
++
++/*
++ * Kick the worker of group @group, but only if nr_pending_reqs() is below
++ * half of MAX_PENDING_REQS and the group's schedule list is not empty.
++ */
++static inline void maybe_schedule_tx_action(unsigned int group)
++{
++      struct xen_netbk *netbk = &xen_netbk[group];
++
++      /* Full barrier before sampling the ring and list state below. */
++      smp_mb();
++
++      if (nr_pending_reqs(netbk) >= (MAX_PENDING_REQS/2))
++              return;
++      if (list_empty(&netbk->schedule_list))
++              return;
++
++      netbk_schedule(netbk);
++}
++
++static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
++{     /*
++       * Build a private copy of @skb whose data lives entirely in locally
++       * allocated memory: as much as fits goes into the new linear area,
++       * the rest into freshly allocated pages (one fragment per page).
++       * Only the payload, the GSO size/type and the header offsets are
++       * copied; @skb itself is neither modified nor freed. Returns the
++       * copy, or NULL when an allocation fails or more than MAX_SKB_FRAGS
++       * pages would be needed. @skb must not carry a frag_list.
++       */
++      struct skb_shared_info *ninfo;
++      struct sk_buff *nskb;
++      unsigned long offset;
++      int ret;
++      int len;
++      int headlen;
++
++      BUG_ON(skb_shinfo(skb)->frag_list != NULL);
++
++      nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
++      if (unlikely(!nskb))
++              goto err;
++
++      skb_reserve(nskb, 16 + NET_IP_ALIGN);
++      headlen = skb_end_pointer(nskb) - nskb->data; /* room left in the new linear area */
++      if (headlen > skb_headlen(skb))
++              headlen = skb_headlen(skb); /* never copy more than the source's linear part */
++      ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
++      BUG_ON(ret);
++
++      ninfo = skb_shinfo(nskb);
++      ninfo->gso_size = skb_shinfo(skb)->gso_size;
++      ninfo->gso_type = skb_shinfo(skb)->gso_type;
++
++      offset = headlen;
++      len = skb->len - headlen; /* bytes that still have to go into page fragments */
++
++      nskb->len = skb->len;
++      nskb->data_len = len;
++      nskb->truesize += len;
++
++      while (len) {
++              struct page *page;
++              int copy;
++              int zero;
++
++              if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) { /* remaining data does not fit into the fragment array */
++                      dump_stack();
++                      goto err_free;
++              }
++
++              copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
++              zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO; /* a page that will only be partly filled is requested zeroed */
++
++              page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
++              if (unlikely(!page))
++                      goto err_free;
++
++              ret = skb_copy_bits(skb, offset, page_address(page), copy);
++              BUG_ON(ret);
++
++              ninfo->frags[ninfo->nr_frags].page = page;
++              ninfo->frags[ninfo->nr_frags].page_offset = 0;
++              ninfo->frags[ninfo->nr_frags].size = copy;
++              ninfo->nr_frags++;
++
++              offset += copy;
++              len -= copy;
++      }
++
++#ifdef NET_SKBUFF_DATA_USES_OFFSET
++      offset = 0; /* presumably the header fields are offsets here, so they carry over unchanged */
++#else
++      offset = nskb->data - skb->data; /* header fields are rebased by the distance between the two data areas */
++#endif
++
++      nskb->transport_header = skb->transport_header + offset;
++      nskb->network_header   = skb->network_header   + offset;
++      nskb->mac_header       = skb->mac_header       + offset;
++
++      return nskb;
++
++ err_free:
++      kfree_skb(nskb);
++ err:
++      return NULL;
++}
++
++/*
++ * Worst-case number of RX ring slots one packet can need on @netif: a
++ * single slot when the frontend supports neither scatter-gather nor GSO,
++ * otherwise one per possible fragment plus two.
++ */
++static inline int netbk_max_required_rx_slots(netif_t *netif)
++{
++      if (!netif->can_sg && !netif->gso)
++              return 1; /* all in one */
++
++      return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
++}
++
++/*
++ * Return non-zero when the RX ring of @netif cannot be guaranteed to take
++ * another worst-case packet: either the frontend has posted too few
++ * requests beyond the ones already earmarked (rx_req_cons_peek), or too
++ * few response slots remain.
++ */
++static inline int netbk_queue_full(netif_t *netif)
++{
++      RING_IDX peek   = netif->rx_req_cons_peek;
++      RING_IDX needed = netbk_max_required_rx_slots(netif);
++
++      if ((netif->rx.sring->req_prod - peek) < needed)
++              return 1;
++
++      return (netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed;
++}
++
++/*
++ * Timer callback armed by netif_be_start_xmit() when it stops the queue:
++ * wake the queue again, provided the interface is still schedulable.
++ * @data is the netif_t pointer cast to unsigned long.
++ */
++static void tx_queue_callback(unsigned long data)
++{
++      netif_t *netif = (netif_t *)data;
++
++      if (!netif_schedulable(netif))
++              return;
++
++      netif_wake_queue(netif->dev);
++}
++
++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
++{     /*
++       * Transmit hook of the backend device: queue @skb on the RX queue
++       * of the interface's group for delivery to the frontend. The packet
++       * is dropped (and counted in tx_dropped) when the interface is not
++       * schedulable, the RX ring has too little room, or a required copy
++       * cannot be allocated. Always returns NETDEV_TX_OK.
++       */
++      netif_t *netif = netdev_priv(dev);
++      struct xen_netbk *netbk;
++
++      BUG_ON(skb->dev != dev);
++
++      /* Drop the packet if the target domain has no receive buffers. */
++      if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
++              goto drop;
++
++      /*
++       * Copy the packet here if it's destined for a flipping interface
++       * but isn't flippable (e.g. extra references to data).
++       * XXX For now we also copy skbuffs whose head crosses a page
++       * boundary, because netbk_gop_skb can't handle them.
++       */
++      if (!netif->copying_receiver ||
++          ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE)) {
++              struct sk_buff *nskb = netbk_copy_skb(skb);
++              if ( unlikely(nskb == NULL) )
++                      goto drop;
++              /* Copy only the header fields we use in this driver. */
++              nskb->dev = skb->dev;
++              nskb->ip_summed = skb->ip_summed;
++              dev_kfree_skb(skb);
++              skb = nskb;
++      }
++
++      netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
++                                 !!skb_shinfo(skb)->gso_size; /* earmark ring slots: one per fragment, one for the head, one more with GSO */
++      netif_get(netif);
++
++      if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
++              netif->rx.sring->req_event = netif->rx_req_cons_peek +
++                      netbk_max_required_rx_slots(netif);
++              mb(); /* request notification /then/ check & stop the queue */
++              if (netbk_queue_full(netif)) {
++                      netif_stop_queue(dev);
++                      /*
++                       * Schedule 500ms timeout to restart the queue, thus
++                       * ensuring that an inactive queue will be drained.
++                       * Packets will immediately be dropped until more
++                       * receive buffers become available (see
++                       * netbk_queue_full() check above).
++                       */
++                      netif->tx_queue_timeout.data = (unsigned long)netif;
++                      netif->tx_queue_timeout.function = tx_queue_callback;
++                      mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
++              }
++      }
++
++      netbk = &xen_netbk[GET_GROUP_INDEX(netif)];
++      skb_queue_tail(&netbk->rx_queue, skb);
++      netbk_schedule(netbk);
++
++      return NETDEV_TX_OK;
++
++ drop:
++      dev->stats.tx_dropped++;
++      dev_kfree_skb(skb);
++      return NETDEV_TX_OK;
++}
++
++#if 0 /* compiled out: unfinished NAPI hook sketch -- note the "???" placeholder and the bare rx_queue reference */
++static void xen_network_done_notify(void)
++{
++      static struct net_device *eth0_dev = NULL;
++      if (unlikely(eth0_dev == NULL))
++              eth0_dev = __dev_get_by_name(&init_net, "eth0");
++      napi_schedule(???);
++}
++/* 
++ * Add following to poll() function in NAPI driver (Tigon3 is example):
++ *  if ( xen_network_done() )
++ *      tg3_enable_ints(tp);
++ */
++int xen_network_done(void)
++{
++      return skb_queue_empty(&rx_queue);
++}
++#endif
++
++struct netrx_pending_operations { /* batch of hypervisor operations built up while processing the RX queue */
++      unsigned trans_prod, trans_cons; /* next free / next unchecked entry of trans[] */
++      unsigned mmu_prod, mmu_mcl; /* mmu_prod: next free entry of mmu[]; mmu_mcl: NOTE(review) used outside the code in view */
++      unsigned mcl_prod, mcl_cons; /* next free / next unchecked entry of mcl[] */
++      unsigned copy_prod, copy_cons; /* next free / next unchecked entry of copy[] */
++      unsigned meta_prod, meta_cons; /* next free / next unprocessed entry of meta[] */
++      mmu_update_t *mmu; /* machine-to-physical updates queued for flipped pages */
++      gnttab_transfer_t *trans; /* grant transfers, used for flipping receivers */
++      gnttab_copy_t *copy; /* grant copies, used for copying receivers */
++      multicall_entry_t *mcl; /* multicall entries (update_va_mapping for flipped pages) */
++      struct netbk_rx_meta *meta; /* one entry per packet piece: the head and every fragment */
++};
++
++/* Set up the grant operations for this fragment.  If it's a flipping
++   interface, we also set up the unmap request from here.
++
++   netif:  interface the data is delivered to
++   meta:   bookkeeping entry of this piece; only its 'copy' flag is set here
++   i:      slot of the RX request to use, counted from netif->rx.req_cons
++   npo:    batch the new operation(s) are appended to
++   page, size, offset: location of the data
++
++   Copying receiver: one grant copy into the request's grant reference is
++   queued. If the page is a foreign page tracked by netback (valid
++   group/index stored in page->mapping), the copy is sourced from the
++   originating guest's grant reference, otherwise from the local frame.
++   Flipping receiver: one grant transfer of the page's current frame is
++   queued; unless the physmap is auto-translated, a replacement frame is
++   taken from the mfn list and the va-mapping and machine-to-physical
++   updates for it are queued as well.
++
++   Returns the id of the RX request that was used. */
++static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
++                        int i, struct netrx_pending_operations *npo,
++                        struct page *page, unsigned long size,
++                        unsigned long offset)
++{
++      mmu_update_t *mmu;
++      gnttab_transfer_t *gop;
++      gnttab_copy_t *copy_gop;
++      multicall_entry_t *mcl;
++      netif_rx_request_t *req;
++      unsigned long old_mfn, new_mfn;
++      struct xen_netbk *netbk = &xen_netbk[GET_GROUP_INDEX(netif)];
++
++      old_mfn = virt_to_mfn(page_address(page));
++
++      req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
++      if (netif->copying_receiver) {
++              unsigned int group, idx;
++
++              /* The fragment needs to be copied rather than
++                 flipped. */
++              meta->copy = 1;
++              copy_gop = npo->copy + npo->copy_prod++;
++              copy_gop->flags = GNTCOPY_dest_gref;
++              if (PageForeign(page) &&
++                  page->mapping != NULL &&
++                  (idx = netif_page_index(page)) < MAX_PENDING_REQS &&
++                  (group = netif_page_group(page)) < netbk_nr_groups) {
++                      struct pending_tx_info *src_pend;
++                      unsigned int grp;
++
++                      netbk = &xen_netbk[group]; /* switch to the group that owns the page */
++                      BUG_ON(netbk->mmap_pages[idx] != page);
++                      src_pend = &netbk->pending_tx_info[idx];
++                      grp = GET_GROUP_INDEX(src_pend->netif);
++                      BUG_ON(group != grp && grp != UINT_MAX);
++                      copy_gop->source.domid = src_pend->netif->domid;
++                      copy_gop->source.u.ref = src_pend->req.gref;
++                      copy_gop->flags |= GNTCOPY_source_gref;
++              } else {
++                      copy_gop->source.domid = DOMID_SELF;
++                      copy_gop->source.u.gmfn = old_mfn;
++              }
++              copy_gop->source.offset = offset;
++              copy_gop->dest.domid = netif->domid;
++              copy_gop->dest.offset = 0; /* data always lands at the start of the destination page */
++              copy_gop->dest.u.ref = req->gref;
++              copy_gop->len = size;
++      } else {
++              meta->copy = 0;
++              if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                      new_mfn = alloc_mfn(netbk);
++
++                      /*
++                       * Set the new P2M table entry before
++                       * reassigning the old data page. Heed the
++                       * comment in pgtable-2level.h:pte_page(). :-)
++                       */
++                      set_phys_to_machine(page_to_pfn(page), new_mfn);
++
++                      mcl = npo->mcl + npo->mcl_prod++;
++                      MULTI_update_va_mapping(mcl,
++                                           (unsigned long)page_address(page),
++                                           pfn_pte_ma(new_mfn, PAGE_KERNEL),
++                                           0);
++
++                      mmu = npo->mmu + npo->mmu_prod++;
++                      mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
++                              MMU_MACHPHYS_UPDATE;
++                      mmu->val = page_to_pfn(page);
++              }
++
++              gop = npo->trans + npo->trans_prod++;
++              gop->mfn = old_mfn;
++              gop->domid = netif->domid;
++              gop->ref = req->gref;
++      }
++      return req->id;
++}
++
++static void netbk_gop_skb(struct sk_buff *skb,
++                        struct netrx_pending_operations *npo)
++{     /*
++       * Queue the grant operations for every piece of @skb (its head and
++       * each page fragment) into @npo and consume the RX ring requests
++       * they use. One netbk_rx_meta entry is reserved per piece, the
++       * head's entry first.
++       */
++      netif_t *netif = netdev_priv(skb->dev);
++      int nr_frags = skb_shinfo(skb)->nr_frags;
++      int i;
++      int extra;
++      struct netbk_rx_meta *head_meta, *meta;
++
++      head_meta = npo->meta + npo->meta_prod++;
++      head_meta->frag.page_offset = skb_shinfo(skb)->gso_type; /* the head entry's frag fields are reused to carry the GSO type and size */
++      head_meta->frag.size = skb_shinfo(skb)->gso_size;
++      extra = !!head_meta->frag.size + 1; /* ring slots ahead of the fragments: the head, plus one more when GSO is in use */
++
++      for (i = 0; i < nr_frags; i++) {
++              meta = npo->meta + npo->meta_prod++;
++              meta->frag = skb_shinfo(skb)->frags[i];
++              meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
++                                        meta->frag.page,
++                                        meta->frag.size,
++                                        meta->frag.page_offset);
++      }
++
++      /*
++       * This must occur at the end to ensure that we don't trash skb_shinfo
++       * until we're done. We know that the head doesn't cross a page
++       * boundary because such packets get copied in netif_be_start_xmit.
++       */
++      head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
++                                     virt_to_page(skb->data),
++                                     skb_headlen(skb),
++                                     offset_in_page(skb->data));
++
++      netif->rx.req_cons += nr_frags + extra;
++}
++
++/* Drop one page reference for each of the first @nr_frags entries of @meta. */
++static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
++{
++      int idx = 0;
++
++      while (idx < nr_frags) {
++              put_page(meta[idx].frag.page);
++              idx++;
++      }
++}
++
++/* This is a twin to netbk_gop_skb.  Assume that netbk_gop_skb was
++   used to set up the operations on the top of
++   netrx_pending_operations, which have since been done.  Check that
++   they didn't give any errors and advance over them.
++
++   nr_frags + 1 meta entries are examined (the head plus every
++   fragment), starting at npo->meta_cons; the copy/mcl/trans consumer
++   indices are advanced, meta_cons itself is only read here.
++
++   Returns XEN_NETIF_RSP_OKAY if every operation succeeded,
++   XEN_NETIF_RSP_ERROR otherwise. */
++static int netbk_check_gop(int nr_frags, domid_t domid, struct netrx_pending_operations *npo)
++{
++      multicall_entry_t *mcl;
++      gnttab_transfer_t *gop;
++      gnttab_copy_t     *copy_op;
++      int status = XEN_NETIF_RSP_OKAY;
++      int i;
++
++      for (i = 0; i <= nr_frags; i++) { /* "<=": one extra iteration for the head */
++              if (npo->meta[npo->meta_cons + i].copy) {
++                      copy_op = npo->copy + npo->copy_cons++;
++                      if (unlikely(copy_op->status == GNTST_eagain))
++                              gnttab_check_GNTST_eagain_while(GNTTABOP_copy, copy_op);
++                      if (unlikely(copy_op->status != GNTST_okay)) {
++                              DPRINTK("Bad status %d from copy to DOM%d.\n",
++                                      copy_op->status, domid);
++                              status = XEN_NETIF_RSP_ERROR;
++                      }
++              } else {
++                      if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                              mcl = npo->mcl + npo->mcl_cons++;
++                              /* The update_va_mapping() must not fail. */
++                              BUG_ON(mcl->result != 0);
++                      }
++
++                      gop = npo->trans + npo->trans_cons++;
++                      /* Check the reassignment error code. */
++                      if (unlikely(gop->status != GNTST_okay)) {
++                              DPRINTK("Bad status %d from grant transfer to DOM%u\n",
++                                      gop->status, domid);
++                              /*
++                               * Page no longer belongs to us unless
++                               * GNTST_bad_page, but that should be
++                               * a fatal error anyway.
++                               */
++                              BUG_ON(gop->status == GNTST_bad_page);
++                              status = XEN_NETIF_RSP_ERROR;
++                      }
++              }
++      }
++
++      return status;
++}
++
++/*
++ * Emit one rx response per fragment described by meta[0..nr_frags-1].
++ * Every response except the last carries XEN_NETRXF_more_data; entries
++ * flagged as copies are reported with offset 0, the others with the
++ * fragment's page offset.
++ */
++static void netbk_add_frag_responses(netif_t *netif, int status,
++                                   struct netbk_rx_meta *meta, int nr_frags)
++{
++      int idx;
++
++      for (idx = 0; idx < nr_frags; idx++) {
++              struct netbk_rx_meta *m = &meta[idx];
++              int id = m->id;
++              int flags = XEN_NETRXF_more_data;
++              unsigned long offset;
++
++              /* The final fragment terminates the chain. */
++              if (idx == nr_frags - 1)
++                      flags = 0;
++
++              if (m->copy)
++                      offset = 0;
++              else
++                      offset = m->frag.page_offset;
++
++              make_rx_response(netif, id, status, offset,
++                               m->frag.size, flags);
++      }
++}
++/*
++ * net_rx_action - process the skbs waiting on one group's rx_queue.
++ * @group: index into xen_netbk[] selecting the instance to service.
++ *
++ * Runs in three stages:
++ *  1. dequeue skbs from netbk->rx_queue, pass each to netbk_gop_skb()
++ *     together with the npo bookkeeping structure, and park them on the
++ *     local rxq list;
++ *  2. turn the npo producer counts into multicall entries and issue one
++ *     HYPERVISOR_multicall();
++ *  3. for every parked skb, check the results with netbk_check_gop(),
++ *     build and push the rx responses, and collect the irqs that need a
++ *     notification; those notifications are sent at the end.
++ * Finally reschedules via netbk_schedule() if rx_queue is still non-empty
++ * and net_timer is not pending.
++ */
++static void net_rx_action(unsigned long group)
++{
++      netif_t *netif = NULL;
++      s8 status;
++      u16 id, irq, flags;
++      netif_rx_response_t *resp;
++      multicall_entry_t *mcl;
++      struct sk_buff_head rxq; /* skbs batched in this invocation */
++      struct sk_buff *skb;
++      int notify_nr = 0; /* number of entries used in netbk->notify_list */
++      int ret;
++      int nr_frags;
++      int count;
++      unsigned long offset;
++      struct xen_netbk *netbk = &xen_netbk[group];
++
++      struct netrx_pending_operations npo = {
++              .mmu   = netbk->rx_mmu,
++              .trans = netbk->grant_trans_op,
++              .copy  = netbk->grant_copy_op,
++              .mcl   = netbk->rx_mcl,
++              .meta  = netbk->meta,
++      };
++
++      skb_queue_head_init(&rxq);
++
++      count = 0;
++
++      while ((skb = skb_dequeue(&netbk->rx_queue)) != NULL) {
++              nr_frags = skb_shinfo(skb)->nr_frags;
++              *(int *)skb->cb = nr_frags; /* read back in the response loop below */
++
++              if (!xen_feature(XENFEAT_auto_translated_physmap) &&
++                  !((netif_t *)netdev_priv(skb->dev))->copying_receiver &&
++                  check_mfn(netbk, nr_frags + 1)) {
++                      /* Memory squeeze? Back off for an arbitrary while. */
++                      if ( net_ratelimit() )
++                              WPRINTK("Memory squeeze in netback "
++                                      "driver.\n");
++                      mod_timer(&netbk->net_timer, jiffies + HZ);
++                      skb_queue_head(&netbk->rx_queue, skb); /* put it back for the retry */
++                      break;
++              }
++
++              netbk_gop_skb(skb, &npo);
++
++              count += nr_frags + 1;
++
++              __skb_queue_tail(&rxq, skb);
++
++              /* Filled the batch queue? */
++              if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
++                      break;
++      }
++
++      BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk->meta));
++
++      npo.mmu_mcl = npo.mcl_prod; /* index the mmu_update entry gets if one is appended */
++      if (npo.mcl_prod) {
++              BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++              BUG_ON(npo.mmu_prod > ARRAY_SIZE(netbk->rx_mmu));
++              mcl = npo.mcl + npo.mcl_prod++;
++
++              /* mcl[-1] is the last entry queued before this one. */
++              BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
++              mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
++
++              mcl->op = __HYPERVISOR_mmu_update;
++              mcl->args[0] = (unsigned long)netbk->rx_mmu;
++              mcl->args[1] = npo.mmu_prod;
++              mcl->args[2] = 0;
++              mcl->args[3] = DOMID_SELF;
++      }
++
++      if (npo.trans_prod) {
++              BUG_ON(npo.trans_prod > ARRAY_SIZE(netbk->grant_trans_op));
++              mcl = npo.mcl + npo.mcl_prod++;
++              mcl->op = __HYPERVISOR_grant_table_op;
++              mcl->args[0] = GNTTABOP_transfer;
++              mcl->args[1] = (unsigned long)netbk->grant_trans_op;
++              mcl->args[2] = npo.trans_prod;
++      }
++
++      if (npo.copy_prod) {
++              BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op));
++              mcl = npo.mcl + npo.mcl_prod++;
++              mcl->op = __HYPERVISOR_grant_table_op;
++              mcl->args[0] = GNTTABOP_copy;
++              mcl->args[1] = (unsigned long)netbk->grant_copy_op;
++              mcl->args[2] = npo.copy_prod;
++      }
++
++      /* Nothing to do? */
++      if (!npo.mcl_prod)
++              return;
++
++      BUG_ON(npo.mcl_prod > ARRAY_SIZE(netbk->rx_mcl));
++
++      ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
++      BUG_ON(ret != 0);
++      /* The mmu_machphys_update() must not fail. */
++      BUG_ON(npo.mmu_mcl && npo.mcl[npo.mmu_mcl].result != 0);
++
++      while ((skb = __skb_dequeue(&rxq)) != NULL) {
++              nr_frags = *(int *)skb->cb; /* value stashed in the first loop */
++
++              netif = netdev_priv(skb->dev);
++
++              status = netbk_check_gop(nr_frags, netif->domid, &npo);
++
++              /* We can't rely on skb_release_data to release the
++                 pages used by fragments for us, since it tries to
++                 touch the pages in the fraglist.  If we're in
++                 flipping mode, that doesn't work.  In copying mode,
++                 we still have access to all of the pages, and so
++                 it's safe to let release_data deal with it. */
++              /* (Freeing the fragments is safe since we copy
++                 non-linear skbs destined for flipping interfaces) */
++              if (!netif->copying_receiver) {
++                      atomic_set(&(skb_shinfo(skb)->dataref), 1);
++                      skb_shinfo(skb)->frag_list = NULL;
++                      skb_shinfo(skb)->nr_frags = 0;
++                      netbk_free_pages(nr_frags, netbk->meta + npo.meta_cons + 1); /* +1 skips this skb's first meta entry */
++              }
++
++              skb->dev->stats.tx_bytes += skb->len;
++              skb->dev->stats.tx_packets++;
++
++              id = netbk->meta[npo.meta_cons].id;
++              flags = nr_frags ? XEN_NETRXF_more_data : 0;
++
++              switch (skb->ip_summed) {
++              case CHECKSUM_PARTIAL: /* local packet? */
++                      flags |= XEN_NETRXF_csum_blank |
++                               XEN_NETRXF_data_validated;
++                      break;
++              case CHECKSUM_UNNECESSARY: /* remote but checksummed? */
++                      flags |= XEN_NETRXF_data_validated;
++                      break;
++              }
++
++              if (netbk->meta[npo.meta_cons].copy)
++                      offset = 0;
++              else
++                      offset = offset_in_page(skb->data);
++              resp = make_rx_response(netif, id, status, offset,
++                                      skb_headlen(skb), flags);
++
++              if (netbk->meta[npo.meta_cons].frag.size) { /* nonzero value is sent as the GSO size */
++                      struct netif_extra_info *gso =
++                              (struct netif_extra_info *)
++                              RING_GET_RESPONSE(&netif->rx,
++                                                netif->rx.rsp_prod_pvt++);
++
++                      resp->flags |= XEN_NETRXF_extra_info;
++
++                      gso->u.gso.size = netbk->meta[npo.meta_cons].frag.size;
++                      gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
++                      gso->u.gso.pad = 0;
++                      gso->u.gso.features = 0;
++
++                      gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
++                      gso->flags = 0;
++              }
++
++              netbk_add_frag_responses(netif, status,
++                                       netbk->meta + npo.meta_cons + 1,
++                                       nr_frags);
++
++              RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
++              irq = netif->irq - DYNIRQ_BASE; /* used as the bit index in rx_notify */
++              if (ret && !__test_and_set_bit(irq, netbk->rx_notify))
++                      netbk->notify_list[notify_nr++] = irq; /* record each irq only once */
++
++              if (netif_queue_stopped(netif->dev) &&
++                  netif_schedulable(netif) &&
++                  !netbk_queue_full(netif))
++                      netif_wake_queue(netif->dev);
++
++              netif_put(netif);
++              dev_kfree_skb(skb);
++
++              npo.meta_cons += nr_frags + 1; /* step past this skb's meta entries */
++      }
++
++      if (notify_nr == 1) { /* a single notification is sent directly */
++              irq = *netbk->notify_list;
++              __clear_bit(irq, netbk->rx_notify);
++              notify_remote_via_irq(irq + DYNIRQ_BASE);
++      } else { /* otherwise batch them; also taken with notify_nr == 0, count stays 0 */
++              for (count = ret = 0; ret < notify_nr; ++ret) {
++                      irq = netbk->notify_list[ret];
++                      __clear_bit(irq, netbk->rx_notify);
++                      if (!multi_notify_remote_via_irq(netbk->rx_mcl + count,
++                                                       irq + DYNIRQ_BASE))
++                              ++count;
++              }
++              if (HYPERVISOR_multicall(netbk->rx_mcl, count))
++                      BUG();
++      }
++
++      /* More work to do? */
++      if (!skb_queue_empty(&netbk->rx_queue) &&
++          !timer_pending(&netbk->net_timer))
++              netbk_schedule(netbk);
++#if 0
++      else
++              xen_network_done_notify();
++#endif
++}
++
++/*
++ * Report whether @netif is currently linked on a schedule list.  A NULL
++ * list.next is the "not queued" marker.
++ */
++static int __on_net_schedule_list(netif_t *netif)
++{
++      if (netif->list.next == NULL)
++              return 0;
++
++      return 1;
++}
++
++/*
++ * Unlink @netif from the schedule list, if it is on one, and drop one
++ * reference on it.
++ *
++ * Must be called with netbk->schedule_list_lock held.
++ */
++static void remove_from_net_schedule_list(netif_t *netif)
++{
++      if (unlikely(!__on_net_schedule_list(netif)))
++              return;
++
++      list_del(&netif->list);
++      /* Mark the entry as "not queued" for __on_net_schedule_list(). */
++      netif->list.next = NULL;
++      netif_put(netif);
++}
++
++/*
++ * Take the first interface off @netbk's schedule list.
++ *
++ * Returns the interface with one reference taken here, or NULL when the
++ * list is empty.
++ */
++static netif_t *poll_net_schedule_list(struct xen_netbk *netbk)
++{
++      netif_t *first = NULL;
++
++      spin_lock_irq(&netbk->schedule_list_lock);
++
++      if (list_empty(&netbk->schedule_list))
++              goto unlock;
++
++      first = list_first_entry(&netbk->schedule_list, netif_t, list);
++      /* Take our reference before the removal drops one. */
++      netif_get(first);
++      remove_from_net_schedule_list(first);
++
++unlock:
++      spin_unlock_irq(&netbk->schedule_list_lock);
++      return first;
++}
++
++/*
++ * Append @netif to its group's schedule list and take a reference,
++ * unless it is already queued or is not schedulable.
++ */
++static void add_to_net_schedule_list_tail(netif_t *netif)
++{
++      struct xen_netbk *grp = &xen_netbk[GET_GROUP_INDEX(netif)];
++      unsigned long irqflags;
++
++      /* Unlocked check first; it is repeated under the lock below. */
++      if (__on_net_schedule_list(netif))
++              return;
++
++      spin_lock_irqsave(&grp->schedule_list_lock, irqflags);
++      if (!__on_net_schedule_list(netif)) {
++              if (likely(netif_schedulable(netif))) {
++                      list_add_tail(&netif->list, &grp->schedule_list);
++                      netif_get(netif);
++              }
++      }
++      spin_unlock_irqrestore(&grp->schedule_list_lock, irqflags);
++}
++
++/*
++ * Queue @netif for tx processing if its tx ring has requests waiting.
++ *
++ * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
++ * If this driver is pipelining transmit requests then we can be very
++ * aggressive in avoiding new-packet notifications -- the frontend only
++ * needs to send a notification if there are no outstanding unreceived
++ * responses.  If we may be holding on to transmit buffers for any reason
++ * then we must be rather more conservative and treat this as the final
++ * check for pending work.
++ */
++void netif_schedule_work(netif_t *netif)
++{
++      int pending;
++
++#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
++      pending = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
++#else
++      RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, pending);
++#endif
++
++      if (!pending)
++              return;
++
++      add_to_net_schedule_list_tail(netif);
++      maybe_schedule_tx_action(GET_GROUP_INDEX(netif));
++}
++
++/*
++ * Remove @netif from its group's schedule list, taking the list lock
++ * around the removal.
++ */
++void netif_deschedule_work(netif_t *netif)
++{
++      struct xen_netbk *grp = &xen_netbk[GET_GROUP_INDEX(netif)];
++
++      spin_lock_irq(&grp->schedule_list_lock);
++      remove_from_net_schedule_list(netif);
++      spin_unlock_irq(&grp->schedule_list_lock);
++}
++
++
++/*
++ * Replenish @netif's transmit credit by credit_bytes, capped at a burst
++ * limit derived from the request at tx.req_cons.
++ */
++static void tx_add_credit(netif_t *netif)
++{
++      unsigned long burst_cap, credit;
++
++      /*
++       * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
++       * Otherwise the interface can seize up due to insufficient credit.
++       */
++      burst_cap = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
++      if (burst_cap > 131072UL)
++              burst_cap = 131072UL;
++      if (burst_cap < netif->credit_bytes)
++              burst_cap = netif->credit_bytes;
++
++      /* Take care that adding a new chunk of credit doesn't wrap to zero. */
++      credit = netif->remaining_credit + netif->credit_bytes;
++      if (credit < netif->remaining_credit)
++              credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
++
++      netif->remaining_credit = (credit < burst_cap) ? credit : burst_cap;
++}
++
++/*
++ * Timer callback: @data carries the netif_t pointer.  Tops up the
++ * interface's credit and re-checks it for pending tx work.
++ */
++static void tx_credit_callback(unsigned long data)
++{
++      netif_t *vif = (netif_t *)data;
++
++      tx_add_credit(vif);
++      netif_schedule_work(vif);
++}
++
++/*
++ * Call gnttab_copy_grant_page() for pending slot @pending_idx, passing
++ * the slot's grant handle and the address of its mmap_pages entry.
++ * Returns whatever gnttab_copy_grant_page() returns.
++ */
++static inline int copy_pending_req(struct xen_netbk *netbk,
++                                 pending_ring_idx_t pending_idx)
++{
++      int rc;
++
++      rc = gnttab_copy_grant_page(netbk->grant_tx_handle[pending_idx],
++                                  netbk->mmap_pages + pending_idx);
++      return rc;
++}
++
++/*
++ * Shuffle the dealloc ring entries in [dc, dp): each position in turn is
++ * swapped with a pseudo-randomly chosen position at or after it.  The
++ * generator state is a function-local static that persists across calls.
++ */
++static void permute_dealloc_ring(u16 *dealloc_ring, pending_ring_idx_t dc,
++                               pending_ring_idx_t dp)
++{
++      static unsigned random_src = 0x12345678;
++
++      for (; dc != dp; dc++, random_src *= 68389) {
++              unsigned skip = (random_src / 256) % (dp - dc);
++              pending_ring_idx_t pick = dc + skip;
++              u16 *here = &dealloc_ring[MASK_PEND_IDX(dc)];
++              u16 *there = &dealloc_ring[MASK_PEND_IDX(pick)];
++              u16 held = *there;
++
++              *there = *here;
++              *here = held;
++      }
++}
++/*
++ * net_tx_action_dealloc - complete tx slots queued on the dealloc ring.
++ *
++ * Collects every index between dealloc_cons and dealloc_prod onto a local
++ * list and builds an unmap operation for each one whose pfn still has a
++ * valid mapping, then issues all unmaps in one GNTTABOP_unmap_grant_ref
++ * call.  In NETBK_DELAYED_COPY_SKB mode it also walks pending_inuse_head
++ * and calls copy_pending_req() for entries at least HZ/2 jiffies old.
++ * Finally each collected entry gets an XEN_NETIF_RSP_OKAY tx response,
++ * its page is reset, its index is returned to pending_ring and one netif
++ * reference is dropped.
++ */
++inline static void net_tx_action_dealloc(struct xen_netbk *netbk)
++{
++      struct netbk_tx_pending_inuse *inuse, *n;
++      gnttab_unmap_grant_ref_t *gop;
++      u16 pending_idx;
++      pending_ring_idx_t dc, dp;
++      netif_t *netif;
++      LIST_HEAD(list); /* entries to be completed at the end of this call */
++
++      dc = netbk->dealloc_cons;
++      gop = netbk->tx_unmap_ops;
++
++      /*
++       * Free up any grants we have finished using
++       */
++      do {
++              dp = netbk->dealloc_prod;
++
++              /* Ensure we see all indices enqueued by netif_idx_release(). */
++              smp_rmb();
++
++              if (MODPARM_permute_returns)
++                      permute_dealloc_ring(netbk->dealloc_ring, dc, dp);
++
++              while (dc != dp) {
++                      unsigned long pfn;
++                      struct netbk_tx_pending_inuse *pending_inuse =
++                                      netbk->pending_inuse;
++
++                      pending_idx = netbk->dealloc_ring[MASK_PEND_IDX(dc++)];
++                      list_move_tail(&pending_inuse[pending_idx].list, &list);
++
++                      pfn = idx_to_pfn(netbk, pending_idx);
++                      /* Already unmapped? */
++                      if (!phys_to_machine_mapping_valid(pfn))
++                              continue;
++
++                      gnttab_set_unmap_op(gop, idx_to_kaddr(netbk, pending_idx),
++                                          GNTMAP_host_map,
++                                          netbk->grant_tx_handle[pending_idx]);
++                      gop++;
++              }
++
++      } while (dp != netbk->dealloc_prod); /* repeat if dealloc_prod moved during the scan */
++
++      netbk->dealloc_cons = dc;
++
++      if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++                                    netbk->tx_unmap_ops,
++                                    gop - netbk->tx_unmap_ops))
++              BUG();
++
++      /* Copy any entries that have been pending for too long. */
++      if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
++          !list_empty(&netbk->pending_inuse_head)) {
++              list_for_each_entry_safe(inuse, n, &netbk->pending_inuse_head, list) {
++                      struct pending_tx_info *pending_tx_info
++                              = netbk->pending_tx_info;
++
++                      if (time_after(inuse->alloc_time + HZ / 2, jiffies)) /* younger than HZ/2: stop scanning */
++                              break;
++
++                      pending_idx = inuse - netbk->pending_inuse; /* array position is the pending index */
++
++                      pending_tx_info[pending_idx].netif->nr_copied_skbs++;
++
++                      switch (copy_pending_req(netbk, pending_idx)) {
++                      case 0: /* queue the entry for completion below */
++                              list_move_tail(&inuse->list, &list);
++                              continue;
++                      case -EBUSY: /* unlink the entry without completing it */
++                              list_del_init(&inuse->list);
++                              continue;
++                      case -ENOENT: /* leave the entry where it is */
++                              continue;
++                      }
++
++                      break; /* any other return value ends the scan */
++              }
++      }
++
++      list_for_each_entry_safe(inuse, n, &list, list) {
++              struct pending_tx_info *pending_tx_info =
++                      netbk->pending_tx_info;
++
++              pending_idx = inuse - netbk->pending_inuse;
++              netif = pending_tx_info[pending_idx].netif;
++
++              make_tx_response(netif, &pending_tx_info[pending_idx].req, 
++                               XEN_NETIF_RSP_OKAY);
++
++              /* Ready for next use. */
++              gnttab_reset_grant_page(netbk->mmap_pages[pending_idx]);
++
++              netbk->pending_ring[MASK_PEND_IDX(netbk->pending_prod++)] =
++                      pending_idx;
++
++              netif_put(netif);
++
++              list_del_init(&inuse->list);
++      }
++}
++
++/*
++ * Fail a tx packet: send XEN_NETIF_RSP_ERROR for @txp and then for every
++ * request from netif->tx.req_cons up to (but not including) @end.
++ * Afterwards req_cons is advanced to the point reached, the interface is
++ * rescheduled and one reference on @netif is dropped.
++ */
++static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
++{
++      RING_IDX cons = netif->tx.req_cons;
++
++      do {
++              make_tx_response(netif, txp, XEN_NETIF_RSP_ERROR);
++              /*
++               * Ring indices are free-running and wrap around, so a plain
++               * "cons >= end" stops too early once @end has wrapped past
++               * zero while cons has not.  Compare by signed distance
++               * instead; this still stops if cons is already past @end.
++               */
++              if ((int)(end - cons) <= 0)
++                      break;
++              txp = RING_GET_REQUEST(&netif->tx, cons++);
++      } while (1);
++      netif->tx.req_cons = cons;
++      netif_schedule_work(netif);
++      netif_put(netif);
++}
++
++/*
++ * Copy the follow-on requests of a multi-slot tx packet from the ring
++ * into @txp[], starting at netif->tx.req_cons, for as long as the
++ * previous request carries XEN_NETTXF_more_data.  Each copied request's
++ * size is subtracted from first->size.
++ *
++ * Returns the number of requests copied, or the negated count reached
++ * when the chain is rejected (not enough work available, more than
++ * MAX_SKB_FRAGS entries, a request larger than what remains of
++ * first->size, or offset + size beyond PAGE_SIZE).
++ */
++static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first,
++                              netif_tx_request_t *txp, int work_to_do)
++{
++      RING_IDX base = netif->tx.req_cons;
++      int more = first->flags & XEN_NETTXF_more_data;
++      int nr = 0;
++
++      while (more) {
++              netif_tx_request_t *slot = &txp[nr];
++
++              if (nr >= work_to_do) {
++                      DPRINTK("Need more frags\n");
++                      return -nr;
++              }
++
++              if (unlikely(nr >= MAX_SKB_FRAGS)) {
++                      DPRINTK("Too many frags\n");
++                      return -nr;
++              }
++
++              memcpy(slot, RING_GET_REQUEST(&netif->tx, base + nr),
++                     sizeof(*slot));
++              if (slot->size > first->size) {
++                      DPRINTK("Frags galore\n");
++                      return -nr;
++              }
++
++              first->size -= slot->size;
++              /* From here on the count includes this request. */
++              nr++;
++
++              if (unlikely((slot->offset + slot->size) > PAGE_SIZE)) {
++                      DPRINTK("txp->offset: %x, size: %u\n",
++                              slot->offset, slot->size);
++                      return -nr;
++              }
++
++              more = slot->flags & XEN_NETTXF_more_data;
++      }
++
++      return nr;
++}
++
++/*
++ * Set up one grant-map operation per skb fragment that needs its own
++ * pending slot.  For each such fragment a slot index is taken from
++ * pending_ring, the request from @txp is recorded in pending_tx_info
++ * together with a reference on @netif, and the slot index is stored in
++ * frags[].page for later lookup.
++ *
++ * Returns @mop advanced past the operations that were filled in.
++ */
++static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
++                                                struct sk_buff *skb,
++                                                netif_tx_request_t *txp,
++                                                gnttab_map_grant_ref_t *mop)
++{
++      struct xen_netbk *netbk = &xen_netbk[GET_GROUP_INDEX(netif)];
++      struct skb_shared_info *shinfo = skb_shinfo(skb);
++      unsigned long slot = *((u16 *)skb->data);
++      int first, n;
++
++      /* Skip first skb fragment if it is on same page as header fragment. */
++      first = ((unsigned long)shinfo->frags[0].page == slot);
++
++      for (n = first; n < shinfo->nr_frags; n++, txp++) {
++              struct pending_tx_info *info;
++              pending_ring_idx_t ring_pos;
++
++              ring_pos = MASK_PEND_IDX(netbk->pending_cons++);
++              slot = netbk->pending_ring[ring_pos];
++
++              gnttab_set_map_op(mop, idx_to_kaddr(netbk, slot),
++                                GNTMAP_host_map | GNTMAP_readonly,
++                                txp->gref, netif->domid);
++              mop++;
++
++              info = &netbk->pending_tx_info[slot];
++              memcpy(&info->req, txp, sizeof(*txp));
++              netif_get(netif);
++              info->netif = netif;
++
++              /* The page pointer temporarily holds the slot index. */
++              shinfo->frags[n].page = (void *)slot;
++      }
++
++      return mop;
++}
++
++/*
++ * Check the grant-map results for one tx skb.
++ *
++ * *mopp points at the map operation for the packet header; the operations
++ * for the fragments follow it.  For every successful mapping the
++ * phys-to-machine entry and grant handle are recorded.  For every failed
++ * mapping an error response is sent, the slot index is returned to
++ * pending_ring and one netif reference is dropped.  Once any operation
++ * has failed, all successfully mapped slots of this skb (header,
++ * preceding and subsequent fragments) are handed to netif_idx_release().
++ *
++ * On return *mopp is advanced past all operations belonging to @skb.
++ * Returns GNTST_okay if everything mapped, otherwise the status of the
++ * first failing operation.
++ */
++static int netbk_tx_check_mop(struct xen_netbk *netbk, struct sk_buff *skb,
++                            gnttab_map_grant_ref_t **mopp)
++{
++      gnttab_map_grant_ref_t *mop = *mopp;
++      int pending_idx = *((u16 *)skb->data);
++      struct pending_tx_info *pending_tx_info = netbk->pending_tx_info;
++      netif_t *netif = pending_tx_info[pending_idx].netif;
++      netif_tx_request_t *txp;
++      struct skb_shared_info *shinfo = skb_shinfo(skb);
++      int nr_frags = shinfo->nr_frags;
++      int i, err, start;
++
++      /* Check status of header. */
++      err = mop->status;
++      if (unlikely(err != GNTST_okay)) {
++              pending_ring_idx_t index = MASK_PEND_IDX(netbk->pending_prod++);
++
++              txp = &pending_tx_info[pending_idx].req;
++              make_tx_response(netif, txp, XEN_NETIF_RSP_ERROR);
++              netbk->pending_ring[index] = pending_idx;
++              netif_put(netif);
++      } else {
++              set_phys_to_machine(idx_to_pfn(netbk, pending_idx),
++                      FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
++              netbk->grant_tx_handle[pending_idx] = mop->handle;
++      }
++
++      /* Skip first skb fragment if it is on same page as header fragment. */
++      start = ((unsigned long)shinfo->frags[0].page == pending_idx);
++
++      for (i = start; i < nr_frags; i++) {
++              int j, newerr;
++              pending_ring_idx_t index;
++
++              /* frags[].page still holds the slot index at this stage. */
++              pending_idx = (unsigned long)shinfo->frags[i].page;
++
++              /* Check error status: if okay then remember grant handle. */
++              newerr = (++mop)->status;
++              if (likely(newerr == GNTST_okay)) {
++                      set_phys_to_machine(idx_to_pfn(netbk, pending_idx),
++                              FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
++                      netbk->grant_tx_handle[pending_idx] = mop->handle;
++                      /* Had a previous error? Invalidate this fragment. */
++                      if (unlikely(err != GNTST_okay))
++                              netif_idx_release(netbk, pending_idx);
++                      continue;
++              }
++
++              /* Error on this fragment: respond to client with an error. */
++              txp = &pending_tx_info[pending_idx].req;
++              make_tx_response(netif, txp, XEN_NETIF_RSP_ERROR);
++              index = MASK_PEND_IDX(netbk->pending_prod++);
++              netbk->pending_ring[index] = pending_idx;
++              netif_put(netif);
++
++              /* Not the first error? Preceding frags already invalidated. */
++              if (err != GNTST_okay)
++                      continue;
++
++              /* First error: invalidate header and preceding fragments. */
++              pending_idx = *((u16 *)skb->data);
++              netif_idx_release(netbk, pending_idx);
++              for (j = start; j < i; j++) {
++                      /*
++                       * Index by j, not i: frags[i] is the fragment that
++                       * just failed and has already been handed back above.
++                       */
++                      pending_idx = (unsigned long)shinfo->frags[j].page;
++                      netif_idx_release(netbk, pending_idx);
++              }
++
++              /* Remember the error: invalidate all subsequent fragments. */
++              err = newerr;
++      }
++
++      *mopp = mop + 1;
++      return err;
++}
++
++/*
++ * Turn the slot indices stored in each fragment's page field into real
++ * fragment descriptors.  Each slot is timestamped and appended to
++ * pending_inuse_head, the fragment is pointed at the slot's mmap page
++ * with the size and offset of the recorded request, and the skb's len,
++ * data_len and truesize grow by the request size.
++ */
++static void netbk_fill_frags(struct xen_netbk *netbk, struct sk_buff *skb)
++{
++      struct skb_shared_info *shinfo = skb_shinfo(skb);
++      skb_frag_t *frag = shinfo->frags;
++      skb_frag_t *const stop = frag + shinfo->nr_frags;
++
++      for (; frag != stop; frag++) {
++              unsigned long slot = (unsigned long)frag->page;
++              struct netbk_tx_pending_inuse *inuse =
++                      &netbk->pending_inuse[slot];
++              netif_tx_request_t *req = &netbk->pending_tx_info[slot].req;
++
++              inuse->alloc_time = jiffies;
++              list_add_tail(&inuse->list, &netbk->pending_inuse_head);
++
++              frag->page = netbk->mmap_pages[slot];
++              frag->size = req->size;
++              frag->page_offset = req->offset;
++
++              skb->len += req->size;
++              skb->data_len += req->size;
++              skb->truesize += req->size;
++      }
++}
++
++/*
++ * Consume the chain of extra-info slots at netif->tx.req_cons, storing
++ * each one in @extras[] at position (type - 1).  req_cons is advanced
++ * past every slot that is read, including an invalid one.
++ *
++ * Returns the remaining work_to_do count, -EBADR if the chain needs more
++ * slots than work_to_do allows, or -EINVAL for a type that is zero or
++ * not below XEN_NETIF_EXTRA_TYPE_MAX.
++ */
++int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
++                   int work_to_do)
++{
++      RING_IDX idx = netif->tx.req_cons;
++
++      for (;;) {
++              struct netif_extra_info cur;
++
++              if (unlikely(work_to_do <= 0)) {
++                      DPRINTK("Missing extra info\n");
++                      return -EBADR;
++              }
++              work_to_do--;
++
++              memcpy(&cur, RING_GET_REQUEST(&netif->tx, idx),
++                     sizeof(cur));
++              /* The slot is consumed whether or not it proves valid. */
++              netif->tx.req_cons = ++idx;
++
++              if (unlikely(!cur.type ||
++                           cur.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
++                      DPRINTK("Invalid extra type: %d\n", cur.type);
++                      return -EINVAL;
++              }
++
++              memcpy(&extras[cur.type - 1], &cur, sizeof(cur));
++
++              if (!(cur.flags & XEN_NETIF_EXTRA_FLAG_MORE))
++                      break;
++      }
++
++      return work_to_do;
++}
++
++/*
++ * Apply the GSO extra-info @gso to @skb.
++ *
++ * Returns 0 on success, or -EINVAL when the GSO size is zero or the type
++ * is anything other than XEN_NETIF_GSO_TYPE_TCPV4.
++ */
++static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
++{
++      struct skb_shared_info *shinfo = skb_shinfo(skb);
++
++      if (gso->u.gso.size == 0) {
++              DPRINTK("GSO size must not be zero.\n");
++              return -EINVAL;
++      }
++
++      /* Currently only TCPv4 S.O. is supported. */
++      if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
++              DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
++              return -EINVAL;
++      }
++
++      shinfo->gso_size = gso->u.gso.size;
++      /* Header must be checked, and gso_segs computed. */
++      shinfo->gso_type = SKB_GSO_TCPV4 | SKB_GSO_DODGY;
++      shinfo->gso_segs = 0;
++
++      return 0;
++}
++
++/* Called after netfront has transmitted */
++static void net_tx_action(unsigned long group)
++{
++      struct xen_netbk *netbk = &xen_netbk[group];
++      struct sk_buff *skb;
++      netif_t *netif;
++      netif_tx_request_t txreq;
++      netif_tx_request_t txfrags[MAX_SKB_FRAGS];
++      struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
++      u16 pending_idx;
++      RING_IDX i;
++      gnttab_map_grant_ref_t *mop;
++      unsigned int data_len;
++      int ret, work_to_do;
++
++      net_tx_action_dealloc(netbk);
++
++      mop = netbk->tx_map_ops;
++      BUILD_BUG_ON(MAX_SKB_FRAGS >= MAX_PENDING_REQS);
++      while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
++             !list_empty(&netbk->schedule_list)) {
++              /* Get a netif from the list with work to do. */
++              netif = poll_net_schedule_list(netbk);
++              if (!netif)
++                      continue;
++
++              RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
++              if (!work_to_do) {
++                      netif_put(netif);
++                      continue;
++              }
++
++              i = netif->tx.req_cons;
++              rmb(); /* Ensure that we see the request before we copy it. */
++              memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
++
++              /* Credit-based scheduling. */
++              if (txreq.size > netif->remaining_credit) {
++                      unsigned long now = jiffies;
++                      unsigned long next_credit = 
++                              netif->credit_timeout.expires +
++                              msecs_to_jiffies(netif->credit_usec / 1000);
++
++                      /* Timer could already be pending in rare cases. */
++                      if (timer_pending(&netif->credit_timeout)) {
++                              netif_put(netif);
++                              continue;
++                      }
++
++                      /* Passed the point where we can replenish credit? */
++                      if (time_after_eq(now, next_credit)) {
++                              netif->credit_timeout.expires = now;
++                              tx_add_credit(netif);
++                      }
++
++                      /* Still too big to send right now? Set a callback. */
++                      if (txreq.size > netif->remaining_credit) {
++                              netif->credit_timeout.data     =
++                                      (unsigned long)netif;
++                              netif->credit_timeout.function =
++                                      tx_credit_callback;
++                              mod_timer(&netif->credit_timeout, next_credit);
++                              netif_put(netif);
++                              continue;
++                      }
++              }
++              netif->remaining_credit -= txreq.size;
++
++              work_to_do--;
++              netif->tx.req_cons = ++i;
++
++              memset(extras, 0, sizeof(extras));
++              if (txreq.flags & XEN_NETTXF_extra_info) {
++                      work_to_do = netbk_get_extras(netif, extras,
++                                                    work_to_do);
++                      i = netif->tx.req_cons;
++                      if (unlikely(work_to_do < 0)) {
++                              netbk_tx_err(netif, &txreq, i);
++                              continue;
++                      }
++              }
++
++              ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
++              if (unlikely(ret < 0)) {
++                      netbk_tx_err(netif, &txreq, i - ret);
++                      continue;
++              }
++              i += ret;
++
++              if (unlikely(txreq.size < ETH_HLEN)) {
++                      DPRINTK("Bad packet size: %d\n", txreq.size);
++                      netbk_tx_err(netif, &txreq, i);
++                      continue;
++              }
++
++              /* No crossing a page as the payload mustn't fragment. */
++              if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
++                      DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", 
++                              txreq.offset, txreq.size, 
++                              (txreq.offset &~PAGE_MASK) + txreq.size);
++                      netbk_tx_err(netif, &txreq, i);
++                      continue;
++              }
++
++              pending_idx = netbk->pending_ring[MASK_PEND_IDX(netbk->pending_cons)];
++
++              data_len = (txreq.size > PKT_PROT_LEN &&
++                          ret < MAX_SKB_FRAGS) ?
++                      PKT_PROT_LEN : txreq.size;
++
++              skb = alloc_skb(data_len + 16 + NET_IP_ALIGN,
++                              GFP_ATOMIC | __GFP_NOWARN);
++              if (unlikely(skb == NULL)) {
++                      DPRINTK("Can't allocate a skb in start_xmit.\n");
++                      netbk_tx_err(netif, &txreq, i);
++                      break;
++              }
++
++              /* Packets passed to netif_rx() must have some headroom. */
++              skb_reserve(skb, 16 + NET_IP_ALIGN);
++
++              if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
++                      struct netif_extra_info *gso;
++                      gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
++
++                      if (netbk_set_skb_gso(skb, gso)) {
++                              kfree_skb(skb);
++                              netbk_tx_err(netif, &txreq, i);
++                              continue;
++                      }
++              }
++
++              gnttab_set_map_op(mop, idx_to_kaddr(netbk, pending_idx),
++                                GNTMAP_host_map | GNTMAP_readonly,
++                                txreq.gref, netif->domid);
++              mop++;
++
++              memcpy(&netbk->pending_tx_info[pending_idx].req,
++                     &txreq, sizeof(txreq));
++              netbk->pending_tx_info[pending_idx].netif = netif;
++              *((u16 *)skb->data) = pending_idx;
++
++              __skb_put(skb, data_len);
++
++              skb_shinfo(skb)->nr_frags = ret;
++              if (data_len < txreq.size) {
++                      skb_shinfo(skb)->nr_frags++;
++                      skb_shinfo(skb)->frags[0].page =
++                              (void *)(unsigned long)pending_idx;
++              } else {
++                      /* Discriminate from any valid pending_idx value. */
++                      skb_shinfo(skb)->frags[0].page = (void *)~0UL;
++              }
++
++              __skb_queue_tail(&netbk->tx_queue, skb);
++
++              netbk->pending_cons++;
++
++              mop = netbk_get_requests(netif, skb, txfrags, mop);
++
++              netif->tx.req_cons = i;
++              netif_schedule_work(netif);
++
++              if ((mop - netbk->tx_map_ops) >= ARRAY_SIZE(netbk->tx_map_ops))
++                      break;
++      }
++
++      if (mop == netbk->tx_map_ops)
++              goto out;
++
++    /* NOTE: some maps may fail with GNTST_eagain, which could be successfully
++     * retried in the backend after a delay. However, we can also fail the tx
++     * req and let the frontend resend the relevant packet again. This is fine
++     * because it is unlikely that a network buffer will be paged out or shared,
++     * and therefore it is unlikely to fail with GNTST_eagain. */
++      ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++                                      netbk->tx_map_ops,
++                                      mop - netbk->tx_map_ops);
++      BUG_ON(ret);
++
++      mop = netbk->tx_map_ops;
++      while ((skb = __skb_dequeue(&netbk->tx_queue)) != NULL) {
++              struct net_device *dev;
++              netif_tx_request_t *txp;
++
++              pending_idx = *((u16 *)skb->data);
++              netif       = netbk->pending_tx_info[pending_idx].netif;
++              dev         = netif->dev;
++              txp         = &netbk->pending_tx_info[pending_idx].req;
++
++              /* Check the remap error code. */
++              if (unlikely(netbk_tx_check_mop(netbk, skb, &mop))) {
++                      DPRINTK("netback grant failed.\n");
++                      skb_shinfo(skb)->nr_frags = 0;
++                      kfree_skb(skb);
++                      dev->stats.rx_dropped++;
++                      continue;
++              }
++
++              data_len = skb->len;
++              memcpy(skb->data,
++                     (void *)(idx_to_kaddr(netbk, pending_idx)|txp->offset),
++                     data_len);
++              if (data_len < txp->size) {
++                      /* Append the packet payload as a fragment. */
++                      txp->offset += data_len;
++                      txp->size -= data_len;
++              } else {
++                      /* Schedule a response immediately. */
++                      netif_idx_release(netbk, pending_idx);
++              }
++
++              if (txp->flags & XEN_NETTXF_csum_blank)
++                      skb->ip_summed = CHECKSUM_PARTIAL;
++              else if (txp->flags & XEN_NETTXF_data_validated)
++                      skb->ip_summed = CHECKSUM_UNNECESSARY;
++              else
++                      skb->ip_summed = CHECKSUM_NONE;
++
++              netbk_fill_frags(netbk, skb);
++
++              /*
++               * If the initial fragment was < PKT_PROT_LEN then
++               * pull through some bytes from the other fragments to
++               * increase the linear region to PKT_PROT_LEN bytes.
++               */
++              if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
++                      int target = min_t(int, skb->len, PKT_PROT_LEN);
++                      __pskb_pull_tail(skb, target - skb_headlen(skb));
++              }
++
++              skb->protocol = eth_type_trans(skb, dev);
++
++              if (skb_checksum_setup(skb, &netif->rx_gso_csum_fixups)) {
++                      DPRINTK("Can't setup checksum in net_tx_action\n");
++                      kfree_skb(skb);
++                      continue;
++              }
++
++              if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
++                  unlikely(skb_linearize(skb))) {
++                      DPRINTK("Can't linearize skb in net_tx_action.\n");
++                      kfree_skb(skb);
++                      dev->stats.rx_errors++;
++                      continue;
++              }
++
++              dev->stats.rx_bytes += skb->len;
++              dev->stats.rx_packets++;
++
++              if (use_kthreads)
++                      netif_rx_ni(skb);
++              else
++                      netif_rx(skb);
++      }
++
++ out:
++      if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
++          !list_empty(&netbk->pending_inuse_head)) {
++              struct netbk_tx_pending_inuse *oldest;
++
++              oldest = list_entry(netbk->pending_inuse_head.next,
++                                  struct netbk_tx_pending_inuse, list);
++              mod_timer(&netbk->tx_pending_timer, oldest->alloc_time + HZ);
++      }
++}
++
++/*
++ * Queue pending_idx on the group's dealloc ring and kick the group via
++ * netbk_schedule().  The ring slot is filled and dealloc_prod is advanced
++ * under release_lock with local interrupts disabled.
++ */
++static void netif_idx_release(struct xen_netbk *netbk, u16 pending_idx)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&netbk->release_lock, flags);
++      netbk->dealloc_ring[MASK_PEND_IDX(netbk->dealloc_prod)] = pending_idx;
++      /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
++      smp_wmb();
++      netbk->dealloc_prod++;
++      spin_unlock_irqrestore(&netbk->release_lock, flags);
++
++      netbk_schedule(netbk);
++}
++
++/*
++ * Release hook for the foreign pages set up in netback_init().  Looks up
++ * the owning group and slot recorded for the page, sanity-checks them, and
++ * hands the slot to netif_idx_release().  Only order-0 pages are expected.
++ */
++static void netif_page_release(struct page *page, unsigned int order)
++{
++      const unsigned int slot = netif_page_index(page);
++      const unsigned int grp = netif_page_group(page);
++      struct xen_netbk *owner = xen_netbk + grp;
++
++      BUG_ON(order);
++      BUG_ON(grp >= netbk_nr_groups || slot >= MAX_PENDING_REQS);
++      BUG_ON(owner->mmap_pages[slot] != page);
++
++      netif_idx_release(owner, slot);
++}
++
++/*
++ * Event-channel interrupt handler for a backend interface: put the
++ * interface on its group's schedule list, kick TX processing and, if the
++ * interface can take more packets, wake its net device queue.
++ */
++irqreturn_t netif_be_int(int irq, void *dev_id)
++{
++      netif_t *netif = dev_id;
++      const unsigned int grp = GET_GROUP_INDEX(netif);
++
++      if (likely(grp < netbk_nr_groups)) {
++              add_to_net_schedule_list_tail(netif);
++              maybe_schedule_tx_action(grp);
++
++              if (netif_schedulable(netif) && !netbk_queue_full(netif))
++                      netif_wake_queue(netif->dev);
++
++              return IRQ_HANDLED;
++      }
++
++      /*
++       * Short of having a way to bind the IRQ in disabled mode
++       * (IRQ_NOAUTOEN), invocations that arrive before a group has been
++       * assigned have to be ignored.  Any out-of-range value other than
++       * UINT_MAX is a bug.
++       */
++      BUG_ON(grp != UINT_MAX);
++      return IRQ_HANDLED;
++}
++
++/*
++ * Post a TX response with status st for request txp on netif's TX ring.
++ * If the request carried extra info, the following slot is answered with
++ * XEN_NETIF_RSP_NULL.  Responses are pushed and the frontend is notified
++ * when the ring macros ask for it.
++ */
++static void make_tx_response(netif_t *netif,
++                           netif_tx_request_t *txp,
++                           s8       st)
++{
++      RING_IDX prod = netif->tx.rsp_prod_pvt;
++      netif_tx_response_t *rsp = RING_GET_RESPONSE(&netif->tx, prod);
++      int notify;
++
++      rsp->id     = txp->id;
++      rsp->status = st;
++      prod++;
++
++      if (txp->flags & XEN_NETTXF_extra_info) {
++              RING_GET_RESPONSE(&netif->tx, prod)->status =
++                      XEN_NETIF_RSP_NULL;
++              prod++;
++      }
++
++      netif->tx.rsp_prod_pvt = prod;
++      RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
++      if (notify)
++              notify_remote_via_irq(netif->irq);
++
++#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
++      /* All consumed requests answered: re-check the ring for new ones. */
++      if (prod == netif->tx.req_cons) {
++              int more_to_do;
++              RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
++              if (more_to_do)
++                      add_to_net_schedule_list_tail(netif);
++      }
++#endif
++}
++
++/*
++ * Fill in the next private RX response slot and advance rsp_prod_pvt.
++ * The status field carries size, unless st is negative, in which case it
++ * carries st.  Returns the response that was filled in; nothing is pushed
++ * to the frontend here.
++ */
++static netif_rx_response_t *make_rx_response(netif_t *netif,
++                                           u16      id,
++                                           s8       st,
++                                           u16      offset,
++                                           u16      size,
++                                           u16      flags)
++{
++      const RING_IDX slot = netif->rx.rsp_prod_pvt;
++      netif_rx_response_t *rsp = RING_GET_RESPONSE(&netif->rx, slot);
++
++      rsp->offset = offset;
++      rsp->flags  = flags;
++      rsp->id     = id;
++      rsp->status = (st < 0) ? (s16)st : (s16)size;
++
++      netif->rx.rsp_prod_pvt = slot + 1;
++
++      return rsp;
++}
++
++#ifdef NETBE_DEBUG_INTERRUPT
++/*
++ * Debug interrupt handler: dump the private and shared ring indices of
++ * every interface found on every group's schedule list.  Each list is
++ * walked with that group's schedule_list_lock held.
++ *
++ * Always returns IRQ_HANDLED.
++ */
++static irqreturn_t netif_be_dbg(int irq, void *dev_id)
++{
++      netif_t *netif;
++      unsigned int i = 0, group;
++
++      pr_alert("netif_schedule_list:\n");
++
++      for (group = 0; group < netbk_nr_groups; ++group) {
++              struct xen_netbk *netbk = &xen_netbk[group];
++
++              spin_lock_irq(&netbk->schedule_list_lock);
++
++              list_for_each_entry(netif, &netbk->schedule_list, list) {
++                      pr_alert(" %d: private(rx_req_cons=%08x "
++                               "rx_resp_prod=%08x\n", i,
++                               netif->rx.req_cons, netif->rx.rsp_prod_pvt);
++                      pr_alert("   tx_req_cons=%08x tx_resp_prod=%08x)\n",
++                               netif->tx.req_cons, netif->tx.rsp_prod_pvt);
++                      pr_alert("   shared(rx_req_prod=%08x "
++                               "rx_resp_prod=%08x\n",
++                               netif->rx.sring->req_prod,
++                               netif->rx.sring->rsp_prod);
++                      pr_alert("   rx_event=%08x tx_req_prod=%08x\n",
++                               netif->rx.sring->rsp_event,
++                               netif->tx.sring->req_prod);
++                      pr_alert("   tx_resp_prod=%08x, tx_event=%08x)\n",
++                               netif->tx.sring->rsp_prod,
++                               netif->tx.sring->rsp_event);
++                      i++;
++              }
++
++              /* Must name the very lock taken at the top of the loop. */
++              spin_unlock_irq(&netbk->schedule_list_lock);
++      }
++
++      pr_alert(" ** End of netif_schedule_list **\n");
++
++      return IRQ_HANDLED;
++}
++
++/* irqaction wrapping netif_be_dbg(); netback_init() binds it to VIRQ_DEBUG. */
++static struct irqaction netif_be_dbg_action = {
++      .handler = netif_be_dbg,
++      .flags   = IRQF_SHARED,
++      .name    = "net-be-dbg"
++};
++#endif
++
++/* Returns 1 when the group's rx_queue holds at least one skb, else 0. */
++static inline int rx_work_todo(struct xen_netbk *netbk)
++{
++      if (skb_queue_empty(&netbk->rx_queue))
++              return 0;
++
++      return 1;
++}
++
++/*
++ * Returns 1 when the group has TX-side work, else 0.  Work means any of:
++ *  - entries waiting on the dealloc ring,
++ *  - in delayed-copy mode, entries on the pending_inuse list,
++ *  - interfaces on the schedule list while there is room for at least
++ *    MAX_SKB_FRAGS more pending requests.
++ */
++static inline int tx_work_todo(struct xen_netbk *netbk)
++{
++      return netbk->dealloc_cons != netbk->dealloc_prod ||
++             (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
++              !list_empty(&netbk->pending_inuse_head)) ||
++             (nr_pending_reqs(netbk) + MAX_SKB_FRAGS < MAX_PENDING_REQS &&
++              !list_empty(&netbk->schedule_list));
++}
++
++/*
++ * Per-group worker thread.  index is the group number smuggled through the
++ * kthread data pointer.  Sleeps until there is RX or TX work (or a stop
++ * request), then runs the RX and TX actions for the group as needed.
++ * Returns 0 once asked to stop.
++ */
++static int netbk_action_thread(void *index)
++{
++      const unsigned long grp = (unsigned long)index;
++      struct xen_netbk *nb = xen_netbk + grp;
++
++      for (;;) {
++              if (kthread_should_stop())
++                      break;
++
++              wait_event_interruptible(nb->netbk_action_wq,
++                                       rx_work_todo(nb) ||
++                                       tx_work_todo(nb) ||
++                                       kthread_should_stop());
++              cond_resched();
++
++              if (rx_work_todo(nb))
++                      net_rx_action(grp);
++
++              if (tx_work_todo(nb))
++                      net_tx_action(grp);
++      }
++
++      return 0;
++}
++
++
++/*
++ * Module initialisation: size and allocate the xen_netbk group array, set
++ * up each group's queues, timers, locks, foreign pages and worker (a
++ * kthread or a pair of tasklets, depending on use_kthreads), select the
++ * skb copy mode and register with xenbus.
++ *
++ * Returns 0 on success, -ENODEV when not running on Xen, -ENOMEM on
++ * allocation failure, or the error reported by kthread_create().
++ */
++static int __init netback_init(void)
++{
++      unsigned int i, group;
++      int rc;
++      struct page *page;
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      /* Remember the preset group count (0 = none) for the warning below. */
++      group = netbk_nr_groups;
++      if (!netbk_nr_groups)
++              netbk_nr_groups = (num_online_cpus() + 1) / 2;
++      if (netbk_nr_groups > MAX_GROUPS)
++              netbk_nr_groups = MAX_GROUPS;
++
++      /* Halve the group count until the array allocation succeeds. */
++      do {
++              xen_netbk = vzalloc(netbk_nr_groups * sizeof(*xen_netbk));
++      } while (!xen_netbk && (netbk_nr_groups >>= 1));
++      if (!xen_netbk) {
++              pr_err("%s: out of memory\n", __func__);
++              return -ENOMEM;
++      }
++      if (group && netbk_nr_groups != group)
++              pr_warn("netback: only using %u (instead of %u) groups\n",
++                      netbk_nr_groups, group);
++
++      /* We can increase reservation by this much in net_rx_action(). */
++      balloon_update_driver_allowance(netbk_nr_groups * NET_RX_RING_SIZE);
++
++      for (group = 0; group < netbk_nr_groups; group++) {
++              struct xen_netbk *netbk = &xen_netbk[group];
++
++              skb_queue_head_init(&netbk->rx_queue);
++              skb_queue_head_init(&netbk->tx_queue);
++
++              init_timer(&netbk->net_timer);
++              netbk->net_timer.data = group;
++              netbk->net_timer.function = netbk_schedule_group;
++
++              init_timer(&netbk->tx_pending_timer);
++              netbk->tx_pending_timer.data = group;
++              netbk->tx_pending_timer.function = netbk_schedule_group;
++
++              netbk->pending_prod = MAX_PENDING_REQS;
++
++              INIT_LIST_HEAD(&netbk->pending_inuse_head);
++              INIT_LIST_HEAD(&netbk->schedule_list);
++
++              spin_lock_init(&netbk->schedule_list_lock);
++              spin_lock_init(&netbk->release_lock);
++
++              netbk->mmap_pages =
++                      alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
++              if (netbk->mmap_pages == NULL) {
++                      pr_err("%s: out of memory\n", __func__);
++                      rc = -ENOMEM;
++                      goto failed_init;
++              }
++
++              /* Tag every page with its (group, index) and release hook. */
++              for (i = 0; i < MAX_PENDING_REQS; i++) {
++                      page = netbk->mmap_pages[i];
++                      SetPageForeign(page, netif_page_release);
++                      netif_set_page_ext(page, group, i);
++                      netbk->pending_ring[i] = i;
++                      INIT_LIST_HEAD(&netbk->pending_inuse[i].list);
++              }
++
++              if (use_kthreads) {
++                      init_waitqueue_head(&netbk->netbk_action_wq);
++                      netbk->task = kthread_create(netbk_action_thread,
++                                                   (void *)(long)group,
++                                                   "netback/%u", group);
++
++                      if (IS_ERR(netbk->task)) {
++                              pr_err("netback: kthread_create() failed\n");
++                              rc = PTR_ERR(netbk->task);
++                              goto failed_init;
++                      }
++                      if (bind_threads)
++                              kthread_bind(netbk->task, group);
++                      wake_up_process(netbk->task);
++              } else {
++                      tasklet_init(&netbk->net_tx_tasklet, net_tx_action, group);
++                      tasklet_init(&netbk->net_rx_tasklet, net_rx_action, group);
++              }
++      }
++
++      /*
++       * With copy_skb requested, a zero-length unmap_and_replace call is
++       * issued; a non-zero result selects always-copy, otherwise
++       * delayed-copy is used.
++       */
++      netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
++      if (MODPARM_copy_skb) {
++              if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
++                                            NULL, 0))
++                      netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB;
++              else
++                      netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB;
++      }
++
++      netif_accel_init();
++
++      netif_xenbus_init();
++
++#ifdef NETBE_DEBUG_INTERRUPT
++      (void)bind_virq_to_irqaction(VIRQ_DEBUG,
++                                   0,
++                                   &netif_be_dbg_action);
++#endif
++
++      return 0;
++
++failed_init:
++      /* Unwind from the group that failed down to group 0. */
++      do {
++              struct xen_netbk *netbk = &xen_netbk[group];
++
++              if (use_kthreads && netbk->task && !IS_ERR(netbk->task))
++                      kthread_stop(netbk->task);
++              if (netbk->mmap_pages)
++                      free_empty_pages_and_pagevec(netbk->mmap_pages,
++                                                   MAX_PENDING_REQS);
++      } while (group--);
++      vfree(xen_netbk);
++      balloon_update_driver_allowance(-(long)netbk_nr_groups
++                                      * NET_RX_RING_SIZE);
++
++      return rc;
++}
++
++module_init(netback_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/netback/xenbus.c

index 0000000,0000000..8a27092

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netback/xenbus.c
@@@ -1,0 -1,0 +1,459 @@@
++/*  Xenbus code for netif backend
++    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
++    Copyright (C) 2005 XenSource Ltd
++
++    This program is free software; you can redistribute it and/or modify
++    it under the terms of the GNU General Public License as published by
++    the Free Software Foundation; either version 2 of the License, or
++    (at your option) any later version.
++
++    This program is distributed in the hope that it will be useful,
++    but WITHOUT ANY WARRANTY; without even the implied warranty of
++    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++    GNU General Public License for more details.
++
++    You should have received a copy of the GNU General Public License
++    along with this program; if not, write to the Free Software
++    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++*/
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <linux/rwsem.h>
++#include <xen/xenbus.h>
++#include "common.h"
++
++#if 0
++#undef DPRINTK
++#define DPRINTK(fmt, args...) \
++    printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
++#endif
++
++static DECLARE_RWSEM(teardown_sem);
++
++static int connect_rings(struct backend_info *);
++static void connect(struct backend_info *);
++static void backend_create_netif(struct backend_info *be);
++static void netback_disconnect(struct device *, int);
++
++/*
++ * xenbus remove callback: detach accelerators, disconnect the interface
++ * (clearing the device's drvdata) and free the backend_info.  Always
++ * returns 0.
++ */
++static int netback_remove(struct xenbus_device *dev)
++{
++      struct device *xbdev_dev = &dev->dev;
++      struct backend_info *info = dev_get_drvdata(xbdev_dev);
++
++      netback_remove_accelerators(info, dev);
++
++      netback_disconnect(xbdev_dev, 1);
++      kfree(info);
++
++      return 0;
++}
++
++/*
++ * Tear down the interface attached to this backend device, if there is
++ * one.  A KOBJ_OFFLINE uevent is sent first, before teardown_sem is taken.
++ * The disconnect itself, and the optional clearing of the device's drvdata
++ * (when clear is non-zero), happen with teardown_sem held for writing;
++ * netback_uevent() takes the same semaphore for reading.
++ */
++static void netback_disconnect(struct device *xbdev_dev, int clear)
++{
++      struct backend_info *be = dev_get_drvdata(xbdev_dev);
++
++      if (be->netif)
++              kobject_uevent(&xbdev_dev->kobj, KOBJ_OFFLINE);
++
++      down_write(&teardown_sem);
++      if (be->netif) {
++              netif_disconnect(be);
++              be->netif = NULL;
++      }
++      if (clear)
++              dev_set_drvdata(xbdev_dev, NULL);
++      up_write(&teardown_sem);
++}
++
++/**
++ * Entry point to this code when a new device is created.  Allocate the basic
++ * structures and switch to InitWait.
++ *
++ * Advertises feature-sg, feature-gso-tcpv4, feature-rx-copy and
++ * feature-rx-flip in a single xenbus transaction, which is restarted for
++ * as long as ending it returns -EAGAIN.  sg and gso are advertised as 0
++ * when netbk_copy_skb_mode is NETBK_ALWAYS_COPY_SKB, and as 1 otherwise.
++ * On any failure the backend is torn down again through netback_remove()
++ * and the error is returned; on success backend_create_netif() is called
++ * and 0 is returned.
++ */
++static int netback_probe(struct xenbus_device *dev,
++                       const struct xenbus_device_id *id)
++{
++      const char *message;
++      struct xenbus_transaction xbt;
++      int err;
++      int sg;
++      struct backend_info *be = kzalloc(sizeof(struct backend_info),
++                                        GFP_KERNEL);
++      if (!be) {
++              xenbus_dev_fatal(dev, -ENOMEM,
++                               "allocating backend structure");
++              return -ENOMEM;
++      }
++
++      be->dev = dev;
++      dev_set_drvdata(&dev->dev, be);
++
++      sg = 1;
++      if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
++              sg = 0;
++
++      do {
++              err = xenbus_transaction_start(&xbt);
++              if (err) {
++                      xenbus_dev_fatal(dev, err, "starting transaction");
++                      goto fail;
++              }
++
++              err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
++              if (err) {
++                      message = "writing feature-sg";
++                      goto abort_transaction;
++              }
++
++              err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
++                                  "%d", sg);
++              if (err) {
++                      message = "writing feature-gso-tcpv4";
++                      goto abort_transaction;
++              }
++
++              /* We support rx-copy path. */
++              err = xenbus_printf(xbt, dev->nodename,
++                                  "feature-rx-copy", "%d", 1);
++              if (err) {
++                      message = "writing feature-rx-copy";
++                      goto abort_transaction;
++              }
++
++              /*
++               * We don't support rx-flip path (except old guests who don't
++               * grok this feature flag).
++               */
++              err = xenbus_printf(xbt, dev->nodename,
++                                  "feature-rx-flip", "%d", 0);
++              if (err) {
++                      message = "writing feature-rx-flip";
++                      goto abort_transaction;
++              }
++
++              err = xenbus_transaction_end(xbt, 0);
++      } while (err == -EAGAIN);
++
++      if (err) {
++              xenbus_dev_fatal(dev, err, "completing transaction");
++              goto fail;
++      }
++
++      netback_probe_accelerators(be, dev);
++
++      err = xenbus_switch_state(dev, XenbusStateInitWait);
++      if (err)
++              goto fail;
++
++      /* This kicks hotplug scripts, so do it immediately. */
++      backend_create_netif(be);
++
++      return 0;
++
++abort_transaction:
++      xenbus_transaction_end(xbt, 1);
++      xenbus_dev_fatal(dev, err, "%s", message);
++fail:
++      DPRINTK("failed");
++      netback_remove(dev);
++      return err;
++}
++
++
++/**
++ * Handle the creation of the hotplug script environment.  We add the script
++ * and vif variables to the environment, for the benefit of the vif-* hotplug
++ * scripts.
++ *
++ * "script" is read from the backend's xenstore node; "vif" is only added
++ * while a netif is attached, which is checked with teardown_sem held for
++ * reading.
++ *
++ * Returns 0 on success, the xenbus_read() error if "script" cannot be
++ * read, or the add_uevent_var() error if a variable does not fit into the
++ * environment.
++ */
++static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env)
++{
++      struct backend_info *be;
++      char *val;
++      int err;
++
++      DPRINTK("netback_uevent");
++
++      val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
++      if (IS_ERR(val)) {
++              err = PTR_ERR(val);
++              xenbus_dev_fatal(xdev, err, "reading script");
++              return err;
++      }
++
++      /* Do not silently emit an event that lacks the script variable. */
++      err = add_uevent_var(env, "script=%s", val);
++      kfree(val);
++      if (err)
++              return err;
++
++      down_read(&teardown_sem);
++      be = dev_get_drvdata(&xdev->dev);
++      if (be && be->netif)
++              err = add_uevent_var(env, "vif=%s", be->netif->dev->name);
++      up_read(&teardown_sem);
++
++      return err;
++}
++
++
++/*
++ * Create the netif for this backend unless it already has one.  The
++ * interface handle is read from the backend's xenstore node.  Failures are
++ * reported through xenbus_dev_fatal() and leave be->netif unset; on
++ * success a KOBJ_ONLINE uevent is emitted.
++ */
++static void backend_create_netif(struct backend_info *be)
++{
++      struct xenbus_device *xbdev = be->dev;
++      netif_t *vif;
++      long handle;
++      int rc;
++
++      /* Nothing to do when the interface already exists. */
++      if (be->netif)
++              return;
++
++      rc = xenbus_scanf(XBT_NIL, xbdev->nodename, "handle", "%li", &handle);
++      if (rc != 1) {
++              xenbus_dev_fatal(xbdev, rc, "reading handle");
++              return;
++      }
++
++      vif = netif_alloc(&xbdev->dev, xbdev->otherend_id, handle);
++      if (IS_ERR(vif)) {
++              rc = PTR_ERR(vif);
++              xenbus_dev_fatal(xbdev, rc, "creating interface");
++              return;
++      }
++
++      be->netif = vif;
++      kobject_uevent(&xbdev->dev.kobj, KOBJ_ONLINE);
++}
++
++
++/**
++ * Callback received when the frontend's state changes.
++ *
++ * Records the new state in be->frontend_state and reacts to it:
++ * Initialising while we are Closed re-arms the backend (InitWait);
++ * Connected creates the netif if needed and connects it; Closing
++ * disconnects and mirrors the state; Closed mirrors the state and, unless
++ * the device is still online, falls through to Unknown, which unregisters
++ * the device.  Any other state is reported via xenbus_dev_fatal().
++ */
++static void frontend_changed(struct xenbus_device *dev,
++                           enum xenbus_state frontend_state)
++{
++      struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++      DPRINTK("%s", xenbus_strstate(frontend_state));
++
++      be->frontend_state = frontend_state;
++
++      switch (frontend_state) {
++      case XenbusStateInitialising:
++              if (dev->state == XenbusStateClosed) {
++                      pr_info("%s: %s: prepare for reconnect\n",
++                              __FUNCTION__, dev->nodename);
++                      xenbus_switch_state(dev, XenbusStateInitWait);
++              }
++              break;
++
++      case XenbusStateInitialised:
++              break;
++
++      case XenbusStateConnected:
++              if (dev->state == XenbusStateConnected)
++                      break;
++
++              /* backend_create_netif() is idempotent */
++              backend_create_netif(be);
++              if (be->netif)
++                      connect(be);
++              break;
++
++      case XenbusStateClosing:
++              netback_disconnect(&dev->dev, 0);
++              xenbus_switch_state(dev, XenbusStateClosing);
++              break;
++
++      case XenbusStateClosed:
++              xenbus_switch_state(dev, XenbusStateClosed);
++              if (xenbus_dev_is_online(dev))
++                      break;
++              /* fall through if not online */
++      case XenbusStateUnknown:
++              /* implies netback_disconnect() via netback_remove() */
++              device_unregister(&dev->dev);
++              break;
++
++      default:
++              xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++                               frontend_state);
++              break;
++      }
++}
++
++
++/*
++ * Read the optional "rate" key ("<bytes>,<usec>", both decimal) from the
++ * backend's xenstore node into *bytes and *usec.  When the key is absent
++ * or malformed the outputs keep their defaults of ~0UL bytes and 0 usec,
++ * i.e. unlimited bandwidth; a malformed value additionally logs a warning.
++ */
++static void xen_net_read_rate(struct xenbus_device *dev,
++                            unsigned long *bytes, unsigned long *usec)
++{
++      unsigned long nbytes, period;
++      char *str, *pos, *end;
++      int parsed = 0;
++
++      /* Default to unlimited bandwidth. */
++      *bytes = ~0UL;
++      *usec = 0;
++
++      str = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
++      if (IS_ERR(str))
++              return;
++
++      pos = str;
++      nbytes = simple_strtoul(pos, &end, 10);
++      if (end != pos && *end == ',') {
++              pos = end + 1;
++              period = simple_strtoul(pos, &end, 10);
++              if (end != pos && *end == '\0') {
++                      *bytes = nbytes;
++                      *usec = period;
++                      parsed = 1;
++              }
++      }
++
++      if (!parsed)
++              WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
++
++      kfree(str);
++}
++
++/*
++ * Read the "mac" key from the backend's xenstore node and parse it as
++ * ETH_ALEN colon-separated hexadecimal octets into mac[].
++ *
++ * Returns 0 on success, the xenbus_read() error if the key cannot be read,
++ * or -ENOENT if the string is malformed (mac[] may then be partially
++ * written).
++ */
++static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
++{
++      char *str, *pos, *end;
++      int octet;
++      int rc = 0;
++
++      str = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
++      if (IS_ERR(str))
++              return PTR_ERR(str);
++
++      pos = str;
++      for (octet = 0; octet < ETH_ALEN; octet++) {
++              /* Every octet is followed by ':' except the last one. */
++              const char sep = (octet == ETH_ALEN - 1) ? '\0' : ':';
++
++              mac[octet] = simple_strtoul(pos, &end, 16);
++              if (end == pos || *end != sep) {
++                      rc = -ENOENT;
++                      break;
++              }
++              pos = end + 1;
++      }
++
++      kfree(str);
++      return rc;
++}
++
++/*
++ * Bring the backend to Connected: map the rings, read the frontend MAC
++ * address and the optional rate limit, switch the xenbus state and wake
++ * the net device queue.  Stops early if the rings cannot be connected or
++ * the MAC cannot be parsed.
++ */
++static void connect(struct backend_info *be)
++{
++      struct xenbus_device *xbdev = be->dev;
++      int rc;
++
++      if (connect_rings(be))
++              return;
++
++      rc = xen_net_read_mac(xbdev, be->netif->fe_dev_addr);
++      if (rc) {
++              xenbus_dev_fatal(xbdev, rc, "parsing %s/mac", xbdev->nodename);
++              return;
++      }
++
++      /* Start with a full credit bucket. */
++      xen_net_read_rate(xbdev, &be->netif->credit_bytes,
++                        &be->netif->credit_usec);
++      be->netif->remaining_credit = be->netif->credit_bytes;
++
++      xenbus_switch_state(xbdev, XenbusStateConnected);
++
++      netif_wake_queue(be->netif->dev);
++}
++
++
++/*
++ * Read the frontend's ring references, event channel and feature flags
++ * from its xenstore directory (dev->otherend), configure the netif
++ * accordingly and map the shared rings.
++ *
++ * Returns 0 on success, or the error from the failing step after it has
++ * been reported through xenbus_dev_fatal().
++ */
++static int connect_rings(struct backend_info *be)
++{
++      netif_t *netif = be->netif;
++      struct xenbus_device *dev = be->dev;
++      unsigned long tx_ring_ref, rx_ring_ref;
++      unsigned int evtchn, rx_copy;
++      int err;
++      int val;
++
++      DPRINTK("");
++
++      err = xenbus_gather(XBT_NIL, dev->otherend,
++                          "tx-ring-ref", "%lu", &tx_ring_ref,
++                          "rx-ring-ref", "%lu", &rx_ring_ref,
++                          "event-channel", "%u", &evtchn, NULL);
++      if (err) {
++              xenbus_dev_fatal(dev, err,
++                               "reading %s/ring-ref and event-channel",
++                               dev->otherend);
++              return err;
++      }
++
++      /* A missing request-rx-copy key is not an error: treat it as 0. */
++      err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
++                         &rx_copy);
++      if (err == -ENOENT) {
++              err = 0;
++              rx_copy = 0;
++      }
++      if (err < 0) {
++              xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
++                               dev->otherend);
++              return err;
++      }
++      netif->copying_receiver = !!rx_copy;
++
++      /* The optional feature keys below default to 0 when unreadable. */
++      if (netif->dev->tx_queue_len != 0) {
++              if (xenbus_scanf(XBT_NIL, dev->otherend,
++                               "feature-rx-notify", "%d", &val) < 0)
++                      val = 0;
++              if (val)
++                      netif->can_queue = 1;
++              else
++                      /* Must be non-zero for pfifo_fast to work. */
++                      netif->dev->tx_queue_len = 1;
++      }
++
++      if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
++              val = 0;
++      netif->can_sg = !!val;
++
++      if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
++                       &val) < 0)
++              val = 0;
++      netif->gso = !!val;
++
++      /* Inverted flag: csum is enabled unless the frontend opts out. */
++      if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
++                       "%d", &val) < 0)
++              val = 0;
++      netif->csum = !val;
++
++      /* Set dev->features */
++      netif_set_features(netif);
++
++      /* Map the shared frame, irq etc. */
++      err = netif_map(be, tx_ring_ref, rx_ring_ref, evtchn);
++      if (err) {
++              xenbus_dev_fatal(dev, err,
++                               "mapping shared-frames %lu/%lu port %u",
++                               tx_ring_ref, rx_ring_ref, evtchn);
++              return err;
++      }
++      return 0;
++}
++
++
++/* ** Driver Registration ** */
++
++
++/*
++ * Device types claimed by this backend: "vif".  The trailing empty entry
++ * presumably terminates the table (confirm against the xenbus core).
++ */
++static const struct xenbus_device_id netback_ids[] = {
++      { "vif" },
++      { "" }
++};
++
++
++/* xenbus backend driver descriptor; registered by netif_xenbus_init(). */
++static struct xenbus_driver netback = {
++      .name = "vif",
++      .ids = netback_ids,
++      .probe = netback_probe,
++      .remove = netback_remove,
++      .uevent = netback_uevent,
++      .otherend_changed = frontend_changed,
++};
++
++
++/*
++ * Register the netback xenbus backend driver.  Registration failure is
++ * treated as fatal (BUG).
++ */
++void netif_xenbus_init(void)
++{
++      int rc = xenbus_register_backend(&netback);
++
++      if (rc)
++              BUG();
++}
diff --cc drivers/xen/netfront/Makefile

index 0000000,0000000..9c0c6ad

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netfront/Makefile
@@@ -1,0 -1,0 +1,4 @@@
++
++obj-$(CONFIG_XEN_NETDEV_FRONTEND)     := xennet.o
++
++xennet-objs := netfront.o accel.o
diff --cc drivers/xen/netfront/accel.c

index 0000000,0000000..4e1e854

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netfront/accel.c
@@@ -1,0 -1,0 +1,827 @@@
++/******************************************************************************
++ * Virtual network driver for conversing with remote driver backends.
++ *
++ * Copyright (C) 2007 Solarflare Communications, Inc.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/version.h>
++#include <linux/netdevice.h>
++#include <linux/skbuff.h>
++#include <linux/list.h>
++#include <linux/mutex.h>
++#include <asm/hypervisor.h>
++#include <xen/xenbus.h>
++
++#include "netfront.h"
++/*
++ * Logging helpers that tag every message with "netfront/accel".
++ * DPRINTK (debug level) additionally records the calling function and
++ * line number; IPRINTK and WPRINTK log at info and warning level.
++ */
++#define DPRINTK(fmt, args...)                         \
++      pr_debug("netfront/accel (%s:%d) " fmt,         \
++             __FUNCTION__, __LINE__, ##args)
++#define IPRINTK(fmt, args...) pr_info("netfront/accel: " fmt, ##args)
++#define WPRINTK(fmt, args...) pr_warning("netfront/accel: " fmt, ##args)
++
++static int netfront_remove_accelerator(struct netfront_info *np,
++                                     struct xenbus_device *dev);
++static int netfront_load_accelerator(struct netfront_info *np, 
++                                   struct xenbus_device *dev, 
++                                   const char *frontend);
++
++static void netfront_accelerator_remove_watch(struct netfront_info *np);
++
++/*
++ * List of all netfront accelerator plugin modules available.  Each
++ * list entry is of type struct netfront_accelerator.
++ */ 
++static struct list_head accelerators_list;
++
++/* Workqueue to process acceleration configuration changes */
++struct workqueue_struct *accel_watch_workqueue;
++
++/* Mutex to prevent concurrent loads and suspends, etc. */
++DEFINE_MUTEX(accelerator_mutex);
++
++/*
++ * Set up the accelerator bookkeeping: the "net_accel" workqueue used
++ * to process configuration changes and the (initially empty) list of
++ * known accelerator plugins.
++ *
++ * NOTE(review): the create_workqueue() result is not checked here.
++ */
++void netif_init_accel(void)
++{
++      accel_watch_workqueue = create_workqueue("net_accel");
++
++      INIT_LIST_HEAD(&accelerators_list);
++}
++
++/*
++ * Tear down the accelerator bookkeeping: drain and destroy the watch
++ * workqueue, then free every entry on the accelerators list together
++ * with its name string.  Each accelerator must have no vifs attached
++ * by this point (BUG otherwise).
++ */
++void netif_exit_accel(void)
++{
++      struct netfront_accelerator *accel, *next;
++
++      flush_workqueue(accel_watch_workqueue);
++      destroy_workqueue(accel_watch_workqueue);
++
++      /* Everything else should be quiet by now, so no lock is taken */
++      list_for_each_entry_safe(accel, next, &accelerators_list, link) {
++              BUG_ON(!list_empty(&accel->vif_states));
++
++              list_del(&accel->link);
++              kfree(accel->frontend);
++              kfree(accel);
++      }
++}
++
++
++/*
++ * Workqueue handler run when the backend's "accel-frontend" xenstore
++ * node may have changed.  Reads the currently configured accelerator
++ * name and, under accelerator_mutex, loads, swaps or removes the
++ * accelerator for this vif so that it matches the configuration.  If
++ * a load was started successfully, the plugin module is requested
++ * once the mutex has been dropped.
++ *
++ * context: the work item (2.6.20 and later) or the vif state itself
++ *          (older kernels); either way it identifies the
++ *          netfront_accel_vif_state to operate on.
++ */
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++static void accel_watch_work(struct work_struct *context)
++#else
++static void accel_watch_work(void *context)
++#endif
++{
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++      struct netfront_accel_vif_state *vif_state =
++              container_of(context, struct netfront_accel_vif_state,
++                           accel_work);
++#else
++      struct netfront_accel_vif_state *vif_state =
++              (struct netfront_accel_vif_state *)context;
++#endif
++      struct netfront_info *np = vif_state->np;
++      char *accel_frontend;
++      int accel_len, rc = -1;
++
++      mutex_lock(&accelerator_mutex);
++
++      accel_frontend = xenbus_read(XBT_NIL, np->xbdev->otherend,
++                                   "accel-frontend", &accel_len);
++      if (IS_ERR(accel_frontend)) {
++              /* Node could not be read: drop any accelerator in use */
++              accel_frontend = NULL;
++              netfront_remove_accelerator(np, np->xbdev);
++      } else if (vif_state->accel_frontend == NULL) {
++              /* First time through: request the accelerator */
++              rc = netfront_load_accelerator(np, np->xbdev, accel_frontend);
++      } else if (strcmp(vif_state->accel_frontend, accel_frontend) != 0) {
++              /*
++               * The name changed, so swap accelerators.  The whole
++               * string is compared: a comparison limited to the new
++               * name's length would miss a change to a name that is
++               * a prefix of the old one.
++               */
++              netfront_remove_accelerator(np, np->xbdev);
++              rc = netfront_load_accelerator(np, np->xbdev, accel_frontend);
++      }
++
++      /*
++       * Replace the remembered name with the new one (possibly NULL).
++       * kfree() ignores NULL, so the old value needs no check.
++       */
++      kfree(vif_state->accel_frontend);
++      vif_state->accel_frontend = accel_frontend;
++
++      mutex_unlock(&accelerator_mutex);
++
++      if (rc == 0) {
++              DPRINTK("requesting module %s\n", accel_frontend);
++              request_module("%s", accel_frontend);
++              /*
++               * Module should now call netfront_accelerator_loaded() once
++               * it's up and running, and we can continue from there
++               */
++      }
++}
++
++
++/*
++ * xenbus watch callback.  Does no work itself: it looks up the vif
++ * state that embeds the watch and queues its accel_work item on the
++ * accelerator workqueue.  vec and len are not used.
++ */
++static void accel_watch_changed(struct xenbus_watch *watch,
++                              const char **vec, unsigned int len)
++{
++      struct netfront_accel_vif_state *state;
++
++      state = container_of(watch, struct netfront_accel_vif_state,
++                           accel_watch);
++      queue_work(accel_watch_workqueue, &state->accel_work);
++}
++
++
++/*
++ * (Re-)register the xenbus watch on the backend's "accel-frontend"
++ * node for this vif.  Any watch left over from earlier, e.g. from
++ * before a suspend/resume, is removed first.  On failure the watch's
++ * node pointer is cleared, which is how the rest of this file tells
++ * that no watch is registered.
++ */
++void netfront_accelerator_add_watch(struct netfront_info *np)
++{
++      struct netfront_accel_vif_state *vif_state = &np->accel_vif_state;
++      int rc;
++
++      /* Drop any stale watch before installing a fresh one */
++      netfront_accelerator_remove_watch(np);
++
++      /* Get a watch on the accelerator plugin */
++      rc = xenbus_watch_path2(np->xbdev, np->xbdev->otherend,
++                              "accel-frontend",
++                              &vif_state->accel_watch,
++                              accel_watch_changed);
++      if (rc == 0)
++              return;
++
++      DPRINTK("%s: Failed to register accel watch: %d\n",
++              __FUNCTION__, rc);
++      vif_state->accel_watch.node = NULL;
++}
++
++
++/*
++ * Wait for any queued accelerator work to finish, then discard the
++ * accelerator name that the watch handler cached in the vif state.
++ */
++static void
++netfront_accelerator_purge_watch(struct netfront_accel_vif_state *vif_state)
++{
++      flush_workqueue(accel_watch_workqueue);
++
++      /* kfree() ignores NULL, so the name can be released unconditionally */
++      kfree(vif_state->accel_frontend);
++      vif_state->accel_frontend = NULL;
++}
++
++
++/*
++ * Unregister this vif's watch on the accelerator plugin node, if one
++ * is registered, and clean up the state the watch left behind.
++ */
++static
++void netfront_accelerator_remove_watch(struct netfront_info *np)
++{
++      struct netfront_accel_vif_state *state = &np->accel_vif_state;
++
++      /* A NULL node means there is no watch to get rid of */
++      if (state->accel_watch.node == NULL)
++              return;
++
++      unregister_xenbus_watch(&state->accel_watch);
++      kfree(state->accel_watch.node);
++      state->accel_watch.node = NULL;
++
++      netfront_accelerator_purge_watch(state);
++}
++
++
++/*
++ * Prepare the accelerator-related fields of a netfront device: no
++ * accelerator attached yet, back-pointers to the netfront state and
++ * the xenbus device recorded, and the watch work item initialised.
++ */
++void init_accelerator_vif(struct netfront_info *np,
++                        struct xenbus_device *dev)
++{
++      struct netfront_accel_vif_state *vif_state = &np->accel_vif_state;
++
++      np->accelerator = NULL;
++
++      /* These back-pointers are assumed never to change afterwards */
++      vif_state->np = np;
++      vif_state->dev = dev;
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++      INIT_WORK(&vif_state->accel_work, accel_watch_work);
++#else
++      INIT_WORK(&vif_state->accel_work, accel_watch_work, vif_state);
++#endif
++}
++
++
++/*
++ * Report whether a frontend description string names the given
++ * accelerator.  Returns 1 when the strings are identical and 0
++ * otherwise.  A unique numeric identifier per accelerator would
++ * ultimately be nicer than comparing strings.
++ */
++static int match_accelerator(const char *frontend,
++                           struct netfront_accelerator *accelerator)
++{
++      return !strcmp(frontend, accelerator->frontend);
++}
++
++
++/*
++ * Attach a frontend vif to an accelerator by recording the
++ * accelerator in the vif and linking the vif's state onto the
++ * accelerator's vif_states list.  Must be called with the accelerator
++ * mutex held.
++ */
++static void add_accelerator_vif(struct netfront_accelerator *accelerator,
++                              struct netfront_info *np)
++{
++      if (np->accelerator != NULL) {
++              /*
++               * Already attached.  This can legitimately happen if
++               * suspend_cancel is called, but then the configuration
++               * must not have changed.
++               */
++              BUG_ON(np->accelerator != accelerator);
++              return;
++      }
++
++      np->accelerator = accelerator;
++      list_add(&np->accel_vif_state.link, &accelerator->vif_states);
++}
++
++
++/*
++ * Allocate and initialise the state used to track an accelerator
++ * plugin module, and add it to the global accelerators list.
++ *
++ * frontend: name of the accelerator; a private copy is stored.
++ * result:   set to the new accelerator on success, untouched on error.
++ * hooks:    plugin hooks to record; may be NULL if the plugin has not
++ *           connected yet.
++ *
++ * Returns 0 on success or -ENOMEM if an allocation fails.
++ *
++ * Must be called with the accelerator mutex held.
++ */
++static int init_accelerator(const char *frontend,
++                          struct netfront_accelerator **result,
++                          struct netfront_accel_hooks *hooks)
++{
++      struct netfront_accelerator *accelerator;
++
++      accelerator = kmalloc(sizeof(*accelerator), GFP_KERNEL);
++      if (!accelerator) {
++              DPRINTK("no memory for accelerator\n");
++              return -ENOMEM;
++      }
++
++      /* kstrdup() sizes, allocates and copies the name in one step */
++      accelerator->frontend = kstrdup(frontend, GFP_KERNEL);
++      if (!accelerator->frontend) {
++              DPRINTK("no memory for accelerator\n");
++              kfree(accelerator);
++              return -ENOMEM;
++      }
++
++      INIT_LIST_HEAD(&accelerator->vif_states);
++      spin_lock_init(&accelerator->vif_states_lock);
++
++      accelerator->hooks = hooks;
++
++      list_add(&accelerator->link, &accelerators_list);
++
++      *result = accelerator;
++
++      return 0;
++}
++
++
++/*
++ * Copy the hooks held in the vif's accelerator into the per-vif
++ * state, with NAPI disabled and the netdev tx lock held around the
++ * update.
++ *
++ * Takes the vif_states_lock spinlock and may sleep.
++ */
++static void
++accelerator_set_vif_state_hooks(struct netfront_accel_vif_state *vif_state)
++{
++      struct netfront_info *np = vif_state->np;
++      struct netfront_accelerator *accel;
++      unsigned long irq_flags;
++
++      DPRINTK("%p\n",vif_state);
++
++      /* Keep the data path quiet while the hooks are switched over */
++      napi_disable(&np->napi);
++      netif_tx_lock_bh(np->netdev);
++
++      accel = np->accelerator;
++      spin_lock_irqsave(&accel->vif_states_lock, irq_flags);
++      vif_state->hooks = accel->hooks;
++      spin_unlock_irqrestore(&accel->vif_states_lock, irq_flags);
++
++      netif_tx_unlock_bh(np->netdev);
++      napi_enable(&np->napi);
++}
++
++
++/*
++ * Attach a vif to an accelerator and, if the accelerator already has
++ * hooks, offer the device to the plugin via new_device().  The hooks
++ * are wired into the vif state only when new_device() returns 0.
++ *
++ * Must be called with the accelerator mutex held.  Takes the
++ * vif_states_lock spinlock.
++ */
++static void accelerator_probe_new_vif(struct netfront_info *np,
++                                    struct xenbus_device *dev,
++                                    struct netfront_accelerator *accelerator)
++{
++      struct netfront_accel_hooks *plugin;
++
++      DPRINTK("\n");
++
++      /* Include this frontend device on the accelerator's list */
++      add_accelerator_vif(accelerator, np);
++
++      /* Without hooks there is no plugin to notify yet */
++      plugin = accelerator->hooks;
++      if (plugin == NULL)
++              return;
++
++      if (plugin->new_device(np->netdev, dev) != 0)
++              return;
++
++      accelerator_set_vif_state_hooks(&np->accel_vif_state);
++}
++
++
++/*
++ * Request that a particular netfront accelerator plugin is used for
++ * a vif, usually because the vif configuration names it.  If the
++ * accelerator is already on the list the vif is probed against it;
++ * otherwise a new entry with no hooks is created and the vif is
++ * attached to it.
++ *
++ * Returns 0 on success or a negative error from init_accelerator().
++ *
++ * Must be called with accelerator_mutex held.  Takes the
++ * vif_states_lock spinlock.
++ */
++static int netfront_load_accelerator(struct netfront_info *np,
++                                   struct xenbus_device *dev,
++                                   const char *frontend)
++{
++      struct netfront_accelerator *accel;
++      int rc;
++
++      DPRINTK(" %s\n", frontend);
++
++      /* Reuse the requested accelerator if it is already known */
++      list_for_each_entry(accel, &accelerators_list, link) {
++              if (!match_accelerator(frontend, accel))
++                      continue;
++
++              accelerator_probe_new_vif(np, dev, accel);
++              return 0;
++      }
++
++      /* Not found: create a placeholder entry without hooks */
++      rc = init_accelerator(frontend, &accel, NULL);
++      if (rc < 0)
++              return rc;
++
++      /* Include this frontend device on the accelerator's list */
++      add_accelerator_vif(accel, np);
++
++      return rc;
++}
++
++
++/*
++ * Record the plugin's hooks in the accelerator and offer every vif
++ * on the accelerator's list to the plugin via new_device().  Vifs
++ * for which new_device() returns 0 get the hooks wired into their
++ * state.  Called when an accelerator plugin module connects to
++ * netfront.
++ *
++ * Must be called with accelerator_mutex held.  Takes the
++ * vif_states_lock spinlock.
++ */
++static void
++accelerator_probe_vifs(struct netfront_accelerator *accelerator,
++                     struct netfront_accel_hooks *hooks)
++{
++      struct netfront_accel_vif_state *state, *next;
++
++      DPRINTK("%p\n", accelerator);
++
++      /*
++       * Keep the hooks for later probes of new devices, and for
++       * wiring into each vif_state once the plugin accepts the vif
++       */
++      BUG_ON(hooks == NULL);
++      accelerator->hooks = hooks;
++
++      /* The accelerator mutex protects the list while we walk it */
++      list_for_each_entry_safe(state, next, &accelerator->vif_states,
++                               link) {
++              if (hooks->new_device(state->np->netdev, state->dev) != 0)
++                      continue;
++
++              accelerator_set_vif_state_hooks(state);
++      }
++}
++
++
++/*
++ * Called by the netfront accelerator plugin module when it has
++ * loaded.
++ *
++ * version:  accelerator interface version the plugin was built for.
++ * frontend: name identifying the accelerator.
++ * hooks:    the plugin's hook table.
++ *
++ * Returns 0 on success; -EINVAL in the initial Xen domain;
++ * NETFRONT_ACCEL_VERSION (positive) if the caller's version is newer
++ * than ours, leaving the caller to decide whether to retry with a
++ * lower version; -EPROTO if the caller's version is older; or a
++ * negative error if the accelerator state could not be allocated.
++ *
++ * Takes the accelerator_mutex and vif_states_lock spinlock.
++ */
++int netfront_accelerator_loaded(int version, const char *frontend,
++                              struct netfront_accel_hooks *hooks)
++{
++      struct netfront_accelerator *accelerator;
++      int rc = 0;
++
++      if (is_initial_xendomain())
++              return -EINVAL;
++
++      if (version != NETFRONT_ACCEL_VERSION) {
++              if (version > NETFRONT_ACCEL_VERSION) {
++                      /* Caller has higher version number, leave it
++                         up to them to decide whether to continue.
++                         They can re-call with a lower number if
++                         they're happy to be compatible with us */
++                      return NETFRONT_ACCEL_VERSION;
++              } else {
++                      /* We have a more recent version than caller.
++                         Currently reject, but may in future be able
++                         to be backwardly compatible */
++                      return -EPROTO;
++              }
++      }
++
++      mutex_lock(&accelerator_mutex);
++
++      /*
++       * Look through list of accelerators to see if it has already
++       * been requested
++       */
++      list_for_each_entry(accelerator, &accelerators_list, link) {
++              if (match_accelerator(frontend, accelerator)) {
++                      accelerator_probe_vifs(accelerator, hooks);
++                      goto out;
++              }
++      }
++
++      /*
++       * If it wasn't in the list, add it now so that when it is
++       * requested the caller will find it
++       */
++      DPRINTK("Couldn't find matching accelerator (%s)\n",
++              frontend);
++
++      /* Report an allocation failure instead of claiming success */
++      rc = init_accelerator(frontend, &accelerator, hooks);
++
++ out:
++      mutex_unlock(&accelerator_mutex);
++      return rc;
++}
++EXPORT_SYMBOL_GPL(netfront_accelerator_loaded);
++
++
++/*
++ * Clear the hooks pointer of a single vif state, with NAPI disabled
++ * and the netdev tx lock held around the update.
++ *
++ * Takes the vif_states_lock spinlock and may sleep.
++ */
++static void
++accelerator_remove_single_hook(struct netfront_accelerator *accelerator,
++                             struct netfront_accel_vif_state *vif_state)
++{
++      struct netfront_info *np = vif_state->np;
++      unsigned long irq_flags;
++
++      /* Keep the data path quiet while the hooks are removed */
++      napi_disable(&np->napi);
++      netif_tx_lock_bh(np->netdev);
++
++      spin_lock_irqsave(&accelerator->vif_states_lock, irq_flags);
++
++      /*
++       * Only the hooks go away.  The vif_state stays on the
++       * accelerator's list, which signifies that this vif still
++       * wants the accelerator should it become available again.
++       */
++      vif_state->hooks = NULL;
++
++      spin_unlock_irqrestore(&accelerator->vif_states_lock, irq_flags);
++
++      netif_tx_unlock_bh(np->netdev);
++      napi_enable(&np->napi);
++}
++
++
++/*
++ * Detach the plugin hooks from every vif that currently has them:
++ * collect final statistics, clear the vif's hooks and call the
++ * plugin's remove() hook for the device.  Finally the accelerator's
++ * own hooks pointer is cleared.
++ *
++ * Must be called with the accelerator mutex held.  Takes the
++ * vif_states_lock spinlock.
++ */
++static void accelerator_remove_hooks(struct netfront_accelerator *accelerator)
++{
++      struct netfront_accel_vif_state *state, *next;
++      unsigned long irq_flags;
++
++      /* The accelerator mutex protects the list while we walk it */
++      list_for_each_entry_safe(state, next, &accelerator->vif_states,
++                               link) {
++              /* Vifs without hooks are not being accelerated */
++              if (!state->hooks)
++                      continue;
++
++              spin_lock_irqsave(&accelerator->vif_states_lock, irq_flags);
++
++              /* Last chance to get statistics from the accelerator */
++              state->hooks->get_stats(state->np->netdev,
++                                      &state->np->netdev->stats);
++
++              spin_unlock_irqrestore(&accelerator->vif_states_lock,
++                                     irq_flags);
++
++              accelerator_remove_single_hook(accelerator, state);
++
++              accelerator->hooks->remove(state->dev);
++      }
++
++      accelerator->hooks = NULL;
++}
++
++
++/*
++ * Called by a netfront accelerator when it is unloaded.  Finds the
++ * accelerator by name and removes its hooks from every vif using it,
++ * so that the plugin is no longer called into once this returns.
++ * Nothing happens if no accelerator matches.
++ *
++ * Takes the accelerator mutex, and vif_states_lock spinlock.
++ */
++void netfront_accelerator_stop(const char *frontend)
++{
++      struct netfront_accelerator *accel;
++
++      mutex_lock(&accelerator_mutex);
++
++      list_for_each_entry(accel, &accelerators_list, link) {
++              if (!match_accelerator(frontend, accel))
++                      continue;
++
++              /* Only the first matching entry is processed */
++              accelerator_remove_hooks(accel);
++              break;
++      }
++
++      mutex_unlock(&accelerator_mutex);
++}
++EXPORT_SYMBOL_GPL(netfront_accelerator_stop);
++
++
++/*
++ * Helper for call_remove and do_suspend.  If the vif has hooks
++ * installed: collect final statistics, clear the vif's hooks and
++ * call the plugin's remove() hook.  Returns that hook's result, or 0
++ * when the vif had no hooks.
++ *
++ * Must be called with the accelerator mutex held.  Takes the
++ * vif_states_lock spinlock.
++ */
++static int do_remove(struct netfront_info *np, struct xenbus_device *dev)
++{
++      struct netfront_accel_vif_state *vif_state = &np->accel_vif_state;
++      struct netfront_accelerator *accel = np->accelerator;
++      unsigned long irq_flags;
++
++      /* Nothing to undo unless the vif currently has hooks */
++      if (!vif_state->hooks)
++              return 0;
++
++      spin_lock_irqsave(&accel->vif_states_lock, irq_flags);
++
++      /* Last chance to get statistics from the accelerator */
++      vif_state->hooks->get_stats(np->netdev, &np->netdev->stats);
++
++      spin_unlock_irqrestore(&accel->vif_states_lock, irq_flags);
++
++      /*
++       * Try and do the opposite of accelerator_probe_new_vif so
++       * that no state is left pointing back at the netdev
++       */
++      accelerator_remove_single_hook(accel, vif_state);
++
++      return accel->hooks->remove(dev);
++}
++
++
++/*
++ * Detach a vif from its accelerator, if it has one: unlink the vif
++ * state from the accelerator's list, run do_remove() and clear the
++ * vif's accelerator pointer.  Returns do_remove()'s result, or 0
++ * when the vif had no accelerator.
++ *
++ * Must be called with the accelerator mutex held.  Takes the
++ * vif_states_lock spinlock
++ */
++static int netfront_remove_accelerator(struct netfront_info *np,
++                                     struct xenbus_device *dev)
++{
++      struct netfront_accelerator *accel = np->accelerator;
++      struct netfront_accel_vif_state *pos;
++      int rc;
++
++      /* Check that we've got a device that was accelerated */
++      if (accel == NULL)
++              return 0;
++
++      /* Unlink the vif state only if it really is on the list */
++      list_for_each_entry(pos, &accel->vif_states, link) {
++              if (pos != &np->accel_vif_state)
++                      continue;
++
++              list_del(&pos->link);
++              break;
++      }
++
++      rc = do_remove(np, dev);
++      np->accelerator = NULL;
++
++      return rc;
++}
++
++
++/*
++ * Remove the accelerator watch for a vif and then detach the vif
++ * from its accelerator.  Returns the result of
++ * netfront_remove_accelerator().
++ *
++ * No lock pre-requisites.  Takes the accelerator mutex and the
++ * vif_states_lock spinlock.
++ */
++int netfront_accelerator_call_remove(struct netfront_info *np,
++                                   struct xenbus_device *dev)
++{
++      int ret;
++
++      /*
++       * Done before taking the mutex: removing the watch flushes the
++       * workqueue, and the work handler takes the same mutex.
++       */
++      netfront_accelerator_remove_watch(np);
++
++      mutex_lock(&accelerator_mutex);
++      ret = netfront_remove_accelerator(np, dev);
++      mutex_unlock(&accelerator_mutex);
++
++      return ret;
++}
++
++
++/*
++ * Suspend handling for an accelerated vif.  Runs do_remove() if the
++ * vif has an accelerator and returns its result; returns 0 otherwise.
++ *
++ * No lock pre-requisites.  Takes the accelerator mutex and the
++ * vif_states_lock spinlock.
++ */
++int netfront_accelerator_suspend(struct netfront_info *np,
++                               struct xenbus_device *dev)
++{
++      int ret = 0;
++
++      mutex_lock(&accelerator_mutex);
++
++      /*
++       * Only accelerated devices need work.  The remove hook is
++       * called, but the vif_state is left on the accelerator's list
++       * in case there is a suspend_cancel.
++       */
++      if (np->accelerator != NULL)
++              ret = do_remove(np, dev);
++
++      mutex_unlock(&accelerator_mutex);
++      return ret;
++}
++  
++  
++/*
++ * Undo a suspend that was cancelled: discard the cached accelerator
++ * name and, if the device is in the Connected state, queue the watch
++ * work so the configured accelerator is set up again.  Always
++ * returns 0.
++ */
++int netfront_accelerator_suspend_cancel(struct netfront_info *np,
++                                      struct xenbus_device *dev)
++{
++      netfront_accelerator_purge_watch(&np->accel_vif_state);
++
++      if (dev->state != XenbusStateConnected)
++              return 0;
++
++      /*
++       * Gratuitously fire the watch handler to reinstate the
++       * configured accelerator
++       */
++      queue_work(accel_watch_workqueue, &np->accel_vif_state.accel_work);
++
++      return 0;
++}
++
++
++/*
++ * Resume handling for an accelerated vif: if the vif's state is on
++ * its accelerator's list, unlink it and clear the vif's accelerator
++ * pointer.
++ *
++ * No lock pre-requisites.  Takes the accelerator mutex
++ */
++void netfront_accelerator_resume(struct netfront_info *np,
++                               struct xenbus_device *dev)
++{
++      struct netfront_accel_vif_state *pos;
++
++      mutex_lock(&accelerator_mutex);
++
++      /* Only devices that were accelerated need any work */
++      if (np->accelerator != NULL) {
++              /* Find the vif_state from the accelerator's list */
++              list_for_each_entry(pos, &np->accelerator->vif_states,
++                                  link) {
++                      if (pos->dev != dev)
++                              continue;
++
++                      BUG_ON(pos != &np->accel_vif_state);
++
++                      /*
++                       * Remove it from the accelerator's list so
++                       * state is consistent for probing new vifs
++                       * when they get connected
++                       */
++                      list_del(&pos->link);
++                      np->accelerator = NULL;
++                      break;
++              }
++      }
++
++      mutex_unlock(&accelerator_mutex);
++}
++
++
++/*
++ * Ask the accelerator plugin's check_ready() hook about the device.
++ * Returns the hook's result, or 1 when the vif has no hooks or no
++ * accelerator.
++ *
++ * No lock pre-requisites.  Takes the vif_states_lock spinlock
++ */
++int netfront_check_accelerator_queue_ready(struct net_device *dev,
++                                         struct netfront_info *np)
++{
++      struct netfront_accelerator *accel = np->accelerator;
++      unsigned long irq_flags;
++      int ready = 1;
++
++      /* Unlocked check first: skip the lock if there is nothing to call */
++      if (!np->accel_vif_state.hooks || !accel)
++              return ready;
++
++      spin_lock_irqsave(&accel->vif_states_lock, irq_flags);
++      /* Check again under the lock before calling the hook */
++      if (np->accel_vif_state.hooks && np->accelerator == accel)
++              ready = np->accel_vif_state.hooks->check_ready(dev);
++      spin_unlock_irqrestore(&accel->vif_states_lock, irq_flags);
++
++      return ready;
++}
++
++
++/*
++ * Call the accelerator plugin's stop_napi_irq() hook for the device,
++ * if the vif has hooks and an accelerator.
++ *
++ * No lock pre-requisites.  Takes the vif_states_lock spinlock
++ */
++void netfront_accelerator_call_stop_napi_irq(struct netfront_info *np,
++                                           struct net_device *dev)
++{
++      struct netfront_accelerator *accel = np->accelerator;
++      unsigned long irq_flags;
++
++      /* Unlocked check first: skip the lock if there is nothing to call */
++      if (!np->accel_vif_state.hooks || accel == NULL)
++              return;
++
++      spin_lock_irqsave(&accel->vif_states_lock, irq_flags);
++      /* Check again under the lock before calling the hook */
++      if (np->accel_vif_state.hooks && np->accelerator == accel)
++              np->accel_vif_state.hooks->stop_napi_irq(dev);
++      spin_unlock_irqrestore(&accel->vif_states_lock, irq_flags);
++}
++
++
++/*
++ * Call the accelerator plugin's get_stats() hook so it can update
++ * dev->stats.  Returns the hook's result, or 0 when the vif has no
++ * hooks or no accelerator.
++ *
++ * No lock pre-requisites.  Takes the vif_states_lock spinlock
++ */
++int netfront_accelerator_call_get_stats(struct net_device *dev)
++{
++      struct netfront_info *np = netdev_priv(dev);
++      struct netfront_accelerator *accel = np->accelerator;
++      unsigned long irq_flags;
++      int ret = 0;
++
++      /* Unlocked check first: skip the lock if there is nothing to call */
++      if (!np->accel_vif_state.hooks || accel == NULL)
++              return ret;
++
++      spin_lock_irqsave(&accel->vif_states_lock, irq_flags);
++      /* Check again under the lock before calling the hook */
++      if (np->accel_vif_state.hooks && np->accelerator == accel)
++              ret = np->accel_vif_state.hooks->get_stats(dev, &dev->stats);
++      spin_unlock_irqrestore(&accel->vif_states_lock, irq_flags);
++
++      return ret;
++}
++
diff --cc drivers/xen/netfront/netfront.c

index 0000000,0000000..f87a68d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netfront/netfront.c
@@@ -1,0 -1,0 +1,2230 @@@
++/******************************************************************************
++ * Virtual network driver for conversing with remote driver backends.
++ *
++ * Copyright (c) 2002-2005, K A Fraser
++ * Copyright (c) 2005, XenSource Ltd
++ * Copyright (C) 2007 Solarflare Communications, Inc.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/module.h>
++#include <linux/version.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/string.h>
++#include <linux/errno.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/etherdevice.h>
++#include <linux/skbuff.h>
++#include <linux/init.h>
++#include <linux/bitops.h>
++#include <linux/ethtool.h>
++#include <linux/in.h>
++#include <linux/if_ether.h>
++#include <linux/io.h>
++#include <linux/moduleparam.h>
++#include <net/sock.h>
++#include <net/pkt_sched.h>
++#include <net/route.h>
++#include <asm/uaccess.h>
++#include <xen/evtchn.h>
++#include <xen/xenbus.h>
++#include <xen/interface/io/netif.h>
++#include <xen/interface/memory.h>
++#include <xen/balloon.h>
++#include <asm/page.h>
++#include <asm/maddr.h>
++#include <asm/uaccess.h>
++#include <xen/interface/grant_table.h>
++#include <xen/gnttab.h>
++#include <xen/net-util.h>
++
++/*
++ * Driver-private per-skb data.  It is overlaid on the skb control buffer
++ * (skb->cb) via NETFRONT_SKB_CB() below.
++ */
++struct netfront_cb {
++      struct page *page;
++      unsigned offset;
++};
++
++/* View an skb's control buffer as a struct netfront_cb. */
++#define NETFRONT_SKB_CB(skb)  ((struct netfront_cb *)((skb)->cb))
++
++#include "netfront.h"
++
++/*
++ * Mutually-exclusive module options to select receive data path:
++ *  rx_copy : Packets are copied by network backend into local memory
++ *  rx_flip : Page containing packet data is transferred to our ownership
++ * For fully-virtualised guests there is no option - copying must be used.
++ * For paravirtualised guests, flipping is the default.
++ */
++#ifdef CONFIG_XEN
++/* Both default to off and are exposed as the rx_copy / rx_flip parameters. */
++static int MODPARM_rx_copy = 0;
++module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
++MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
++static int MODPARM_rx_flip = 0;
++module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
++MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
++#else
++/* Without CONFIG_XEN the choice is fixed at build time: copy on, flip off. */
++static const int MODPARM_rx_copy = 1;
++static const int MODPARM_rx_flip = 0;
++#endif
++
++#define RX_COPY_THRESHOLD 256
++
++/* If we don't have GSO, fake things up so that we never try to use it. */
++#if defined(NETIF_F_GSO)
++/* Kernel provides GSO: advertise GSO, TSO and checksum offload. */
++#define HAVE_GSO                      1
++#define HAVE_TSO                      1 /* TSO is a subset of GSO */
++#define HAVE_CSUM_OFFLOAD             1
++static inline void dev_disable_gso_features(struct net_device *dev)
++{
++      /* Turn off all GSO bits except ROBUST. */
++      dev->features &= ~NETIF_F_GSO_MASK;
++      dev->features |= NETIF_F_GSO_ROBUST;
++}
++#elif defined(NETIF_F_TSO)
++/* TSO-only kernel: supply the GSO-named helpers in terms of TSO. */
++#define HAVE_GSO                     0
++#define HAVE_TSO                       1
++
++/* Some older kernels cannot cope with incorrect checksums,
++ * particularly in netfilter. I'm not sure there is 100% correlation
++ * with the presence of NETIF_F_TSO but it appears to be a good first
++ * approximation.
++ */
++#define HAVE_CSUM_OFFLOAD              0
++
++/* Map the GSO field names onto their TSO-era equivalents. */
++#define gso_size tso_size
++#define gso_segs tso_segs
++static inline void dev_disable_gso_features(struct net_device *dev)
++{
++       /* Turn off all TSO bits. */
++       dev->features &= ~NETIF_F_TSO;
++}
++/* Non-zero when the skb carries a TSO segment size. */
++static inline int skb_is_gso(const struct sk_buff *skb)
++{
++        return skb_shinfo(skb)->tso_size;
++}
++/* Non-zero when the given feature mask includes NETIF_F_TSO. */
++static inline int skb_gso_ok(struct sk_buff *skb, int features)
++{
++        return (features & NETIF_F_TSO);
++}
++
++#define netif_skb_features(skb) ((skb)->dev->features)
++/*
++ * True for a segmented skb that either cannot be offloaded with the given
++ * features or does not have ip_summed == CHECKSUM_PARTIAL.
++ */
++static inline int netif_needs_gso(struct sk_buff *skb, int features)
++{
++        return skb_is_gso(skb) &&
++               (!skb_gso_ok(skb, features) ||
++                unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
++}
++#else
++/* Neither GSO nor TSO available: stub everything out. */
++#define HAVE_GSO                      0
++#define HAVE_TSO                      0
++#define HAVE_CSUM_OFFLOAD             0
++#define netif_needs_gso(skb, feat)    0
++#define dev_disable_gso_features(dev) ((void)0)
++#define ethtool_op_set_tso(dev, data) (-ENOSYS)
++#endif
++
++#define GRANT_INVALID_REF     0
++
++/*
++ * One rx response bundled with storage for extra-info segments: one slot
++ * per extra type below XEN_NETIF_EXTRA_TYPE_MAX, less one.
++ * NOTE(review): presumably indexed by (extra type - 1) -- confirm against
++ * the receive path.
++ */
++struct netfront_rx_info {
++      struct netif_rx_response rx;
++      struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
++};
++
++/*
++ * Implement our own carrier flag: the network stack's version causes delays
++ * when the carrier is re-enabled (in particular, dev_activate() may not
++ * immediately be called, which can cause packet loss).
++ *
++ * These macros simply set, clear and read the 'carrier' member of the
++ * structure they are given.
++ */
++#define netfront_carrier_on(netif)    ((netif)->carrier = 1)
++#define netfront_carrier_off(netif)   ((netif)->carrier = 0)
++#define netfront_carrier_ok(netif)    ((netif)->carrier)
++
++/*
++ * Helpers for acquiring and freeing slots in tx_skbs[].
++ */
++
++/*
++ * Push slot 'id' onto the free list threaded through 'list': the slot
++ * inherits the old head (kept in list[0]) and list[0] then records 'id',
++ * stored as an integer disguised as a pointer.
++ */
++static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
++{
++      struct sk_buff *old_head = list[0];
++
++      list[id] = old_head;
++      list[0]  = (void *)(unsigned long)id;
++}
++
++/*
++ * Pop the slot id at the head of the free list threaded through 'list':
++ * list[0] holds the current head id, and the popped slot holds the next.
++ */
++static inline unsigned short get_id_from_freelist(struct sk_buff **list)
++{
++      unsigned int head = (unsigned int)(unsigned long)list[0];
++      struct sk_buff *next = list[head];
++
++      list[0] = next;
++      return head;
++}
++
++/* Reduce a free-running rx ring index to a slot number in the rx arrays. */
++static inline int xennet_rxidx(RING_IDX idx)
++{
++      RING_IDX slot = idx & (NET_RX_RING_SIZE - 1);
++
++      return slot;
++}
++
++/*
++ * Take ownership of the skb stored for rx ring index 'ri': return it and
++ * clear its rx_skbs[] slot.
++ */
++static inline struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
++                                              RING_IDX ri)
++{
++      int slot = xennet_rxidx(ri);
++      struct sk_buff *taken = np->rx_skbs[slot];
++
++      np->rx_skbs[slot] = NULL;
++
++      return taken;
++}
++
++/*
++ * Take the grant reference stored for rx ring index 'ri': return it and
++ * reset its grant_rx_ref[] slot to GRANT_INVALID_REF.
++ */
++static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
++                                          RING_IDX ri)
++{
++      int slot = xennet_rxidx(ri);
++      grant_ref_t taken = np->grant_rx_ref[slot];
++
++      np->grant_rx_ref[slot] = GRANT_INVALID_REF;
++
++      return taken;
++}
++
++/*
++ * Logging helpers.  DPRINTK emits at debug level and prefixes the calling
++ * function name and line number; IPRINTK and WPRINTK emit at info and
++ * warning level with a plain "netfront: " prefix.
++ */
++#define DPRINTK(fmt, args...)                         \
++      pr_debug("netfront (%s:%d) " fmt,               \
++               __FUNCTION__, __LINE__, ##args)
++#define IPRINTK(fmt, args...) pr_info("netfront: " fmt, ##args)
++#define WPRINTK(fmt, args...) pr_warning("netfront: " fmt, ##args)
++
++static int setup_device(struct xenbus_device *, struct netfront_info *);
++static struct net_device *create_netdev(struct xenbus_device *);
++
++static void end_access(int, void *);
++static void netif_disconnect_backend(struct netfront_info *);
++
++static int network_connect(struct net_device *);
++static void network_tx_buf_gc(struct net_device *);
++static void network_alloc_rx_buffers(struct net_device *);
++
++static irqreturn_t netif_int(int irq, void *dev_id);
++
++#ifdef CONFIG_SYSFS
++static int xennet_sysfs_addif(struct net_device *netdev);
++static void xennet_sysfs_delif(struct net_device *netdev);
++#else /* !CONFIG_SYSFS */
++#define xennet_sysfs_addif(dev) (0)
++#define xennet_sysfs_delif(dev) do { } while(0)
++#endif
++
++/* Non-zero when the device currently has NETIF_F_SG set in its features. */
++static inline int xennet_can_sg(struct net_device *dev)
++{
++      return dev->features & NETIF_F_SG;
++}
++
++/*
++ * Work around net.ipv4.conf.*.arp_notify not being enabled by default:
++ * switch ARP_NOTIFY on for this interface's in_device (under the RTNL lock)
++ * and warn if the interface has no in_device.  Does nothing without
++ * CONFIG_INET.
++ */
++static void __devinit netfront_enable_arp_notify(struct netfront_info *info)
++{
++#ifdef CONFIG_INET
++      struct in_device *idev;
++
++      rtnl_lock();
++      idev = __in_dev_get_rtnl(info->netdev);
++      if (idev) {
++              if (!IN_DEV_CONF_GET(idev, ARP_NOTIFY))
++                      IN_DEV_CONF_SET(idev, ARP_NOTIFY, 1);
++      }
++      rtnl_unlock();
++
++      if (idev)
++              return;
++
++      printk(KERN_WARNING "Cannot enable ARP notification on %s\n",
++             info->xbdev->nodename);
++#endif
++}
++
++/**
++ * Entry point to this code when a new device is created.  Creates the
++ * net_device, registers it, enables ARP notification and adds the sysfs
++ * entries.  Returns 0 on success; on failure the net_device is freed, the
++ * driver data pointer is cleared and the error code is returned.
++ */
++static int __devinit netfront_probe(struct xenbus_device *dev,
++                                  const struct xenbus_device_id *id)
++{
++      struct net_device *netdev = create_netdev(dev);
++      struct netfront_info *info;
++      int err;
++
++      if (IS_ERR(netdev)) {
++              err = PTR_ERR(netdev);
++              xenbus_dev_fatal(dev, err, "creating netdev");
++              return err;
++      }
++
++      info = netdev_priv(netdev);
++      dev_set_drvdata(&dev->dev, info);
++
++      err = register_netdev(info->netdev);
++      if (err) {
++              pr_warning("%s: register_netdev err=%d\n",
++                         __FUNCTION__, err);
++      } else {
++              netfront_enable_arp_notify(info);
++
++              err = xennet_sysfs_addif(info->netdev);
++              if (!err)
++                      return 0;
++
++              /* Undo the registration before falling into the cleanup. */
++              unregister_netdev(info->netdev);
++              pr_warning("%s: add sysfs failed err=%d\n",
++                         __FUNCTION__, err);
++      }
++
++      /* Common failure cleanup. */
++      free_netdev(netdev);
++      dev_set_drvdata(&dev->dev, NULL);
++      return err;
++}
++
++/*
++ * Device removal: call the accelerator remove hook, disconnect from the
++ * backend, stop the rx refill timer, remove the sysfs entries, then
++ * unregister and free the net_device.  Always returns 0.
++ */
++static int __devexit netfront_remove(struct xenbus_device *dev)
++{
++      struct netfront_info *info = dev_get_drvdata(&dev->dev);
++
++      DPRINTK("%s\n", dev->nodename);
++
++      netfront_accelerator_call_remove(info, dev);
++
++      netif_disconnect_backend(info);
++
++      /* Wait for any running refill timer handler to finish. */
++      del_timer_sync(&info->rx_refill_timer);
++
++      xennet_sysfs_delif(info->netdev);
++
++      unregister_netdev(info->netdev);
++
++      free_netdev(info->netdev);
++
++      return 0;
++}
++
++
++/* Suspend callback: hands off to the accelerator layer's suspend handling. */
++static int netfront_suspend(struct xenbus_device *dev)
++{
++      struct netfront_info *np = dev_get_drvdata(&dev->dev);
++      int rc = netfront_accelerator_suspend(np, dev);
++
++      return rc;
++}
++
++
++/* Suspend-cancel callback: hands off to the accelerator layer. */
++static int netfront_suspend_cancel(struct xenbus_device *dev)
++{
++      struct netfront_info *np = dev_get_drvdata(&dev->dev);
++      int rc = netfront_accelerator_suspend_cancel(np, dev);
++
++      return rc;
++}
++
++
++/**
++ * We are reconnecting to the backend, due to a suspend/resume, or a backend
++ * driver restart.  We tear down our netif structure and recreate it, but
++ * leave the device-layer structures intact so that this is transparent to the
++ * rest of the kernel.
++ *
++ * This function itself only notifies the accelerator layer and disconnects
++ * from the backend; it always returns 0.
++ */
++static int netfront_resume(struct xenbus_device *dev)
++{
++      struct netfront_info *np = dev_get_drvdata(&dev->dev);
++
++      DPRINTK("%s\n", dev->nodename);
++
++      netfront_accelerator_resume(np, dev);
++      netif_disconnect_backend(np);
++
++      return 0;
++}
++
++/*
++ * Read the "mac" node under dev->nodename and parse it as ETH_ALEN
++ * colon-separated hexadecimal octets into mac[].
++ *
++ * Returns 0 on success, the error encoded by xenbus_read() if the node
++ * cannot be read, or -ENOENT if the string is malformed.  mac[] is only
++ * written when the whole string parsed successfully, so a failed parse
++ * never leaves a half-updated address behind.
++ */
++static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
++{
++      char *s, *e, *macstr;
++      u8 octets[ETH_ALEN];
++      int i;
++
++      macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
++      if (IS_ERR(macstr))
++              return PTR_ERR(macstr);
++
++      for (i = 0; i < ETH_ALEN; i++) {
++              unsigned long val = simple_strtoul(s, &e, 16);
++
++              /*
++               * Reject an empty field, a value that does not fit in one
++               * octet (it would otherwise be silently truncated), or a
++               * wrong separator/terminator after the field.
++               */
++              if ((s == e) || (val > 0xff) ||
++                  (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
++                      kfree(macstr);
++                      return -ENOENT;
++              }
++              octets[i] = val;
++              s = e+1;
++      }
++
++      memcpy(mac, octets, ETH_ALEN);
++      kfree(macstr);
++      return 0;
++}
++
++/*
++ * Common code used when first setting up, and when resuming.
++ *
++ * Reads the MAC address from xenstore if info->mac is not yet a valid
++ * Ethernet address, creates the shared rings and event channel via
++ * setup_device(), registers the accelerator watch, and then publishes the
++ * ring references, event channel and feature flags under dev->nodename in
++ * one xenbus transaction.  The transaction is restarted from scratch when
++ * ending it reports -EAGAIN.
++ *
++ * Returns 0 on success or a non-zero error code.  Failures that occur once
++ * setup_device() has succeeded call the accelerator remove hook and
++ * disconnect from the backend before returning.
++ */
++static int talk_to_backend(struct xenbus_device *dev,
++                         struct netfront_info *info)
++{
++      const char *message;
++      struct xenbus_transaction xbt;
++      int err;
++
++      /* Read mac only in the first setup. */
++      if (!is_valid_ether_addr(info->mac)) {
++              err = xen_net_read_mac(dev, info->mac);
++              if (err) {
++                      xenbus_dev_fatal(dev, err, "parsing %s/mac",
++                                       dev->nodename);
++                      goto out;
++              }
++      }
++
++      /* Create shared ring, alloc event channel. */
++      err = setup_device(dev, info);
++      if (err)
++              goto out;
++
++      /* This will load an accelerator if one is configured when the
++       * watch fires */
++      netfront_accelerator_add_watch(info);
++
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "starting transaction");
++              goto destroy_ring;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
++                          info->tx_ring_ref);
++      if (err) {
++              message = "writing tx ring-ref";
++              goto abort_transaction;
++      }
++      err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
++                          info->rx_ring_ref);
++      if (err) {
++              message = "writing rx ring-ref";
++              goto abort_transaction;
++      }
++      err = xenbus_printf(xbt, dev->nodename,
++                          "event-channel", "%u",
++                          irq_to_evtchn_port(info->irq));
++      if (err) {
++              message = "writing event-channel";
++              goto abort_transaction;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
++                          info->copying_receiver);
++      if (err) {
++              message = "writing request-rx-copy";
++              goto abort_transaction;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
++      if (err) {
++              message = "writing feature-rx-notify";
++              goto abort_transaction;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload",
++                          "%d", !HAVE_CSUM_OFFLOAD);
++      if (err) {
++              message = "writing feature-no-csum-offload";
++              goto abort_transaction;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
++      if (err) {
++              message = "writing feature-sg";
++              goto abort_transaction;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d",
++                          HAVE_TSO);
++      if (err) {
++              message = "writing feature-gso-tcpv4";
++              goto abort_transaction;
++      }
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err) {
++              /* -EAGAIN: redo the whole transaction from the start. */
++              if (err == -EAGAIN)
++                      goto again;
++              xenbus_dev_fatal(dev, err, "completing transaction");
++              goto destroy_ring;
++      }
++
++      return 0;
++
++ abort_transaction:
++      /* Second argument 1 ends the transaction as an abort. */
++      xenbus_transaction_end(xbt, 1);
++      xenbus_dev_fatal(dev, err, "%s", message);
++ destroy_ring:
++      netfront_accelerator_call_remove(info, dev);
++      netif_disconnect_backend(info);
++ out:
++      return err;
++}
++
++/*
++ * Allocate and grant the tx and rx shared ring pages, copy the MAC address
++ * into the net_device, and bind an event channel to the netif_int handler.
++ *
++ * On success returns 0 with info->tx_ring_ref, info->rx_ring_ref and
++ * info->irq filled in.  On failure returns a negative error code.
++ *
++ * NOTE(review): the 'fail' path releases nothing itself; resources acquired
++ * by earlier steps (e.g. a granted tx ring when the rx ring fails) are
++ * presumably reclaimed later through netif_disconnect_backend() -- confirm.
++ */
++static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
++{
++      struct netif_tx_sring *txs;
++      struct netif_rx_sring *rxs;
++      int err;
++      struct net_device *netdev = info->netdev;
++
++      /* Start from a known-empty state. */
++      info->tx_ring_ref = GRANT_INVALID_REF;
++      info->rx_ring_ref = GRANT_INVALID_REF;
++      info->rx.sring = NULL;
++      info->tx.sring = NULL;
++      info->irq = 0;
++
++      txs = (struct netif_tx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
++      if (!txs) {
++              err = -ENOMEM;
++              xenbus_dev_fatal(dev, err, "allocating tx ring page");
++              goto fail;
++      }
++      SHARED_RING_INIT(txs);
++      FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
++
++      /* A non-negative return value is the grant reference for the page. */
++      err = xenbus_grant_ring(dev, virt_to_mfn(txs));
++      if (err < 0) {
++              /*
++               * NOTE(review): FRONT_RING_INIT was already handed txs, so
++               * info->tx presumably still refers to the page freed here --
++               * confirm nothing dereferences it afterwards.
++               */
++              free_page((unsigned long)txs);
++              goto fail;
++      }
++      info->tx_ring_ref = err;
++
++      rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
++      if (!rxs) {
++              err = -ENOMEM;
++              xenbus_dev_fatal(dev, err, "allocating rx ring page");
++              goto fail;
++      }
++      SHARED_RING_INIT(rxs);
++      FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
++
++      err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
++      if (err < 0) {
++              free_page((unsigned long)rxs);
++              goto fail;
++      }
++      info->rx_ring_ref = err;
++
++      memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
++
++      /* A non-negative return value is the irq bound to the channel. */
++      err = bind_listening_port_to_irqhandler(
++              dev->otherend_id, netif_int, IRQF_SAMPLE_RANDOM, netdev->name,
++              netdev);
++      if (err < 0)
++              goto fail;
++      info->irq = err;
++
++      return 0;
++
++ fail:
++      return err;
++}
++
++/**
++ * Callback received when the backend's state changes.
++ *
++ * Only two backend states trigger work: InitWait (connect, provided we are
++ * still Initialising ourselves) and Closing (acknowledge the close).
++ */
++static void backend_changed(struct xenbus_device *dev,
++                          enum xenbus_state backend_state)
++{
++      struct netfront_info *np = dev_get_drvdata(&dev->dev);
++      struct net_device *ndev = np->netdev;
++
++      DPRINTK("%s\n", xenbus_strstate(backend_state));
++
++      switch (backend_state) {
++      case XenbusStateInitWait:
++              /* Connect only from Initialising, and only once it works. */
++              if (dev->state == XenbusStateInitialising &&
++                  network_connect(ndev) == 0) {
++                      xenbus_switch_state(dev, XenbusStateConnected);
++                      netif_notify_peers(ndev);
++              }
++              break;
++
++      case XenbusStateClosing:
++              xenbus_frontend_closed(dev);
++              break;
++
++      case XenbusStateInitialising:
++      case XenbusStateInitialised:
++      case XenbusStateConnected:
++      case XenbusStateReconfiguring:
++      case XenbusStateReconfigured:
++      case XenbusStateUnknown:
++      case XenbusStateClosed:
++              /* Nothing to do for these states. */
++              break;
++      }
++}
++
++/*
++ * True while the number of tx requests produced but not yet consumed as
++ * responses stays below TX_MAX_TARGET less (MAX_SKB_FRAGS + 2).
++ */
++static inline int netfront_tx_slot_available(struct netfront_info *np)
++{
++      RING_IDX outstanding = np->tx.req_prod_pvt - np->tx.rsp_cons;
++
++      return (outstanding < (TX_MAX_TARGET - MAX_SKB_FRAGS - 2));
++}
++
++
++/*
++ * Wake the tx queue if it is stopped, tx slots are available, the device
++ * is running and the accelerator (if any) reports the queue ready.
++ */
++static inline void network_maybe_wake_tx(struct net_device *dev)
++{
++      struct netfront_info *np = netdev_priv(dev);
++
++      if (likely(!netif_queue_stopped(dev)))
++              return;
++      if (!netfront_tx_slot_available(np))
++              return;
++      if (unlikely(!netif_running(dev)))
++              return;
++      if (!netfront_check_accelerator_queue_ready(dev, np))
++              return;
++
++      netif_wake_queue(dev);
++}
++
++
++/*
++ * Exported check: returns 1 when the tx queue is stopped, tx slots are
++ * available and the device is running; 0 otherwise.
++ */
++int netfront_check_queue_ready(struct net_device *dev)
++{
++      struct netfront_info *np = netdev_priv(dev);
++
++      if (likely(!netif_queue_stopped(dev)))
++              return 0;
++      if (!netfront_tx_slot_available(np))
++              return 0;
++
++      return likely(netif_running(dev)) ? 1 : 0;
++}
++EXPORT_SYMBOL(netfront_check_queue_ready);
++
++/*
++ * Bring the interface up: enable NAPI, and if the carrier is on, refill
++ * the rx ring and schedule a poll when responses are already pending
++ * (all under rx_lock); finally start the tx queue.  Always returns 0.
++ */
++static int network_open(struct net_device *dev)
++{
++      struct netfront_info *np = netdev_priv(dev);
++
++      napi_enable(&np->napi);
++
++      spin_lock_bh(&np->rx_lock);
++
++      if (!netfront_carrier_ok(np))
++              goto unlock;
++
++      network_alloc_rx_buffers(dev);
++      np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
++
++      if (!RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
++              goto unlock;
++
++      /* Responses are already waiting: process them via NAPI. */
++      netfront_accelerator_call_stop_napi_irq(np, dev);
++      napi_schedule(&np->napi);
++
++ unlock:
++      spin_unlock_bh(&np->rx_lock);
++
++      netif_start_queue(dev);
++
++      return 0;
++}
++
++/*
++ * Reclaim completed tx slots: for every response the backend has produced,
++ * end the grant, return the grant reference and slot id to their free
++ * pools and free the skb.  Then re-arm the response event and, if
++ * possible, wake the tx queue.
++ *
++ * Must only be called while the carrier is on (enforced by BUG_ON).
++ */
++static void network_tx_buf_gc(struct net_device *dev)
++{
++      RING_IDX cons, prod;
++      unsigned short id;
++      struct netfront_info *np = netdev_priv(dev);
++      struct sk_buff *skb;
++
++      BUG_ON(!netfront_carrier_ok(np));
++
++      do {
++              prod = np->tx.sring->rsp_prod;
++              rmb(); /* Ensure we see responses up to 'prod'. */
++
++              for (cons = np->tx.rsp_cons; cons != prod; cons++) {
++                      struct netif_tx_response *txrsp;
++
++                      txrsp = RING_GET_RESPONSE(&np->tx, cons);
++                      /* NULL-status responses carry nothing to reclaim. */
++                      if (txrsp->status == XEN_NETIF_RSP_NULL)
++                              continue;
++
++                      id  = txrsp->id;
++                      skb = np->tx_skbs[id];
++                      /* The backend must have dropped the grant by now. */
++                      if (unlikely(gnttab_query_foreign_access(
++                              np->grant_tx_ref[id]) != 0)) {
++                              pr_alert("network_tx_buf_gc: grant still"
++                                       " in use by backend domain\n");
++                              BUG();
++                      }
++                      gnttab_end_foreign_access_ref(np->grant_tx_ref[id]);
++                      gnttab_release_grant_reference(
++                              &np->gref_tx_head, np->grant_tx_ref[id]);
++                      np->grant_tx_ref[id] = GRANT_INVALID_REF;
++                      add_id_to_freelist(np->tx_skbs, id);
++                      dev_kfree_skb_irq(skb);
++              }
++
++              np->tx.rsp_cons = prod;
++
++              /*
++               * Set a new event, then check for race with update of tx_cons.
++               * Note that it is essential to schedule a callback, no matter
++               * how few buffers are pending. Even if there is space in the
++               * transmit ring, higher layers may be blocked because too much
++               * data is outstanding: in such cases notification from Xen is
++               * likely to be the only kick that we'll get.
++               */
++              np->tx.sring->rsp_event =
++                      prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
++              mb();
++              /* Loop again if more responses arrived while re-arming. */
++      } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
++
++      network_maybe_wake_tx(dev);
++}
++
++/*
++ * Timer callback for rx_refill_timer.  'data' carries the net_device
++ * pointer; the handler stops accelerator NAPI interrupts and schedules a
++ * NAPI poll.
++ */
++static void rx_refill_timeout(unsigned long data)
++{
++      struct net_device *ndev = (struct net_device *)data;
++      struct netfront_info *np = netdev_priv(ndev);
++
++      netfront_accelerator_call_stop_napi_irq(np, ndev);
++      napi_schedule(&np->napi);
++}
++
++/*
++ * Refill the rx ring.
++ *
++ * Phase 1 allocates skbs (each with one page attached as frag 0) onto
++ * np->rx_batch until the batch reaches the current fill target.  Phase 2
++ * ("refill") moves every batched skb into the ring: it records the skb and
++ * a freshly claimed grant reference in the per-slot arrays and fills in the
++ * ring request.  In flipping mode (!np->copying_receiver) the pages are
++ * additionally handed back to Xen through a decrease-reservation call.
++ * Finally the requests are pushed and the backend is notified if needed.
++ *
++ * Does nothing when the carrier is off.  If not even one skb could be
++ * allocated, the refill timer is armed to retry in HZ/10 jiffies.
++ */
++static void network_alloc_rx_buffers(struct net_device *dev)
++{
++      unsigned short id;
++      struct netfront_info *np = netdev_priv(dev);
++      struct sk_buff *skb;
++      struct page *page;
++      int i, batch_target, notify;
++      RING_IDX req_prod = np->rx.req_prod_pvt;
++      struct xen_memory_reservation reservation;
++      grant_ref_t ref;
++      unsigned long pfn;
++      void *vaddr;
++      int nr_flips;
++      netif_rx_request_t *req;
++
++      if (unlikely(!netfront_carrier_ok(np)))
++              return;
++
++      /*
++       * Allocate skbuffs greedily, even though we batch updates to the
++       * receive ring. This creates a less bursty demand on the memory
++       * allocator, so should reduce the chance of failed allocation requests
++       * both for ourself and for other kernel subsystems.
++       */
++      batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
++      for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
++              /*
++               * Allocate an skb and a page. Do not use __dev_alloc_skb as
++               * that will allocate page-sized buffers which is not
++               * necessary here.
++               * 16 bytes added as necessary headroom for netif_receive_skb.
++               */
++              skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN,
++                              GFP_ATOMIC | __GFP_NOWARN);
++              if (unlikely(!skb))
++                      goto no_skb;
++
++              page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
++              if (!page) {
++                      kfree_skb(skb);
++no_skb:
++                      /* Any skbuffs queued for refill? Force them out. */
++                      if (i != 0)
++                              goto refill;
++                      /* Could not allocate any skbuffs. Try again later. */
++                      mod_timer(&np->rx_refill_timer,
++                                jiffies + (HZ/10));
++                      break;
++              }
++
++              skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */
++              skb_shinfo(skb)->frags[0].page = page;
++              skb_shinfo(skb)->nr_frags = 1;
++              __skb_queue_tail(&np->rx_batch, skb);
++      }
++
++      /* Is the batch large enough to be worthwhile? */
++      if (i < (np->rx_target/2)) {
++              /* Still push any requests queued privately but not yet shared. */
++              if (req_prod > np->rx.sring->req_prod)
++                      goto push;
++              return;
++      }
++
++      /* Adjust our fill target if we risked running out of buffers. */
++      if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
++          ((np->rx_target *= 2) > np->rx_max_target))
++              np->rx_target = np->rx_max_target;
++
++ refill:
++      /* From here on 'i' counts the requests placed into the ring. */
++      for (nr_flips = i = 0; ; i++) {
++              if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
++                      break;
++
++              skb->dev = dev;
++
++              id = xennet_rxidx(req_prod + i);
++
++              BUG_ON(np->rx_skbs[id]);
++              np->rx_skbs[id] = skb;
++
++              ref = gnttab_claim_grant_reference(&np->gref_rx_head);
++              BUG_ON((signed short)ref < 0);
++              np->grant_rx_ref[id] = ref;
++
++              pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
++              vaddr = page_address(skb_shinfo(skb)->frags[0].page);
++
++              req = RING_GET_REQUEST(&np->rx, req_prod + i);
++              if (!np->copying_receiver) {
++                      /* Flipping: grant a page transfer to the backend. */
++                      gnttab_grant_foreign_transfer_ref(ref,
++                                                        np->xbdev->otherend_id,
++                                                        pfn);
++                      np->rx_pfn_array[nr_flips] = pfn_to_mfn(pfn);
++                      if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                              /* Remove this page before passing
++                               * back to Xen. */
++                              set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++                              MULTI_update_va_mapping(np->rx_mcl+i,
++                                                      (unsigned long)vaddr,
++                                                      __pte(0), 0);
++                      }
++                      nr_flips++;
++              } else {
++                      /* Copying: grant the backend access to the page. */
++                      gnttab_grant_foreign_access_ref(ref,
++                                                      np->xbdev->otherend_id,
++                                                      pfn_to_mfn(pfn),
++                                                      0);
++              }
++
++              req->id = id;
++              req->gref = ref;
++      }
++
++      if ( nr_flips != 0 ) {
++              /* Tell the balloon driver what is going on. */
++              balloon_update_driver_allowance(i);
++
++              set_xen_guest_handle(reservation.extent_start,
++                                   np->rx_pfn_array);
++              reservation.nr_extents   = nr_flips;
++              reservation.extent_order = 0;
++              reservation.address_bits = 0;
++              reservation.domid        = DOMID_SELF;
++
++              if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                      /* After all PTEs have been zapped, flush the TLB. */
++                      np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
++                              UVMF_TLB_FLUSH|UVMF_ALL;
++
++                      /* Give away a batch of pages. */
++                      np->rx_mcl[i].op = __HYPERVISOR_memory_op;
++                      np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
++                      np->rx_mcl[i].args[1] = (unsigned long)&reservation;
++
++                      /* Zap PTEs and give away pages in one big
++                       * multicall. */
++                      if (unlikely(HYPERVISOR_multicall(np->rx_mcl, i+1)))
++                              BUG();
++
++                      /* Check return status of HYPERVISOR_memory_op(). */
++                      if (unlikely(np->rx_mcl[i].result != i))
++                              panic("Unable to reduce memory reservation\n");
++                      /* Every individual PTE update must have succeeded. */
++                      while (nr_flips--)
++                              BUG_ON(np->rx_mcl[nr_flips].result);
++              } else {
++                      if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
++                                               &reservation) != i)
++                              panic("Unable to reduce memory reservation\n");
++              }
++      } else {
++              wmb();
++      }
++
++      /* Above is a suitable barrier to ensure backend will see requests. */
++      np->rx.req_prod_pvt = req_prod + i;
++ push:
++      RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
++      if (notify)
++              notify_remote_via_irq(np->irq);
++}
++
++/*
++ * Emit the additional tx requests needed for an skb beyond its first
++ * request 'tx', which the caller has already set up.
++ *
++ * The linear area is split at page boundaries: whenever the remaining
++ * header data crosses into another page, the current request is trimmed to
++ * the end of its page and flagged XEN_NETTXF_more_data, and a new request
++ * is created for the rest.  Then one request is emitted per page fragment.
++ * Every new request takes a slot id from the tx_skbs free list, an extra
++ * reference on the skb, and a read-only grant for the page it describes.
++ *
++ * np->tx.req_prod_pvt is advanced past all requests created here.
++ */
++static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
++                            struct netif_tx_request *tx)
++{
++      struct netfront_info *np = netdev_priv(dev);
++      char *data = skb->data;
++      unsigned long mfn;
++      RING_IDX prod = np->tx.req_prod_pvt;
++      int frags = skb_shinfo(skb)->nr_frags;
++      unsigned int offset = offset_in_page(data);
++      unsigned int len = skb_headlen(skb);
++      unsigned int id;
++      grant_ref_t ref;
++      int i;
++
++      /* Split the linear area across page boundaries. */
++      while (len > PAGE_SIZE - offset) {
++              /* Trim the current request to the end of its page. */
++              tx->size = PAGE_SIZE - offset;
++              tx->flags |= XEN_NETTXF_more_data;
++              len -= tx->size;
++              data += tx->size;
++              offset = 0;
++
++              id = get_id_from_freelist(np->tx_skbs);
++              np->tx_skbs[id] = skb_get(skb);
++              tx = RING_GET_REQUEST(&np->tx, prod++);
++              tx->id = id;
++              ref = gnttab_claim_grant_reference(&np->gref_tx_head);
++              BUG_ON((signed short)ref < 0);
++
++              mfn = virt_to_mfn(data);
++              gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
++                                              mfn, GTF_readonly);
++
++              tx->gref = np->grant_tx_ref[id] = ref;
++              tx->offset = offset;
++              tx->size = len;
++              tx->flags = 0;
++      }
++
++      /* One further request per page fragment. */
++      for (i = 0; i < frags; i++) {
++              skb_frag_t *frag = skb_shinfo(skb)->frags + i;
++
++              /* Mark the previous request as being followed by more data. */
++              tx->flags |= XEN_NETTXF_more_data;
++
++              id = get_id_from_freelist(np->tx_skbs);
++              np->tx_skbs[id] = skb_get(skb);
++              tx = RING_GET_REQUEST(&np->tx, prod++);
++              tx->id = id;
++              ref = gnttab_claim_grant_reference(&np->gref_tx_head);
++              BUG_ON((signed short)ref < 0);
++
++              mfn = pfn_to_mfn(page_to_pfn(frag->page));
++              gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
++                                              mfn, GTF_readonly);
++
++              tx->gref = np->grant_tx_ref[id] = ref;
++              tx->offset = frag->page_offset;
++              tx->size = frag->size;
++              tx->flags = 0;
++      }
++
++      np->tx.req_prod_pvt = prod;
++}
++
++/*
++ * Transmit entry point: place @skb on the TX ring.
++ *
++ * If accelerator hooks are installed and their start_xmit() reports that it
++ * sent the packet, nothing else is done.  Otherwise the packet is dropped
++ * when it would need more than MAX_SKB_FRAGS + 1 ring slots, when the
++ * carrier is down, when it needs several slots but scatter/gather is not
++ * usable, or when it still needs GSO segmentation.
++ *
++ * Always returns NETDEV_TX_OK; dropped packets are freed and counted in
++ * dev->stats.tx_dropped.
++ */
++static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++      unsigned short id;
++      struct netfront_info *np = netdev_priv(dev);
++      struct netif_tx_request *tx;
++      struct netif_extra_info *extra;
++      char *data = skb->data;
++      RING_IDX i;
++      grant_ref_t ref;
++      unsigned long mfn;
++      int notify;
++      int frags = skb_shinfo(skb)->nr_frags;
++      unsigned int offset = offset_in_page(data);
++      unsigned int len = skb_headlen(skb);
++
++      /* Check the fast path, if hooks are available */
++      if (np->accel_vif_state.hooks && 
++          np->accel_vif_state.hooks->start_xmit(skb, dev)) { 
++              /* Fast path has sent this packet */ 
++              return NETDEV_TX_OK;
++      } 
++
++      /* Slots needed: page fragments plus pages spanned by the linear area. */
++      frags += DIV_ROUND_UP(offset + len, PAGE_SIZE);
++      if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
++              pr_alert("xennet: skb rides the rocket: %d frags\n", frags);
++              dump_stack();
++              goto drop;
++      }
++
++      spin_lock_irq(&np->tx_lock);
++
++      if (unlikely(!netfront_carrier_ok(np) ||
++                   (frags > 1 && !xennet_can_sg(dev)) ||
++                   netif_needs_gso(skb, netif_skb_features(skb)))) {
++              /* The drop path runs without the lock held. */
++              spin_unlock_irq(&np->tx_lock);
++              goto drop;
++      }
++
++      i = np->tx.req_prod_pvt;
++
++      /* First request: unlike the follow-up requests built by
++       * xennet_make_frags(), it stores the skb without taking an
++       * extra reference. */
++      id = get_id_from_freelist(np->tx_skbs);
++      np->tx_skbs[id] = skb;
++
++      tx = RING_GET_REQUEST(&np->tx, i);
++
++      tx->id   = id;
++      ref = gnttab_claim_grant_reference(&np->gref_tx_head);
++      BUG_ON((signed short)ref < 0);
++      mfn = virt_to_mfn(data);
++      gnttab_grant_foreign_access_ref(
++              ref, np->xbdev->otherend_id, mfn, GTF_readonly);
++      tx->gref = np->grant_tx_ref[id] = ref;
++      tx->offset = offset;
++      tx->size = len;
++
++      tx->flags = 0;
++      extra = NULL;
++
++      if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
++              tx->flags |= XEN_NETTXF_csum_blank | XEN_NETTXF_data_validated;
++      else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
++              tx->flags |= XEN_NETTXF_data_validated;
++
++#if HAVE_TSO
++      /* A GSO packet carries an extra-info slot right after the first
++       * request. */
++      if (skb_shinfo(skb)->gso_size) {
++              struct netif_extra_info *gso = (struct netif_extra_info *)
++                      RING_GET_REQUEST(&np->tx, ++i);
++
++              if (extra)
++                      extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
++              else
++                      tx->flags |= XEN_NETTXF_extra_info;
++
++              gso->u.gso.size = skb_shinfo(skb)->gso_size;
++              gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
++              gso->u.gso.pad = 0;
++              gso->u.gso.features = 0;
++
++              gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
++              gso->flags = 0;
++              extra = gso;
++      }
++#endif
++
++      /* xennet_make_frags() continues from the private producer index. */
++      np->tx.req_prod_pvt = i + 1;
++
++      xennet_make_frags(skb, dev, tx);
++      /* 'tx' still points at the first request, whose size field is set
++       * to the total packet length here. */
++      tx->size = skb->len;
++
++      RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
++      if (notify)
++              notify_remote_via_irq(np->irq);
++
++      dev->stats.tx_bytes += skb->len;
++      dev->stats.tx_packets++;
++      dev->trans_start = jiffies;
++
++      /* Note: It is not safe to access skb after network_tx_buf_gc()! */
++      network_tx_buf_gc(dev);
++
++      if (!netfront_tx_slot_available(np))
++              netif_stop_queue(dev);
++
++      spin_unlock_irq(&np->tx_lock);
++
++      return NETDEV_TX_OK;
++
++ drop:
++      dev->stats.tx_dropped++;
++      dev_kfree_skb(skb);
++      return NETDEV_TX_OK;
++}
++
++/*
++ * Interrupt handler; @dev_id is the net_device.
++ *
++ * With tx_lock held and interrupts saved, and only while the carrier is up,
++ * reclaims completed TX buffers and schedules NAPI polling when the RX ring
++ * holds responses that have not been consumed.  Always returns IRQ_HANDLED.
++ */
++static irqreturn_t netif_int(int irq, void *dev_id)
++{
++      struct net_device *netdev = dev_id;
++      struct netfront_info *info = netdev_priv(netdev);
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&info->tx_lock, irqflags);
++
++      if (unlikely(!netfront_carrier_ok(info)))
++              goto out_unlock;
++
++      network_tx_buf_gc(netdev);
++
++      /* Under tx_lock: protects access to rx shared-ring indexes. */
++      if (RING_HAS_UNCONSUMED_RESPONSES(&info->rx)) {
++              netfront_accelerator_call_stop_napi_irq(info, netdev);
++              napi_schedule(&info->napi);
++      }
++
++out_unlock:
++      spin_unlock_irqrestore(&info->tx_lock, irqflags);
++
++      return IRQ_HANDLED;
++}
++
++/*
++ * Re-post an RX buffer (@skb with its grant reference @ref) at the current
++ * private request producer position of the RX ring and advance that
++ * producer by one.  The target bookkeeping slot must be empty.
++ */
++static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
++                              grant_ref_t ref)
++{
++      RING_IDX prod = np->rx.req_prod_pvt;
++      int slot = xennet_rxidx(prod);
++      netif_rx_request_t *req;
++
++      BUG_ON(np->rx_skbs[slot]);
++
++      /* Record the buffer in the driver's per-slot bookkeeping. */
++      np->rx_skbs[slot] = skb;
++      np->grant_rx_ref[slot] = ref;
++
++      /* Describe it in the ring request at the producer position. */
++      req = RING_GET_REQUEST(&np->rx, prod);
++      req->id = slot;
++      req->gref = ref;
++
++      np->rx.req_prod_pvt = prod + 1;
++}
++
++/*
++ * Consume the extra-info slots that follow np->rx.rsp_cons on the RX ring.
++ *
++ * Each slot with a valid type is copied into @extras[type - 1].  The buffer
++ * (skb and grant reference) belonging to every consumed slot is re-posted
++ * through xennet_move_rx_slot().  Slots are consumed for as long as the
++ * current one has XEN_NETIF_EXTRA_FLAG_MORE set.
++ *
++ * @rp is the response producer index bounding what may be read.
++ *
++ * On return np->rx.rsp_cons points at the last slot consumed.
++ * Returns 0 on success, -EBADR if the ring ran out before the chain ended,
++ * or -EINVAL if a slot carried an invalid type (the walk still continues).
++ */
++int xennet_get_extras(struct netfront_info *np,
++                    struct netif_extra_info *extras, RING_IDX rp)
++
++{
++      struct netif_extra_info *extra;
++      RING_IDX cons = np->rx.rsp_cons;
++      int err = 0;
++
++      do {
++              struct sk_buff *skb;
++              grant_ref_t ref;
++
++              /* No further response available for the expected extra. */
++              if (unlikely(cons + 1 == rp)) {
++                      if (net_ratelimit())
++                              WPRINTK("Missing extra info\n");
++                      err = -EBADR;
++                      break;
++              }
++
++              extra = (struct netif_extra_info *)
++                      RING_GET_RESPONSE(&np->rx, ++cons);
++
++              if (unlikely(!extra->type ||
++                           extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
++                      if (net_ratelimit())
++                              WPRINTK("Invalid extra type: %d\n",
++                                      extra->type);
++                      err = -EINVAL;
++              } else {
++                      memcpy(&extras[extra->type - 1], extra,
++                             sizeof(*extra));
++              }
++
++              /* The slot's buffer was not used for data: post it again. */
++              skb = xennet_get_rx_skb(np, cons);
++              ref = xennet_get_rx_ref(np, cons);
++              xennet_move_rx_slot(np, skb, ref);
++      } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
++
++      np->rx.rsp_cons = cons;
++      return err;
++}
++
++/*
++ * Gather the RX buffers that make up one packet.
++ *
++ * Starts at np->rx.rsp_cons with the response already copied into
++ * @rinfo->rx, consumes extra-info slots first when
++ * XEN_NETRXF_extra_info is set, then follows XEN_NETRXF_more_data through
++ * the ring up to the producer index @rp.  The skb of every good response
++ * is appended to @list after its grant has been ended and released.
++ *
++ * In flipping mode (!np->copying_receiver) the transferred page is
++ * remapped via entries in np->rx_mcl / np->rx_mmu and *@pages_flipped_p is
++ * advanced; the caller issues the batched hypercall.
++ *
++ * Returns 0 on success or a negative errno.  On error np->rx.rsp_cons is
++ * set to cons + frags, i.e. past the slots examined for this packet.
++ */
++static int xennet_get_responses(struct netfront_info *np,
++                              struct netfront_rx_info *rinfo, RING_IDX rp,
++                              struct sk_buff_head *list,
++                              int *pages_flipped_p)
++{
++      int pages_flipped = *pages_flipped_p;
++      struct mmu_update *mmu;
++      struct multicall_entry *mcl;
++      struct netif_rx_response *rx = &rinfo->rx;
++      struct netif_extra_info *extras = rinfo->extras;
++      RING_IDX cons = np->rx.rsp_cons;
++      struct sk_buff *skb = xennet_get_rx_skb(np, cons);
++      grant_ref_t ref = xennet_get_rx_ref(np, cons);
++      /* One slot more is tolerated when the first response is no larger
++       * than RX_COPY_THRESHOLD. */
++      int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
++      int frags = 1;
++      int err = 0;
++      unsigned long ret;
++
++      if (rx->flags & XEN_NETRXF_extra_info) {
++              err = xennet_get_extras(np, extras, rp);
++              /* xennet_get_extras() advanced rsp_cons past the extras. */
++              cons = np->rx.rsp_cons;
++      }
++
++      for (;;) {
++              unsigned long mfn;
++
++              /* Error status, or data running past the end of the page. */
++              if (unlikely(rx->status < 0 ||
++                           rx->offset + rx->status > PAGE_SIZE)) {
++                      if (net_ratelimit())
++                              WPRINTK("rx->offset: %x, size: %u\n",
++                                      rx->offset, rx->status);
++                      xennet_move_rx_slot(np, skb, ref);
++                      err = -EINVAL;
++                      goto next;
++              }
++
++              /*
++               * This definitely indicates a bug, either in this driver or in
++               * the backend driver. In future this should flag the bad
++               * situation to the system controller to reboot the backend.
++               */
++              if (ref == GRANT_INVALID_REF) {
++                      if (net_ratelimit())
++                              WPRINTK("Bad rx response id %d.\n", rx->id);
++                      err = -EINVAL;
++                      goto next;
++              }
++
++              if (!np->copying_receiver) {
++                      /* Memory pressure, insufficient buffer
++                       * headroom, ... */
++                      if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) {
++                              if (net_ratelimit())
++                                      WPRINTK("Unfulfilled rx req "
++                                              "(id=%d, st=%d).\n",
++                                              rx->id, rx->status);
++                              xennet_move_rx_slot(np, skb, ref);
++                              err = -ENOMEM;
++                              goto next;
++                      }
++
++                      if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                              /* Remap the page. */
++                              struct page *page =
++                                      skb_shinfo(skb)->frags[0].page;
++                              unsigned long pfn = page_to_pfn(page);
++                              void *vaddr = page_address(page);
++
++                              /* Batch slots are indexed by the running
++                               * count of flipped pages. */
++                              mcl = np->rx_mcl + pages_flipped;
++                              mmu = np->rx_mmu + pages_flipped;
++
++                              MULTI_update_va_mapping(mcl,
++                                                      (unsigned long)vaddr,
++                                                      pfn_pte_ma(mfn,
++                                                                 PAGE_KERNEL),
++                                                      0);
++                              mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
++                                      | MMU_MACHPHYS_UPDATE;
++                              mmu->val = pfn;
++
++                              set_phys_to_machine(pfn, mfn);
++                      }
++                      pages_flipped++;
++              } else {
++                      ret = gnttab_end_foreign_access_ref(ref);
++                      BUG_ON(!ret);
++              }
++
++              gnttab_release_grant_reference(&np->gref_rx_head, ref);
++
++              __skb_queue_tail(list, skb);
++
++next:
++              if (!(rx->flags & XEN_NETRXF_more_data))
++                      break;
++
++              /* More data was announced but the ring has no more
++               * responses. */
++              if (cons + frags == rp) {
++                      if (net_ratelimit())
++                              WPRINTK("Need more frags\n");
++                      err = -ENOENT;
++                      break;
++              }
++
++              rx = RING_GET_RESPONSE(&np->rx, cons + frags);
++              skb = xennet_get_rx_skb(np, cons + frags);
++              ref = xennet_get_rx_ref(np, cons + frags);
++              frags++;
++      }
++
++      if (unlikely(frags > max)) {
++              if (net_ratelimit())
++                      WPRINTK("Too many frags\n");
++              err = -E2BIG;
++      }
++
++      /* On failure, skip every slot examined for this packet. */
++      if (unlikely(err))
++              np->rx.rsp_cons = cons + frags;
++
++      *pages_flipped_p = pages_flipped;
++
++      return err;
++}
++
++/*
++ * Attach the pages of the skbs queued on @list to @skb as page fragments.
++ *
++ * For every skb taken off @list the next RX response (starting after
++ * np->rx.rsp_cons) supplies the fragment's offset and size; the page comes
++ * from the dequeued skb's first fragment.  skb->data_len grows by each
++ * fragment's size.  The dequeued skb is freed after its fragment count has
++ * been cleared.
++ *
++ * Returns the ring index of the last response used.
++ */
++static RING_IDX xennet_fill_frags(struct netfront_info *np,
++                                struct sk_buff *skb,
++                                struct sk_buff_head *list)
++{
++      struct skb_shared_info *shinfo = skb_shinfo(skb);
++      RING_IDX cons = np->rx.rsp_cons;
++      int count = shinfo->nr_frags;
++      struct sk_buff *nskb;
++
++      for (nskb = __skb_dequeue(list); nskb; nskb = __skb_dequeue(list)) {
++              skb_frag_t *dst = &shinfo->frags[count];
++              struct netif_rx_response *rsp;
++
++              cons++;
++              rsp = RING_GET_RESPONSE(&np->rx, cons);
++
++              dst->page = skb_shinfo(nskb)->frags[0].page;
++              dst->page_offset = rsp->offset;
++              dst->size = rsp->status;
++
++              skb->data_len += rsp->status;
++
++              /* Clear the fragment count before the donor skb is freed. */
++              skb_shinfo(nskb)->nr_frags = 0;
++              kfree_skb(nskb);
++
++              count++;
++      }
++
++      shinfo->nr_frags = count;
++      return cons;
++}
++
++/*
++ * Apply the GSO extra-info @gso received from the other end to @skb.
++ *
++ * Rejects a zero GSO size and any type other than
++ * XEN_NETIF_GSO_TYPE_TCPV4.  When built with HAVE_TSO, sets gso_size,
++ * clears gso_segs and (with HAVE_GSO) marks the skb
++ * SKB_GSO_TCPV4 | SKB_GSO_DODGY.
++ *
++ * Returns 0 on success, -EINVAL on invalid input or when the kernel was
++ * built without HAVE_TSO.
++ */
++static int xennet_set_skb_gso(struct sk_buff *skb,
++                            struct netif_extra_info *gso)
++{
++      if (!gso->u.gso.size) {
++              if (net_ratelimit())
++                      WPRINTK("GSO size must not be zero.\n");
++              return -EINVAL;
++      }
++
++      /* Currently only TCPv4 S.O. is supported. */
++      if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
++              if (net_ratelimit())
++                      WPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
++              return -EINVAL;
++      }
++
++#if HAVE_TSO
++      skb_shinfo(skb)->gso_size = gso->u.gso.size;
++#if HAVE_GSO
++      skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
++
++      /* Header must be checked, and gso_segs computed. */
++      skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
++#endif
++      skb_shinfo(skb)->gso_segs = 0;
++
++      return 0;
++#else
++      if (net_ratelimit())
++              WPRINTK("GSO unsupported by this kernel.\n");
++      return -EINVAL;
++#endif
++}
++
++/*
++ * NAPI poll handler for the receive path.
++ *
++ * Under rx_lock, consumes up to @budget packets from the RX ring:
++ *   1. xennet_get_responses() collects each packet's buffers on tmpq;
++ *      packets that fail are moved to errq and counted as rx_errors.
++ *   2. Good packets get their head sized (at most RX_COPY_THRESHOLD bytes),
++ *      remaining data attached as page fragments, checksum state set, and
++ *      are queued on rxq.
++ *   3. In flipping mode the batched remap/M2P multicall is issued.
++ *   4. Packets on rxq have their head bytes copied in and are passed to the
++ *      stack with netif_receive_skb().
++ *   5. The RX fill target is adjusted and the ring refilled.
++ *   6. Leftover budget is offered to the accelerator hooks, if any, and
++ *      NAPI is completed when neither path has more work.
++ *
++ * Returns the amount of work done, or 0 immediately if the carrier is down.
++ */
++static int netif_poll(struct napi_struct *napi, int budget)
++{
++      struct netfront_info *np = container_of(napi, struct netfront_info, napi);
++      struct net_device *dev = np->netdev;
++      struct sk_buff *skb;
++      struct netfront_rx_info rinfo;
++      struct netif_rx_response *rx = &rinfo.rx;
++      struct netif_extra_info *extras = rinfo.extras;
++      RING_IDX i, rp;
++      struct multicall_entry *mcl;
++      int work_done, more_to_do = 1, accel_more_to_do = 1;
++      struct sk_buff_head rxq;
++      struct sk_buff_head errq;
++      struct sk_buff_head tmpq;
++      unsigned long flags;
++      unsigned int len;
++      int pages_flipped = 0;
++      int err;
++
++      spin_lock(&np->rx_lock); /* no need for spin_lock_bh() in ->poll() */
++
++      if (unlikely(!netfront_carrier_ok(np))) {
++              spin_unlock(&np->rx_lock);
++              return 0;
++      }
++
++      skb_queue_head_init(&rxq);
++      skb_queue_head_init(&errq);
++      skb_queue_head_init(&tmpq);
++
++      rp = np->rx.sring->rsp_prod;
++      rmb(); /* Ensure we see queued responses up to 'rp'. */
++
++      i = np->rx.rsp_cons;
++      work_done = 0;
++      while ((i != rp) && (work_done < budget)) {
++              /* Work on a private copy of the first response. */
++              memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
++              memset(extras, 0, sizeof(rinfo.extras));
++
++              err = xennet_get_responses(np, &rinfo, rp, &tmpq,
++                                         &pages_flipped);
++
++              if (unlikely(err)) {
++                      /* Also entered by goto from the GSO check below. */
++err:  
++                      while ((skb = __skb_dequeue(&tmpq)))
++                              __skb_queue_tail(&errq, skb);
++                      dev->stats.rx_errors++;
++                      /* Resume after the slots of the failed packet. */
++                      i = np->rx.rsp_cons;
++                      continue;
++              }
++
++              /* First buffer of the packet; the rest stay on tmpq. */
++              skb = __skb_dequeue(&tmpq);
++
++              if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
++                      struct netif_extra_info *gso;
++                      gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
++
++                      if (unlikely(xennet_set_skb_gso(skb, gso))) {
++                              /* Put the head back so the whole packet is
++                               * discarded, and skip all of its slots. */
++                              __skb_queue_head(&tmpq, skb);
++                              np->rx.rsp_cons += skb_queue_len(&tmpq);
++                              goto err;
++                      }
++              }
++
++              /* Remember where the head bytes live; they are copied into
++               * skb->data in the delivery loop below. */
++              NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;
++              NETFRONT_SKB_CB(skb)->offset = rx->offset;
++
++              len = rx->status;
++              if (len > RX_COPY_THRESHOLD)
++                      len = RX_COPY_THRESHOLD;
++              skb_put(skb, len);
++
++              if (rx->status > len) {
++                      /* Data beyond the copied head stays in fragment 0. */
++                      skb_shinfo(skb)->frags[0].page_offset =
++                              rx->offset + len;
++                      skb_shinfo(skb)->frags[0].size = rx->status - len;
++                      skb->data_len = rx->status - len;
++              } else {
++                      /* Everything fits in the head: drop fragment 0. */
++                      skb_shinfo(skb)->frags[0].page = NULL;
++                      skb_shinfo(skb)->nr_frags = 0;
++              }
++
++              i = xennet_fill_frags(np, skb, &tmpq);
++
++              /*
++               * Truesize must approximate the size of true data plus
++               * any supervisor overheads. Adding hypervisor overheads
++               * has been shown to significantly reduce achievable
++               * bandwidth with the default receive buffer size. It is
++               * therefore not wise to account for it here.
++               *
++               * After alloc_skb(RX_COPY_THRESHOLD), truesize is set to
++               * RX_COPY_THRESHOLD + the supervisor overheads. Here, we
++               * add the size of the data pulled in xennet_fill_frags().
++               *
++               * We also adjust for any unused space in the main data
++               * area by subtracting (RX_COPY_THRESHOLD - len). This is
++               * especially important with drivers which split incoming
++               * packets into header and data, using only 66 bytes of
++               * the main data area (see the e1000 driver for example.)
++               * On such systems, without this last adjustment, our
++               * achievable receive throughput using the standard receive
++               * buffer size was cut by 25%(!!!).
++               */
++              skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
++              skb->len += skb->data_len;
++
++              if (rx->flags & XEN_NETRXF_csum_blank)
++                      skb->ip_summed = CHECKSUM_PARTIAL;
++              else if (rx->flags & XEN_NETRXF_data_validated)
++                      skb->ip_summed = CHECKSUM_UNNECESSARY;
++              else
++                      skb->ip_summed = CHECKSUM_NONE;
++
++              dev->stats.rx_packets++;
++              dev->stats.rx_bytes += skb->len;
++
++              __skb_queue_tail(&rxq, skb);
++
++              np->rx.rsp_cons = ++i;
++              work_done++;
++      }
++
++      if (pages_flipped) {
++              /* Some pages are no longer absent... */
++              balloon_update_driver_allowance(-pages_flipped);
++
++              /* Do all the remapping work and M2P updates. */
++              if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                      /* The mmu_update call is appended after the
++                       * per-page va-mapping entries. */
++                      mcl = np->rx_mcl + pages_flipped;
++                      mcl->op = __HYPERVISOR_mmu_update;
++                      mcl->args[0] = (unsigned long)np->rx_mmu;
++                      mcl->args[1] = pages_flipped;
++                      mcl->args[2] = 0;
++                      mcl->args[3] = DOMID_SELF;
++                      err = HYPERVISOR_multicall_check(np->rx_mcl,
++                                                       pages_flipped + 1,
++                                                       NULL);
++                      BUG_ON(err);
++              }
++      }
++
++      __skb_queue_purge(&errq);
++
++      while ((skb = __skb_dequeue(&rxq)) != NULL) {
++              struct page *page = NETFRONT_SKB_CB(skb)->page;
++              void *vaddr = page_address(page);
++              unsigned offset = NETFRONT_SKB_CB(skb)->offset;
++
++              /* Fill the head reserved earlier with skb_put(). */
++              memcpy(skb->data, vaddr + offset, skb_headlen(skb));
++
++              /* The page is freed unless it is still fragment 0. */
++              if (page != skb_shinfo(skb)->frags[0].page)
++                      __free_page(page);
++
++              /* Ethernet work: Delayed to here as it peeks the header. */
++              skb->protocol = eth_type_trans(skb, dev);
++
++              if (skb_checksum_setup(skb, &np->rx_gso_csum_fixups)) {
++                      kfree_skb(skb);
++                      continue;
++              }
++
++              /* Pass it up. */
++              netif_receive_skb(skb);
++      }
++
++      /* If we get a callback with very few responses, reduce fill target. */
++      /* NB. Note exponential increase, linear decrease. */
++      if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
++           ((3*np->rx_target) / 4)) &&
++          (--np->rx_target < np->rx_min_target))
++              np->rx_target = np->rx_min_target;
++
++      network_alloc_rx_buffers(dev);
++
++      if (work_done < budget) {
++              /* there's some spare capacity, try the accelerated path */
++              int accel_budget = budget - work_done;
++              int accel_budget_start = accel_budget;
++
++              if (np->accel_vif_state.hooks) { 
++                      accel_more_to_do =  
++                              np->accel_vif_state.hooks->netdev_poll 
++                              (dev, &accel_budget); 
++                      /* The hook decrements accel_budget by what it used. */
++                      work_done += (accel_budget_start - accel_budget); 
++              } else
++                      accel_more_to_do = 0;
++      }
++
++      if (work_done < budget) {
++              local_irq_save(flags);
++
++              RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
++
++              if (!more_to_do && !accel_more_to_do && 
++                  np->accel_vif_state.hooks) {
++                      /* 
++                       *  Slow path has nothing more to do, see if
++                       *  fast path is likewise
++                       */
++                      accel_more_to_do = 
++                              np->accel_vif_state.hooks->start_napi_irq(dev);
++              }
++
++              if (!more_to_do && !accel_more_to_do)
++                      __napi_complete(napi);
++
++              local_irq_restore(flags);
++      }
++
++      spin_unlock(&np->rx_lock);
++      
++      return work_done;
++}
++
++/*
++ * Release every TX buffer still recorded in np->tx_skbs.
++ *
++ * Entries 1..NET_TX_RING_SIZE are scanned.  An entry whose value is below
++ * PAGE_OFFSET is skipped; otherwise its grant is ended and released, the
++ * id is returned to the freelist and the skb is freed.
++ */
++static void netif_release_tx_bufs(struct netfront_info *np)
++{
++      int id;
++
++      for (id = 1; id <= NET_TX_RING_SIZE; id++) {
++              struct sk_buff *skb = np->tx_skbs[id];
++              grant_ref_t gref;
++
++              /* Values below PAGE_OFFSET are not skb pointers. */
++              if ((unsigned long)skb < PAGE_OFFSET)
++                      continue;
++
++              gref = np->grant_tx_ref[id];
++              gnttab_end_foreign_access_ref(gref);
++              gnttab_release_grant_reference(&np->gref_tx_head, gref);
++              np->grant_tx_ref[id] = GRANT_INVALID_REF;
++
++              add_id_to_freelist(np->tx_skbs, id);
++              dev_kfree_skb_irq(skb);
++      }
++}
++
++/*
++ * Release all posted RX buffers when the page-flipping receive path is in
++ * use.
++ *
++ * Under rx_lock (bottom halves disabled), every slot with a valid grant
++ * reference has its foreign transfer ended and its reference released.
++ * If no page came back (mfn == 0) the local page is handed to
++ * balloon_release_driver_page() and the skb freed without its fragment.
++ * Otherwise the returned page is remapped (unless the guest is
++ * auto-translated) and the skb is queued for freeing once the batched
++ * multicall has been issued.
++ */
++static void netif_release_rx_bufs_flip(struct netfront_info *np)
++{
++      struct mmu_update      *mmu = np->rx_mmu;
++      struct multicall_entry *mcl = np->rx_mcl;
++      struct sk_buff_head free_list;
++      struct sk_buff *skb;
++      unsigned long mfn;
++      int xfer = 0, noxfer = 0, unused = 0;
++      int id, ref, rc;
++
++      skb_queue_head_init(&free_list);
++
++      spin_lock_bh(&np->rx_lock);
++
++      for (id = 0; id < NET_RX_RING_SIZE; id++) {
++              if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) {
++                      unused++;
++                      continue;
++              }
++
++              /* Read the skb before the slot is put on the freelist. */
++              skb = np->rx_skbs[id];
++              mfn = gnttab_end_foreign_transfer_ref(ref);
++              gnttab_release_grant_reference(&np->gref_rx_head, ref);
++              np->grant_rx_ref[id] = GRANT_INVALID_REF;
++              add_id_to_freelist(np->rx_skbs, id);
++
++              /* Nothing was transferred into this buffer. */
++              if (0 == mfn) {
++                      struct page *page = skb_shinfo(skb)->frags[0].page;
++                      balloon_release_driver_page(page);
++                      skb_shinfo(skb)->nr_frags = 0;
++                      dev_kfree_skb(skb);
++                      noxfer++;
++                      continue;
++              }
++
++              if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                      /* Remap the page. */
++                      struct page *page = skb_shinfo(skb)->frags[0].page;
++                      unsigned long pfn = page_to_pfn(page);
++                      void *vaddr = page_address(page);
++
++                      MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
++                                              pfn_pte_ma(mfn, PAGE_KERNEL),
++                                              0);
++                      mcl++;
++                      mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
++                              | MMU_MACHPHYS_UPDATE;
++                      mmu->val = pfn;
++                      mmu++;
++
++                      set_phys_to_machine(pfn, mfn);
++              }
++              /* Freed only after the remapping below has been done. */
++              __skb_queue_tail(&free_list, skb);
++              xfer++;
++      }
++
++      DPRINTK("%s: %d xfer, %d noxfer, %d unused\n",
++              __FUNCTION__, xfer, noxfer, unused);
++
++      if (xfer) {
++              /* Some pages are no longer absent... */
++              balloon_update_driver_allowance(-xfer);
++
++              if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++                      /* Do all the remapping work and M2P updates. */
++                      mcl->op = __HYPERVISOR_mmu_update;
++                      mcl->args[0] = (unsigned long)np->rx_mmu;
++                      mcl->args[1] = mmu - np->rx_mmu;
++                      mcl->args[2] = 0;
++                      mcl->args[3] = DOMID_SELF;
++                      mcl++;
++                      rc = HYPERVISOR_multicall_check(
++                              np->rx_mcl, mcl - np->rx_mcl, NULL);
++                      BUG_ON(rc);
++              }
++      }
++
++      __skb_queue_purge(&free_list);
++
++      spin_unlock_bh(&np->rx_lock);
++}
++
++/*
++ * Release all posted RX buffers when the copying receive path is in use.
++ *
++ * Under rx_lock (bottom halves disabled), every slot with a valid grant
++ * reference is visited.  If foreign access can be ended, the reference is
++ * released, the slot returned to the freelist and the skb freed.
++ * Otherwise the slot is left untouched and counted as busy.
++ */
++static void netif_release_rx_bufs_copy(struct netfront_info *np)
++{
++      int slot;
++      int nr_busy = 0, nr_inuse = 0;
++
++      spin_lock_bh(&np->rx_lock);
++
++      for (slot = 0; slot < NET_RX_RING_SIZE; slot++) {
++              int gref = np->grant_rx_ref[slot];
++              struct sk_buff *skb;
++
++              if (gref == GRANT_INVALID_REF)
++                      continue;
++
++              nr_inuse++;
++
++              /* Fetch the skb before the slot goes onto the freelist. */
++              skb = np->rx_skbs[slot];
++
++              if (gnttab_end_foreign_access_ref(gref)) {
++                      gnttab_release_grant_reference(&np->gref_rx_head,
++                                                     gref);
++                      np->grant_rx_ref[slot] = GRANT_INVALID_REF;
++                      add_id_to_freelist(np->rx_skbs, slot);
++
++                      dev_kfree_skb(skb);
++              } else {
++                      nr_busy++;
++              }
++      }
++
++      if (nr_busy)
++              DPRINTK("%s: Unable to release %d of %d inuse grant references out of %ld total.\n",
++                      __FUNCTION__, nr_busy, nr_inuse, NET_RX_RING_SIZE);
++
++      spin_unlock_bh(&np->rx_lock);
++}
++
++/*
++ * Close handler: stop the transmit queue and disable NAPI polling.
++ * Always returns 0.
++ */
++static int network_close(struct net_device *dev)
++{
++      struct netfront_info *info = netdev_priv(dev);
++
++      netif_stop_queue(info->netdev);
++      napi_disable(&info->napi);
++
++      return 0;
++}
++
++
++/*
++ * Return the device statistics, after calling
++ * netfront_accelerator_call_get_stats() for the device.
++ */
++static struct net_device_stats *network_get_stats(struct net_device *dev)
++{
++      struct net_device_stats *stats = &dev->stats;
++
++      netfront_accelerator_call_get_stats(dev);
++
++      return stats;
++}
++
++/*
++ * Set the device MAC address from the struct sockaddr pointed to by @p.
++ *
++ * Returns -EBUSY while the interface is running, -EADDRNOTAVAIL if the
++ * address is not a valid Ethernet address, 0 on success.  On success the
++ * address is stored both in dev->dev_addr and in the private np->mac.
++ */
++static int xennet_set_mac_address(struct net_device *dev, void *p)
++{
++      struct sockaddr *sa = p;
++      struct netfront_info *info;
++
++      if (netif_running(dev))
++              return -EBUSY;
++
++      if (!is_valid_ether_addr(sa->sa_data))
++              return -EADDRNOTAVAIL;
++
++      info = netdev_priv(dev);
++
++      memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
++      memcpy(info->mac, sa->sa_data, ETH_ALEN);
++
++      return 0;
++}
++
++/*
++ * Change the device MTU.
++ *
++ * The upper limit is 65535 - ETH_HLEN when scatter/gather can be used and
++ * ETH_DATA_LEN otherwise.  Returns -EINVAL if @mtu exceeds the limit,
++ * 0 after storing the new value.
++ */
++static int xennet_change_mtu(struct net_device *dev, int mtu)
++{
++      int limit;
++
++      if (xennet_can_sg(dev))
++              limit = 65535 - ETH_HLEN;
++      else
++              limit = ETH_DATA_LEN;
++
++      if (mtu <= limit) {
++              dev->mtu = mtu;
++              return 0;
++      }
++
++      return -EINVAL;
++}
++
++/*
++ * Enable (@data != 0) or disable (@data == 0) scatter/gather.
++ *
++ * Enabling requires the other end to advertise a non-zero "feature-sg"
++ * key; otherwise -ENOSYS is returned.  Disabling clamps the MTU to
++ * ETH_DATA_LEN.  The final result comes from ethtool_op_set_sg().
++ */
++static int xennet_set_sg(struct net_device *dev, u32 data)
++{
++      if (!data) {
++              if (dev->mtu > ETH_DATA_LEN)
++                      dev->mtu = ETH_DATA_LEN;
++      } else {
++              struct netfront_info *np = netdev_priv(dev);
++              int backend_sg;
++
++              /* A failed read is treated like a zero value. */
++              if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
++                               "%d", &backend_sg) < 0)
++                      backend_sg = 0;
++
++              if (backend_sg == 0)
++                      return -ENOSYS;
++      }
++
++      return ethtool_op_set_sg(dev, data);
++}
++
++/*
++ * Enable (@data != 0) or disable (@data == 0) TCP segmentation offload.
++ *
++ * Enabling requires the other end to advertise a non-zero
++ * "feature-gso-tcpv4" key; otherwise -ENOSYS is returned.  The final
++ * result comes from ethtool_op_set_tso().
++ */
++static int xennet_set_tso(struct net_device *dev, u32 data)
++{
++      struct netfront_info *np;
++      int backend_tso;
++
++      if (!data)
++              return ethtool_op_set_tso(dev, data);
++
++      np = netdev_priv(dev);
++
++      /* A failed read is treated like a zero value. */
++      if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
++                       "feature-gso-tcpv4", "%d", &backend_tso) < 0)
++              backend_tso = 0;
++
++      if (backend_tso == 0)
++              return -ENOSYS;
++
++      return ethtool_op_set_tso(dev, data);
++}
++
++/*
++ * Reset and re-negotiate the offload features of @dev.
++ *
++ * GSO features and scatter/gather are switched off first.  Scatter/gather
++ * is then re-enabled only if NETIF_F_IP_CSUM is set, and TSO only if
++ * scatter/gather could be enabled.
++ */
++static void xennet_set_features(struct net_device *dev)
++{
++      dev_disable_gso_features(dev);
++      xennet_set_sg(dev, 0);
++
++      /* We need checksum offload to enable scatter/gather and TSO. */
++      if ((dev->features & NETIF_F_IP_CSUM) && !xennet_set_sg(dev, 1)) {
++              /* Before 2.6.9 TSO seems to be unreliable so do not enable it
++               * on older kernels.
++               */
++              if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9))
++                      xennet_set_tso(dev, 1);
++      }
++}
++
++/*
++ * Table of driver-private ethtool statistics.  'offset' is the position of
++ * the counter inside struct netfront_info, expressed in units of
++ * sizeof(long).
++ */
++static const struct xennet_stat {
++      char name[ETH_GSTRING_LEN];
++      u16 offset;
++} xennet_stats[] = {
++      {
++              .name   = "rx_gso_csum_fixups",
++              .offset = offsetof(struct netfront_info, rx_gso_csum_fixups)
++                        / sizeof(long),
++      },
++};
++
++/*
++ * Report how many strings the given string set contains: the number of
++ * xennet_stats entries for ETH_SS_STATS, -EOPNOTSUPP for any other set.
++ */
++static int xennet_get_sset_count(struct net_device *dev, int sset)
++{
++      if (sset == ETH_SS_STATS)
++              return ARRAY_SIZE(xennet_stats);
++
++      return -EOPNOTSUPP;
++}
++
++/*
++ * Copy the driver-private statistics into @data, one value per
++ * xennet_stats entry.  The private area is read as an array of unsigned
++ * long, indexed by each entry's offset.
++ */
++static void xennet_get_ethtool_stats(struct net_device *dev,
++                                   struct ethtool_stats *stats, u64 *data)
++{
++      const unsigned long *words = netdev_priv(dev);
++      unsigned int n;
++
++      for (n = 0; n < ARRAY_SIZE(xennet_stats); n++) {
++              const struct xennet_stat *st = &xennet_stats[n];
++
++              data[n] = words[st->offset];
++      }
++}
++
++/*
++ * Copy the names of the driver-private statistics into @data, each in its
++ * own ETH_GSTRING_LEN-sized slot.  Only ETH_SS_STATS is handled; any other
++ * string set leaves @data untouched.
++ */
++static void xennet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
++{
++      unsigned int n;
++
++      if (stringset != ETH_SS_STATS)
++              return;
++
++      for (n = 0; n < ARRAY_SIZE(xennet_stats); n++) {
++              memcpy(data, xennet_stats[n].name, ETH_GSTRING_LEN);
++              data += ETH_GSTRING_LEN;
++      }
++}
++
++/*
++ * Fill in the ethtool driver information: the driver name "netfront" and,
++ * as bus info, the name of the parent device.
++ *
++ * Both destination fields are fixed-size arrays inside
++ * struct ethtool_drvinfo, so the copies are bounded by the field size and
++ * always NUL-terminated; an over-long parent device name is truncated
++ * instead of overrunning the structure.
++ */
++static void netfront_get_drvinfo(struct net_device *dev,
++                               struct ethtool_drvinfo *info)
++{
++      strlcpy(info->driver, "netfront", sizeof(info->driver));
++      strlcpy(info->bus_info, dev_name(dev->dev.parent),
++              sizeof(info->bus_info));
++}
++/*
++ * (Re)establish the connection to the backend for @dev.
++ *
++ * Chooses between the copying and the flipping receive path from the
++ * module parameters and the backend's "feature-rx-copy" /
++ * "feature-rx-flip" keys, calls talk_to_backend(), and then, with
++ * rx_lock and tx_lock both held, releases pending TX buffers, re-posts
++ * the RX buffers still recorded in rx_skbs[] at the start of the RX ring
++ * and switches the carrier on.
++ *
++ * Returns 0 on success or the error returned by talk_to_backend().
++ */
++static int network_connect(struct net_device *dev)
++{
++      struct netfront_info *np = netdev_priv(dev);
++      int i, requeue_idx, err;
++      struct sk_buff *skb;
++      grant_ref_t ref;
++      netif_rx_request_t *req;
++      unsigned int feature_rx_copy, feature_rx_flip;
++      /*
++       * A key that cannot be read as exactly one "%u" value falls back
++       * to "copy not supported" / "flip supported".
++       */
++      err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
++                         "feature-rx-copy", "%u", &feature_rx_copy);
++      if (err != 1)
++              feature_rx_copy = 0;
++      err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
++                         "feature-rx-flip", "%u", &feature_rx_flip);
++      if (err != 1)
++              feature_rx_flip = 1;
++
++      /*
++       * Copy packets on receive path if:
++       *  (a) This was requested by user, and the backend supports it; or
++       *  (b) Flipping was requested, but this is unsupported by the backend.
++       */
++      np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) ||
++                              (MODPARM_rx_flip && !feature_rx_flip));
++
++      err = talk_to_backend(np->xbdev, np);
++      if (err)
++              return err;
++
++      xennet_set_features(dev);
++
++      DPRINTK("device %s has %sing receive path.\n",
++              dev->name, np->copying_receiver ? "copy" : "flipp");
++      /* rx_lock is taken first, then tx_lock; they are dropped in reverse order below. */
++      spin_lock_bh(&np->rx_lock);
++      spin_lock_irq(&np->tx_lock);
++
++      /*
++       * Recovery procedure:
++       *  NB. Freelist index entries are always going to be less than
++       *  PAGE_OFFSET, whereas pointers to skbs will always be equal or
++       *  greater than PAGE_OFFSET: we use this property to distinguish
++       *  them.
++       */
++
++      /* Step 1: Discard all pending TX packet fragments. */
++      netif_release_tx_bufs(np);
++
++      /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
++      for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
++              if (!np->rx_skbs[i])
++                      continue;
++
++              skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
++              ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
++              req = RING_GET_REQUEST(&np->rx, requeue_idx);
++
++              if (!np->copying_receiver) {
++                      gnttab_grant_foreign_transfer_ref(
++                              ref, np->xbdev->otherend_id,
++                              page_to_pfn(skb_shinfo(skb)->frags->page));
++              } else {
++                      gnttab_grant_foreign_access_ref(
++                              ref, np->xbdev->otherend_id,
++                              pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
++                                                     frags->page)),
++                              0);
++              }
++              req->gref = ref;
++              req->id   = requeue_idx;
++
++              requeue_idx++;
++      }
++
++      np->rx.req_prod_pvt = requeue_idx;
++
++      /*
++       * Step 3: All public and private state should now be sane.  Get
++       * ready to start sending and receiving packets and give the driver
++       * domain a kick because we've probably just requeued some
++       * packets.
++       */
++      netfront_carrier_on(np);
++      notify_remote_via_irq(np->irq);
++      network_tx_buf_gc(dev);
++      network_alloc_rx_buffers(dev);
++
++      spin_unlock_irq(&np->tx_lock);
++      spin_unlock_bh(&np->rx_lock);
++
++      return 0;
++}
++/*
++ * .ndo_uninit hook (see xennet_netdev_ops): release the pending TX
++ * buffers, then the RX buffers using the copy or the flip variant
++ * according to np->copying_receiver, and finally free the TX and RX
++ * grant reference pools.
++ */
++static void netif_uninit(struct net_device *dev)
++{
++      struct netfront_info *np = netdev_priv(dev);
++      netif_release_tx_bufs(np);
++      if (np->copying_receiver)
++              netif_release_rx_bufs_copy(np);
++      else
++              netif_release_rx_bufs_flip(np);
++      gnttab_free_grant_references(np->gref_tx_head);
++      gnttab_free_grant_references(np->gref_rx_head);
++}
++/*
++ * ethtool operations for netfront devices; attached to the net_device
++ * in create_netdev() through SET_ETHTOOL_OPS().  The TSO entries are
++ * only compiled in when HAVE_TSO is non-zero.
++ */
++static const struct ethtool_ops network_ethtool_ops =
++{
++      .get_drvinfo = netfront_get_drvinfo,
++      .get_tx_csum = ethtool_op_get_tx_csum,
++      .set_tx_csum = ethtool_op_set_tx_csum,
++      .get_sg = ethtool_op_get_sg,
++      .set_sg = xennet_set_sg,
++#if HAVE_TSO
++      .get_tso = ethtool_op_get_tso,
++      .set_tso = xennet_set_tso,
++#endif
++      .get_link = ethtool_op_get_link,
++
++      .get_sset_count = xennet_get_sset_count,
++      .get_ethtool_stats = xennet_get_ethtool_stats,
++      .get_strings = xennet_get_strings,
++};
++
++#ifdef CONFIG_SYSFS
++/* sysfs read handler for "rxbuf_min": prints rx_min_target in decimal. */
++static ssize_t show_rxbuf_min(struct device *dev,
++                            struct device_attribute *attr, char *buf)
++{
++      struct net_device *netdev = to_net_dev(dev);
++      struct netfront_info *np = netdev_priv(netdev);
++
++      return sprintf(buf, "%u\n", np->rx_min_target);
++}
++
++/*
++ * sysfs write handler for "rxbuf_min".
++ *
++ * Requires CAP_NET_ADMIN (-EPERM otherwise) and a leading number in @buf
++ * (-EBADMSG otherwise).  The value is forced into
++ * [RX_MIN_TARGET, RX_MAX_TARGET] and becomes the new minimum; the
++ * maximum and the current target are raised to it if they were lower.
++ * RX buffers are then replenished under rx_lock.  Returns @len.
++ */
++static ssize_t store_rxbuf_min(struct device *dev,
++                             struct device_attribute *attr,
++                             const char *buf, size_t len)
++{
++      struct net_device *nd = to_net_dev(dev);
++      struct netfront_info *info = netdev_priv(nd);
++      unsigned long want;
++      char *stop;
++
++      if (!capable(CAP_NET_ADMIN))
++              return -EPERM;
++
++      want = simple_strtoul(buf, &stop, 0);
++      if (stop == buf)
++              return -EBADMSG;
++
++      if (want < RX_MIN_TARGET)
++              want = RX_MIN_TARGET;
++      if (want > RX_MAX_TARGET)
++              want = RX_MAX_TARGET;
++
++      spin_lock_bh(&info->rx_lock);
++
++      info->rx_min_target = want;
++      if (info->rx_max_target < want)
++              info->rx_max_target = want;
++      if (info->rx_target < want)
++              info->rx_target = want;
++
++      network_alloc_rx_buffers(nd);
++
++      spin_unlock_bh(&info->rx_lock);
++
++      return len;
++}
++
++/* sysfs read handler for "rxbuf_max": prints rx_max_target in decimal. */
++static ssize_t show_rxbuf_max(struct device *dev,
++                            struct device_attribute *attr, char *buf)
++{
++      struct net_device *netdev = to_net_dev(dev);
++      struct netfront_info *np = netdev_priv(netdev);
++
++      return sprintf(buf, "%u\n", np->rx_max_target);
++}
++
++/*
++ * sysfs write handler for "rxbuf_max".
++ *
++ * Requires CAP_NET_ADMIN (-EPERM otherwise) and a leading number in @buf
++ * (-EBADMSG otherwise).  The value is forced into
++ * [RX_MIN_TARGET, RX_MAX_TARGET] and becomes the new maximum; the
++ * minimum and the current target are lowered to it if they were higher.
++ * RX buffers are then replenished under rx_lock.  Returns @len.
++ */
++static ssize_t store_rxbuf_max(struct device *dev,
++                             struct device_attribute *attr,
++                             const char *buf, size_t len)
++{
++      struct net_device *nd = to_net_dev(dev);
++      struct netfront_info *info = netdev_priv(nd);
++      unsigned long want;
++      char *stop;
++
++      if (!capable(CAP_NET_ADMIN))
++              return -EPERM;
++
++      want = simple_strtoul(buf, &stop, 0);
++      if (stop == buf)
++              return -EBADMSG;
++
++      if (want < RX_MIN_TARGET)
++              want = RX_MIN_TARGET;
++      if (want > RX_MAX_TARGET)
++              want = RX_MAX_TARGET;
++
++      spin_lock_bh(&info->rx_lock);
++
++      info->rx_max_target = want;
++      if (info->rx_min_target > want)
++              info->rx_min_target = want;
++      if (info->rx_target > want)
++              info->rx_target = want;
++
++      network_alloc_rx_buffers(nd);
++
++      spin_unlock_bh(&info->rx_lock);
++
++      return len;
++}
++
++/* sysfs read handler for "rxbuf_cur": prints rx_target in decimal. */
++static ssize_t show_rxbuf_cur(struct device *dev,
++                            struct device_attribute *attr, char *buf)
++{
++      struct net_device *netdev = to_net_dev(dev);
++      struct netfront_info *np = netdev_priv(netdev);
++
++      return sprintf(buf, "%u\n", np->rx_target);
++}
++/*
++ * sysfs attributes created per net_device by xennet_sysfs_addif():
++ * rxbuf_min and rxbuf_max are world-readable and owner-writable,
++ * rxbuf_cur is read-only (no store handler).
++ */
++static struct device_attribute xennet_attrs[] = {
++      __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
++      __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
++      __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
++};
++
++/*
++ * Create every xennet_attrs[] file on @netdev's device.
++ *
++ * On the first failure the files created so far are removed again (most
++ * recent first) and that error is returned; returns 0 when all files
++ * were created.
++ */
++static int xennet_sysfs_addif(struct net_device *netdev)
++{
++      int error = 0;
++      int n;
++
++      for (n = 0; n < ARRAY_SIZE(xennet_attrs); n++) {
++              error = device_create_file(&netdev->dev, &xennet_attrs[n]);
++              if (error)
++                      break;
++      }
++      if (!error)
++              return 0;
++
++      /* n is the index that failed; unwind everything below it. */
++      while (n-- > 0)
++              device_remove_file(&netdev->dev, &xennet_attrs[n]);
++
++      return error;
++}
++
++/* Remove every xennet_attrs[] file from @netdev's device. */
++static void xennet_sysfs_delif(struct net_device *netdev)
++{
++      int n = 0;
++
++      while (n < ARRAY_SIZE(xennet_attrs)) {
++              device_remove_file(&netdev->dev, &xennet_attrs[n]);
++              n++;
++      }
++}
++
++#endif /* CONFIG_SYSFS */
++
++
++/*
++ * .ndo_set_multicast_list hook (see xennet_netdev_ops); intentionally
++ * empty.  Nothing to do here: the virtual interface is point-to-point
++ * and the physical interface is probably promiscuous anyway.
++ */
++static void network_set_multicast_list(struct net_device *dev)
++{
++}
++/* net_device operations for netfront devices; installed by create_netdev(). */
++static const struct net_device_ops xennet_netdev_ops = {
++      .ndo_uninit             = netif_uninit,
++      .ndo_open               = network_open,
++      .ndo_stop               = network_close,
++      .ndo_start_xmit         = network_start_xmit,
++      .ndo_set_multicast_list = network_set_multicast_list,
++      .ndo_set_mac_address    = xennet_set_mac_address,
++      .ndo_validate_addr      = eth_validate_addr,
++      .ndo_change_mtu         = xennet_change_mtu,
++      .ndo_get_stats          = network_get_stats,
++};
++/*
++ * Allocate and initialise a net_device, with a struct netfront_info as
++ * its private area, for xenbus device @dev.
++ *
++ * Sets up the locks, accelerator state, RX refill timer, the TX free
++ * chain and the RX slot table, allocates the TX and RX grant reference
++ * pools, installs the netdev/ethtool operations and NAPI poll handler,
++ * and leaves the carrier off.  The net_device is not registered here.
++ *
++ * Returns the new net_device, or ERR_PTR(-ENOMEM) if the etherdev or
++ * either grant reference pool cannot be allocated.
++ */
++static struct net_device * __devinit create_netdev(struct xenbus_device *dev)
++{
++      int i, err = 0;
++      struct net_device *netdev = NULL;
++      struct netfront_info *np = NULL;
++
++      netdev = alloc_etherdev(sizeof(struct netfront_info));
++      if (!netdev) {
++              pr_warning("%s: alloc_etherdev failed\n", __FUNCTION__);
++              return ERR_PTR(-ENOMEM);
++      }
++
++      np                   = netdev_priv(netdev);
++      np->xbdev            = dev;
++
++      spin_lock_init(&np->tx_lock);
++      spin_lock_init(&np->rx_lock);
++
++      init_accelerator_vif(np, dev);
++
++      skb_queue_head_init(&np->rx_batch);
++      np->rx_target     = RX_DFL_MIN_TARGET;
++      np->rx_min_target = RX_DFL_MIN_TARGET;
++      np->rx_max_target = RX_MAX_TARGET;
++
++      init_timer(&np->rx_refill_timer);
++      np->rx_refill_timer.data = (unsigned long)netdev;
++      np->rx_refill_timer.function = rx_refill_timeout;
++
++      /*
++       * Initialise {tx,rx}_skbs as a free chain containing every entry.
++       * tx_skbs[] and grant_tx_ref[] are declared with
++       * NET_TX_RING_SIZE + 1 slots, so the inclusive bound stays in range;
++       * each TX slot stores the index of the next slot, cast to a pointer.
++       */
++      for (i = 0; i <= NET_TX_RING_SIZE; i++) {
++              np->tx_skbs[i] = (void *)((unsigned long) i+1);
++              np->grant_tx_ref[i] = GRANT_INVALID_REF;
++      }
++
++      for (i = 0; i < NET_RX_RING_SIZE; i++) {
++              np->rx_skbs[i] = NULL;
++              np->grant_rx_ref[i] = GRANT_INVALID_REF;
++      }
++
++      /* A grant for every tx ring slot */
++      if (gnttab_alloc_grant_references(TX_MAX_TARGET,
++                                        &np->gref_tx_head) < 0) {
++              pr_alert("#### netfront can't alloc tx grant refs\n");
++              err = -ENOMEM;
++              goto exit;
++      }
++      /* A grant for every rx ring slot */
++      if (gnttab_alloc_grant_references(RX_MAX_TARGET,
++                                        &np->gref_rx_head) < 0) {
++              pr_alert("#### netfront can't alloc rx grant refs\n");
++              err = -ENOMEM;
++              goto exit_free_tx;
++      }
++
++      netdev->netdev_ops      = &xennet_netdev_ops;
++      netif_napi_add(netdev, &np->napi, netif_poll, 64);
++      netdev->features        = NETIF_F_IP_CSUM;
++
++      SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
++      SET_NETDEV_DEV(netdev, &dev->dev);
++
++      np->netdev = netdev;
++
++      netfront_carrier_off(np);
++
++      return netdev;
++
++ exit_free_tx:
++      gnttab_free_grant_references(np->gref_tx_head);
++ exit:
++      free_netdev(netdev);
++      return ERR_PTR(err);
++}
++/*
++ * Detach @info from its backend: switch the carrier off under both
++ * locks, unbind the interrupt handler if an irq is bound, end foreign
++ * access to the TX and RX shared ring pages, and reset the ring
++ * references and ring pointers to their "not connected" values.
++ */
++static void netif_disconnect_backend(struct netfront_info *info)
++{
++      /* Stop old i/f to prevent errors whilst we rebuild the state. */
++      spin_lock_bh(&info->rx_lock);
++      spin_lock_irq(&info->tx_lock);
++      netfront_carrier_off(info);
++      spin_unlock_irq(&info->tx_lock);
++      spin_unlock_bh(&info->rx_lock);
++
++      if (info->irq)
++              unbind_from_irqhandler(info->irq, info->netdev);
++      info->irq = 0;
++
++      end_access(info->tx_ring_ref, info->tx.sring);
++      end_access(info->rx_ring_ref, info->rx.sring);
++      info->tx_ring_ref = GRANT_INVALID_REF;
++      info->rx_ring_ref = GRANT_INVALID_REF;
++      info->tx.sring = NULL;
++      info->rx.sring = NULL;
++}
++
++
++/*
++ * End foreign access for grant @ref covering @page; does nothing when
++ * @ref is GRANT_INVALID_REF.
++ */
++static void end_access(int ref, void *page)
++{
++      if (ref == GRANT_INVALID_REF)
++              return;
++
++      gnttab_end_foreign_access(ref, (unsigned long)page);
++}
++
++
++/* ** Driver registration ** */
++
++/*
++ * xenbus device types handled by this driver: only "vif".  The
++ * empty-string entry presumably terminates the table - confirm against
++ * the xenbus core.
++ */
++static const struct xenbus_device_id netfront_ids[] = {
++      { "vif" },
++      { "" }
++};
++MODULE_ALIAS("xen:vif");
++
++/*
++ * xenbus frontend driver descriptor; registered in netif_init() and
++ * unregistered in netif_exit().
++ */
++static struct xenbus_driver netfront_driver = {
++      .name = "vif",
++      .ids = netfront_ids,
++      .probe = netfront_probe,
++      .remove = __devexit_p(netfront_remove),
++      .suspend = netfront_suspend,
++      .suspend_cancel = netfront_suspend_cancel,
++      .resume = netfront_resume,
++      .otherend_changed = backend_changed,
++};
++
++
++/*
++ * Module entry point.
++ *
++ * Returns -ENODEV when not running on Xen and, under CONFIG_XEN,
++ * -EINVAL when both rx_copy and rx_flip were requested (copying is the
++ * default when neither was).  Otherwise initialises accelerator support
++ * and registers the xenbus frontend driver, returning the registration
++ * result.
++ *
++ * If registration fails the accelerator setup is torn down again, using
++ * the same netif_exit_accel() call that netif_exit() pairs with
++ * netif_init_accel(), so a failed load leaves nothing behind.
++ */
++static int __init netif_init(void)
++{
++      int err;
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++#ifdef CONFIG_XEN
++      if (MODPARM_rx_flip && MODPARM_rx_copy) {
++              WPRINTK("Cannot specify both rx_copy and rx_flip.\n");
++              return -EINVAL;
++      }
++
++      if (!MODPARM_rx_flip && !MODPARM_rx_copy)
++              MODPARM_rx_copy = 1; /* Default is to copy. */
++#endif
++
++      netif_init_accel();
++
++      IPRINTK("Initialising virtual ethernet driver.\n");
++
++      err = xenbus_register_frontend(&netfront_driver);
++      if (err)
++              netif_exit_accel();
++
++      return err;
++}
++module_init(netif_init);
++
++/*
++ * Module exit: unregister the xenbus frontend driver first, then tear
++ * down accelerator support.
++ */
++static void __exit netif_exit(void)
++{
++      xenbus_unregister_driver(&netfront_driver);
++
++      netif_exit_accel();
++}
++module_exit(netif_exit);
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/netfront/netfront.h

index 0000000,0000000..3fd2e97

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/netfront/netfront.h
@@@ -1,0 -1,0 +1,277 @@@
++/******************************************************************************
++ * Virtual network driver for conversing with remote driver backends.
++ *
++ * Copyright (c) 2002-2005, K A Fraser
++ * Copyright (c) 2005, XenSource Ltd
++ * Copyright (C) 2007 Solarflare Communications, Inc.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef NETFRONT_H
++#define NETFRONT_H
++
++#include <xen/interface/io/netif.h>
++#include <linux/slab.h>
++#include <linux/netdevice.h>
++#include <linux/skbuff.h>
++#include <linux/list.h>
++/*
++ * Entry counts of the TX and RX rings as computed by __CONST_RING_SIZE
++ * for PAGE_SIZE bytes (presumably one page per shared ring - confirm).
++ */
++#define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGE_SIZE)
++#define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGE_SIZE)
++
++#include <xen/xenbus.h>
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++/*
++ * Function pointer table for hooks into a network acceleration
++ * plugin.  These are called at appropriate points from the netfront
++ * driver.
++ */
++struct netfront_accel_hooks {
++      /*
++       * new_device: Accelerator hook to ask the plugin to support a
++       * new network interface
++       */
++      int (*new_device)(struct net_device *net_dev, struct xenbus_device *dev);
++      /*
++       * remove: Opposite of new_device
++       */
++      int (*remove)(struct xenbus_device *dev);
++      /*
++       * netdev_poll: The net_device is being polled, check the
++       * accelerated hardware for any pending packets
++       */
++      int (*netdev_poll)(struct net_device *dev, int *pbudget);
++      /*
++       * start_xmit: Used to give the accelerated plugin the option
++       * of sending a packet.  Returns non-zero if has done so, or
++       * zero to decline and force the packet onto normal send
++       * path
++       */
++      int (*start_xmit)(struct sk_buff *skb, struct net_device *dev);
++      /*
++       * start_napi_irq/stop_napi_irq: Used by netfront to indicate
++       * when napi interrupts should be enabled or disabled
++       */
++      int (*start_napi_irq)(struct net_device *dev);
++      void (*stop_napi_irq)(struct net_device *dev);
++      /*
++       * check_ready: Called before re-enabling the TX queue to check
++       * the fast path has slots too
++       */
++      int (*check_ready)(struct net_device *dev);
++      /*
++       * get_stats: Get the fastpath network statistics
++       */
++      int (*get_stats)(struct net_device *dev,
++                       struct net_device_stats *stats);
++};
++
++
++/* Version of API/protocol for communication between netfront and
++   acceleration plugin supported */
++#define NETFRONT_ACCEL_VERSION 0x00010003
++
++/*
++ * Per-netfront device state for the accelerator.  This is used to
++ * allow efficient per-netfront device access to the accelerator
++ * hooks.  One instance is embedded in each struct netfront_info
++ * (member accel_vif_state).
++ */
++struct netfront_accel_vif_state {
++      struct list_head link; /* presumably the entry in netfront_accelerator.vif_states - confirm */
++      /*
++       * The xenbus device, the netfront_info and the plugin hook table
++       * associated with this state (presumably np points back at the
++       * netfront_info embedding this struct - confirm).
++       */
++      struct xenbus_device *dev;
++      struct netfront_info *np;
++      struct netfront_accel_hooks *hooks;
++
++      /* Watch on the accelerator configuration value */
++      struct xenbus_watch accel_watch;
++      /* Work item to process change in accelerator */
++      struct work_struct accel_work;
++      /* The string from xenbus last time accel_watch fired */
++      char *accel_frontend;
++}; 
++
++/*
++ * Per-accelerator state stored in netfront.  These form a list that
++ * is used to track which devices are accelerated by which plugins,
++ * and what plugins are available/have been requested.
++ */
++struct netfront_accelerator {
++      /* Used to make a list */
++      struct list_head link;
++      /* ID of the accelerator */
++      int id;
++      /*
++       * String describing the accelerator.  Currently this is the
++       * name of the accelerator module.  This is provided by the
++       * backend accelerator through xenstore.
++       */
++      char *frontend;
++      /* The hooks into the accelerator plugin module */
++      struct netfront_accel_hooks *hooks;
++
++      /*
++       * List of per-netfront device state (struct
++       * netfront_accel_vif_state) for each netfront device that is
++       * using this accelerator
++       */
++      struct list_head vif_states;
++      spinlock_t vif_states_lock; /* presumably guards vif_states - confirm in accel.c */
++};
++
++struct netfront_info {
++      struct list_head list;
++      struct net_device *netdev;
++
++      struct netif_tx_front_ring tx;
++      struct netif_rx_front_ring rx;
++
++      spinlock_t   tx_lock;
++      spinlock_t   rx_lock;
++
++      struct napi_struct      napi;
++
++      unsigned int irq;
++      unsigned int copying_receiver;
++      unsigned int carrier;
++
++      /* Receive-ring batched refills. */
++#define RX_MIN_TARGET 8
++#define RX_DFL_MIN_TARGET 64
++#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
++      unsigned rx_min_target, rx_max_target, rx_target;
++      struct sk_buff_head rx_batch;
++
++      struct timer_list rx_refill_timer;
++
++      /*
++       * {tx,rx}_skbs store outstanding skbuffs. The first entry in tx_skbs
++       * is an index into a chain of free entries.
++       */
++      struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
++      struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
++
++#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
++      grant_ref_t gref_tx_head;
++      grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
++      grant_ref_t gref_rx_head;
++      grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
++
++      struct xenbus_device *xbdev;
++      int tx_ring_ref;
++      int rx_ring_ref;
++      u8 mac[ETH_ALEN];
++
++      unsigned long rx_pfn_array[NET_RX_RING_SIZE];
++      struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
++      struct mmu_update rx_mmu[NET_RX_RING_SIZE];
++
++      /* Statistics */
++      unsigned long rx_gso_csum_fixups;
++
++      /* Private pointer to state internal to accelerator module */
++      void *accel_priv;
++      /* The accelerator used by this netfront device */
++      struct netfront_accelerator *accelerator;
++      /* The accelerator state for this netfront device */
++      struct netfront_accel_vif_state accel_vif_state;
++};
++
++
++/* Exported Functions */
++
++/*
++ * Called by an accelerator plugin module when it has loaded.
++ *
++ * version: the version of API between frontend and plugin requested
++ * frontend: the string describing the accelerator, currently the module name
++ * hooks: the hooks for netfront to use to call into the accelerator
++ *
++ * return: 0 on success, <0 on error, >0 (with version supported) on
++ * version mismatch
++ */
++extern int netfront_accelerator_loaded(int version, const char *frontend, 
++                                     struct netfront_accel_hooks *hooks);
++
++/* 
++ * Called by an accelerator plugin module when it is about to unload.
++ *
++ * frontend: the string describing the accelerator.  Must match the
++ * one passed to netfront_accelerator_loaded()
++ */ 
++extern void netfront_accelerator_stop(const char *frontend);
++
++/* 
++ * Called by an accelerator before waking the net device's TX queue to
++ * ensure the slow path has available slots.  Returns true if OK to
++ * wake, false if still busy 
++ */
++extern int netfront_check_queue_ready(struct net_device *net_dev);
++
++
++/* Internal-to-netfront Functions */
++
++/* 
++ * Call into accelerator and check to see if it has tx space before we
++ * wake the net device's TX queue.  Returns true if OK to wake, false
++ * if still busy
++ */ 
++extern 
++int netfront_check_accelerator_queue_ready(struct net_device *dev,
++                                         struct netfront_info *np);
++extern
++int netfront_accelerator_call_remove(struct netfront_info *np,
++                                   struct xenbus_device *dev);
++extern
++int netfront_accelerator_suspend(struct netfront_info *np,
++                               struct xenbus_device *dev);
++extern
++int netfront_accelerator_suspend_cancel(struct netfront_info *np,
++                                      struct xenbus_device *dev);
++extern
++void netfront_accelerator_resume(struct netfront_info *np,
++                               struct xenbus_device *dev);
++extern
++void netfront_accelerator_call_stop_napi_irq(struct netfront_info *np,
++                                           struct net_device *dev);
++extern
++int netfront_accelerator_call_get_stats(struct net_device *dev);
++extern
++void netfront_accelerator_add_watch(struct netfront_info *np);
++
++extern
++void netif_init_accel(void);
++extern
++void netif_exit_accel(void);
++
++extern
++void init_accelerator_vif(struct netfront_info *np,
++                        struct xenbus_device *dev);
++#endif /* NETFRONT_H */
diff --cc drivers/xen/pciback/Makefile

index 0000000,0000000..ee107aa

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/Makefile
@@@ -1,0 -1,0 +1,15 @@@
++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback.o
++# Objects always linked into pciback.o; optional ones follow, each keyed on its own config symbol.
++pciback-y := pci_stub.o pciback_ops.o xenbus.o
++pciback-y += conf_space.o conf_space_header.o \
++           conf_space_capability.o \
++           conf_space_capability_vpd.o \
++           conf_space_capability_pm.o \
++             conf_space_quirks.o
++pciback-$(CONFIG_PCI_MSI) += conf_space_capability_msi.o
++pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
++pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
++pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
++pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o
++# CONFIG_XEN_PCIDEV_BE_DEBUG adds -DDEBUG to the compiler flags used by this Makefile.
++ccflags-$(CONFIG_XEN_PCIDEV_BE_DEBUG) += -DDEBUG
diff --cc drivers/xen/pciback/conf_space.c

index 0000000,0000000..0c76db1

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/conf_space.c
@@@ -1,0 -1,0 +1,435 @@@
++/*
++ * PCI Backend - Functions for creating a virtual configuration space for
++ *               exported PCI Devices.
++ *               It's dangerous to allow PCI Driver Domains to change their
++ *               device's resources (memory, i/o ports, interrupts). We need to
++ *               restrict changes to certain PCI Configuration registers:
++ *               BARs, INTERRUPT_PIN, most registers in the header...
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
++/*
++ * Module-wide "permissive" switch, exposed as a module parameter with
++ * mode 0644.  pciback_config_write() consults it together with the
++ * per-device dev_data->permissive flag for writes that no config field
++ * handled.
++ */
++static int permissive;
++module_param(permissive, bool, 0644);
++/*
++ * Generate pciback_<op>_config_<size>(): a thin wrapper around
++ * pci_<op>_config_<size>() that accepts an extra @data argument and
++ * ignores it.  Instantiated below for byte/word/dword reads and writes.
++ */
++#define DEFINE_PCI_CONFIG(op,size,type)                       \
++int pciback_##op##_config_##size                              \
++(struct pci_dev *dev, int offset, type value, void *data)     \
++{                                                             \
++      return pci_##op##_config_##size (dev, offset, value);   \
++}
++
++DEFINE_PCI_CONFIG(read, byte, u8 *)
++DEFINE_PCI_CONFIG(read, word, u16 *)
++DEFINE_PCI_CONFIG(read, dword, u32 *)
++
++DEFINE_PCI_CONFIG(write, byte, u8)
++DEFINE_PCI_CONFIG(write, word, u16)
++DEFINE_PCI_CONFIG(write, dword, u32)
++
++/*
++ * Read one virtual config field through its size-specific read handler.
++ *
++ * *@value is zeroed first.  The handler matching the field's size (1, 2
++ * or 4 bytes) is called if it is set; a missing handler or any other
++ * size leaves *@value at 0.  Returns the handler's result, or 0 when no
++ * handler ran.
++ */
++static int conf_space_read(struct pci_dev *dev,
++                         const struct config_field_entry *entry,
++                         int offset, u32 *value)
++{
++      const struct config_field *fld = entry->field;
++      int rc = 0;
++
++      *value = 0;
++
++      if (fld->size == 1) {
++              if (fld->u.b.read)
++                      rc = fld->u.b.read(dev, offset, (u8 *) value,
++                                         entry->data);
++      } else if (fld->size == 2) {
++              if (fld->u.w.read)
++                      rc = fld->u.w.read(dev, offset, (u16 *) value,
++                                         entry->data);
++      } else if (fld->size == 4) {
++              if (fld->u.dw.read)
++                      rc = fld->u.dw.read(dev, offset, value, entry->data);
++      }
++
++      return rc;
++}
++
++/*
++ * Write one virtual config field through its size-specific write
++ * handler.
++ *
++ * @value is narrowed to the field's size (1, 2 or 4 bytes) and passed to
++ * the matching handler if it is set.  Returns the handler's result, or 0
++ * when no handler ran (handler missing or size not 1/2/4).
++ */
++static int conf_space_write(struct pci_dev *dev,
++                          const struct config_field_entry *entry,
++                          int offset, u32 value)
++{
++      const struct config_field *fld = entry->field;
++      int rc = 0;
++
++      if (fld->size == 1) {
++              if (fld->u.b.write)
++                      rc = fld->u.b.write(dev, offset, (u8) value,
++                                          entry->data);
++      } else if (fld->size == 2) {
++              if (fld->u.w.write)
++                      rc = fld->u.w.write(dev, offset, (u16) value,
++                                          entry->data);
++      } else if (fld->size == 4) {
++              if (fld->u.dw.write)
++                      rc = fld->u.dw.write(dev, offset, value,
++                                           entry->data);
++      }
++
++      return rc;
++}
++
++/*
++ * Bit mask covering @size bytes: 0xff for 1, 0xffff for 2 and
++ * 0xffffffff for every other size.
++ */
++static inline u32 get_mask(int size)
++{
++      switch (size) {
++      case 1:
++              return 0xff;
++      case 2:
++              return 0xffff;
++      default:
++              return 0xffffffff;
++      }
++}
++
++/*
++ * Return 1 when @size is 1, 2 or 4 and @offset is a multiple of @size
++ * (no un-aligned requests), 0 otherwise.
++ */
++static inline int valid_request(int offset, int size)
++{
++      /* Reject odd sizes first so the modulo below never sees size 0. */
++      if (size != 1 && size != 2 && size != 4)
++              return 0;
++
++      return (offset % size) == 0;
++}
++
++/*
++ * Overlay @new_val onto @val.
++ *
++ * @new_val and @new_val_mask are shifted by @offset bytes (left for
++ * offset >= 0, right for a negative offset); the bits selected by the
++ * shifted mask are then taken from the shifted @new_val and all other
++ * bits from @val.
++ */
++static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
++                            int offset)
++{
++      u32 mask = new_val_mask;
++      u32 bits = new_val;
++
++      if (offset < 0) {
++              mask >>= (offset * -8);
++              bits >>= (offset * -8);
++      } else {
++              mask <<= (offset * 8);
++              bits <<= (offset * 8);
++      }
++
++      return (val & ~mask) | (bits & mask);
++}
++
++/*
++ * Translate a PCIBIOS_* status into the matching XEN_PCI_ERR_* code.
++ * Values that are not one of the PCIBIOS_* codes listed here are
++ * returned unchanged.
++ */
++static int pcibios_err_to_errno(int err)
++{
++      if (err == PCIBIOS_SUCCESSFUL)
++              return XEN_PCI_ERR_success;
++      if (err == PCIBIOS_DEVICE_NOT_FOUND)
++              return XEN_PCI_ERR_dev_not_found;
++      if (err == PCIBIOS_BAD_REGISTER_NUMBER)
++              return XEN_PCI_ERR_invalid_offset;
++      if (err == PCIBIOS_FUNC_NOT_SUPPORTED)
++              return XEN_PCI_ERR_not_implemented;
++      if (err == PCIBIOS_SET_FAILED)
++              return XEN_PCI_ERR_access_denied;
++
++      return err;
++}
++
++/*
++ * Read @size bytes at @offset from the virtual configuration space of
++ * @dev.
++ *
++ * The real register contents are read first; every config field that
++ * overlaps the request is then read through its handler and merged over
++ * the raw value.  *@ret_val always receives the result, which is 0 if
++ * the request is invalid and may be only partially merged if a field
++ * handler fails.
++ *
++ * Returns a XEN_PCI_ERR_* code: XEN_PCI_ERR_invalid_offset for a size
++ * other than 1/2/4 or an unaligned offset, otherwise the translated
++ * status of the last read performed.
++ */
++int pciback_config_read(struct pci_dev *dev, int offset, int size,
++                      u32 * ret_val)
++{
++      int err = 0;
++      struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++      const struct config_field_entry *cfg_entry;
++      const struct config_field *field;
++      int req_start, req_end, field_start, field_end;
++      /* if read fails for any reason, return 0 (as if device didn't respond) */
++      u32 value = 0, tmp_val;
++
++      if (unlikely(verbose_request))
++              printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
++                     pci_name(dev), size, offset);
++
++      if (!valid_request(offset, size)) {
++              err = XEN_PCI_ERR_invalid_offset;
++              goto out;
++      }
++
++      /* Get the real value first, then modify as appropriate */
++      switch (size) {
++      case 1:
++              err = pci_read_config_byte(dev, offset, (u8 *) & value);
++              break;
++      case 2:
++              err = pci_read_config_word(dev, offset, (u16 *) & value);
++              break;
++      case 4:
++              err = pci_read_config_dword(dev, offset, &value);
++              break;
++      }
++
++      req_start = offset;
++      req_end = offset + size;
++
++      list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++              field = cfg_entry->field;
++
++              field_start = OFFSET(cfg_entry);
++              field_end = OFFSET(cfg_entry) + field->size;
++
++              /*
++               * Standard half-open interval overlap test.  Unlike testing
++               * whether either end of the request falls inside the field,
++               * this also catches a field lying strictly inside the
++               * request (e.g. a one-byte field in the middle of a dword
++               * read), which must not expose the raw hardware value.
++               */
++              if (req_end > field_start && field_end > req_start) {
++                      err = conf_space_read(dev, cfg_entry, field_start,
++                                            &tmp_val);
++                      if (err)
++                              goto out;
++
++                      value = merge_value(value, tmp_val,
++                                          get_mask(field->size),
++                                          field_start - req_start);
++              }
++      }
++
++      out:
++      if (unlikely(verbose_request))
++              printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
++                     pci_name(dev), size, offset, value);
++
++      *ret_val = value;
++      return pcibios_err_to_errno(err);
++}
++
++/*
++ * Apply a write of @size bytes of @value at config-space @offset of @dev.
++ *
++ * Every registered virtual field that overlaps the request is read back in
++ * full, merged with the relevant bytes of @value and handed to the field's
++ * write handler.  If no field overlaps the request, the write reaches the
++ * real device only when the device (or the module-wide) permissive flag is
++ * set; otherwise it is dropped and a warning is logged once per device.
++ *
++ * Returns XEN_PCI_ERR_invalid_offset if valid_request() rejects the
++ * request, otherwise the accumulated status run through
++ * pcibios_err_to_errno().
++ */
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
++{
++      int err = 0, handled = 0;
++      struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++      const struct config_field_entry *cfg_entry;
++      const struct config_field *field;
++      u32 tmp_val;
++      int req_start, req_end, field_start, field_end;
++
++      if (unlikely(verbose_request))
++              printk(KERN_DEBUG
++                     "pciback: %s: write request %d bytes at 0x%x = %x\n",
++                     pci_name(dev), size, offset, value);
++
++      if (!valid_request(offset, size))
++              return XEN_PCI_ERR_invalid_offset;
++
++      list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++              field = cfg_entry->field;
++
++              req_start = offset;
++              req_end = offset + size;
++              field_start = OFFSET(cfg_entry);
++              field_end = OFFSET(cfg_entry) + field->size;
++
++              /* Two half-open ranges [req_start, req_end) and
++               * [field_start, field_end) intersect iff each one starts
++               * before the other one ends.  Testing only whether an
++               * endpoint of the request falls inside the field misses a
++               * field that lies strictly inside the request (e.g. a
++               * one-byte field in the middle of a dword write), which
++               * would let such a write bypass the field's handler.
++               */
++              if (req_end > field_start && field_end > req_start) {
++                      tmp_val = 0;
++
++                      err = pciback_config_read(dev, field_start,
++                                                field->size, &tmp_val);
++                      if (err)
++                              break;
++
++                      /* The byte offset is negative when the request
++                       * starts before the field. */
++                      tmp_val = merge_value(tmp_val, value, get_mask(size),
++                                            req_start - field_start);
++
++                      err = conf_space_write(dev, cfg_entry, field_start,
++                                             tmp_val);
++
++                      /* handled is set true here, but not every byte
++                       * may have been written! Properly detecting if
++                       * every byte is handled is unnecessary as the
++                       * flag is used to detect devices that need
++                       * special helpers to work correctly.
++                       */
++                      handled = 1;
++              }
++      }
++
++      if (!handled && !err) {
++              /* By default, anything not specifically handled above is
++               * read-only. The permissive flag changes this behavior so
++               * that anything not specifically handled above is writable.
++               * This means that some fields may still be read-only because
++               * they have entries in the config_field list that intercept
++               * the write and do nothing. */
++              if (dev_data->permissive || permissive) {
++                      switch (size) {
++                      case 1:
++                              err = pci_write_config_byte(dev, offset,
++                                                          (u8) value);
++                              break;
++                      case 2:
++                              err = pci_write_config_word(dev, offset,
++                                                          (u16) value);
++                              break;
++                      case 4:
++                              err = pci_write_config_dword(dev, offset,
++                                                           (u32) value);
++                              break;
++                      }
++              } else if (!dev_data->warned_on_write) {
++                      dev_data->warned_on_write = 1;
++                      dev_warn(&dev->dev, "Driver tried to write to a "
++                               "read-only configuration space field at offset "
++                               "0x%x, size %d. This may be harmless, but if "
++                               "you have problems with your device:\n"
++                               "1) see permissive attribute in sysfs\n"
++                               "2) report problems to the xen-devel "
++                               "mailing list along with details of your "
++                               "device obtained from lspci.\n", offset, size);
++              }
++      }
++
++      return pcibios_err_to_errno(err);
++}
++
++/*
++ * Remove every config-field entry whose field descriptor provides a ->clean
++ * hook: the hook is run on the descriptor, then the entry's private data
++ * and the entry itself are freed.  Entries without ->clean stay listed.
++ */
++void pciback_config_free_dyn_fields(struct pci_dev *dev)
++{
++      struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++      struct config_field_entry *entry, *next;
++
++      dev_dbg(&dev->dev,
++              "free-ing dynamically allocated virtual configuration space fields\n");
++      if (!dev_data)
++              return;
++
++      /* _safe iteration: entries are unlinked while walking the list */
++      list_for_each_entry_safe(entry, next, &dev_data->config_fields, list) {
++              const struct config_field *field = entry->field;
++
++              if (!field->clean)
++                      continue;
++
++              /* ->clean takes a non-const descriptor */
++              field->clean((struct config_field *)field);
++
++              /* kfree(NULL) is a no-op, so no NULL guard is needed */
++              kfree(entry->data);
++
++              list_del(&entry->list);
++              kfree(entry);
++      }
++}
++
++/*
++ * Invoke the ->reset hook of every registered config field of @dev that
++ * has one, passing the field's absolute offset and its private data.
++ */
++void pciback_config_reset_dev(struct pci_dev *dev)
++{
++      struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++      const struct config_field_entry *entry;
++
++      dev_dbg(&dev->dev, "resetting virtual configuration space\n");
++      if (!dev_data)
++              return;
++
++      list_for_each_entry(entry, &dev_data->config_fields, list) {
++              if (!entry->field->reset)
++                      continue;
++
++              entry->field->reset(dev, OFFSET(entry), entry->data);
++      }
++}
++
++/*
++ * Tear down the whole virtual configuration space of @dev: every entry is
++ * unlinked, handed to its field's ->release hook (if there is one) and
++ * then freed.  Does nothing when the device has no driver data.
++ */
++void pciback_config_free_dev(struct pci_dev *dev)
++{
++      struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++      struct config_field_entry *cfg_entry, *t;
++      const struct config_field *field;
++
++      dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
++      if (!dev_data)
++              return;
++
++      /* _safe iteration: entries are deleted while walking the list */
++      list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
++              list_del(&cfg_entry->list);
++
++              field = cfg_entry->field;
++
++              /* cfg_entry->data is only passed to ->release here; this
++               * function does not free it itself */
++              if (field->release)
++                      field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
++
++              kfree(cfg_entry);
++      }
++}
++
++/*
++ * Register @field for @dev at absolute offset @base_offset + field->offset.
++ *
++ * A list entry is allocated, the field's ->init hook (if any) is run and
++ * its result stored as the entry's private data, and the entry is appended
++ * to the device's config_fields list.
++ *
++ * Returns 0 on success, -ENOMEM if the entry cannot be allocated, the errno
++ * carried by an ERR_PTR() result of ->init, or the non-zero result of
++ * pciback_field_is_dup() when the offset is already registered.
++ */
++int pciback_config_add_field_offset(struct pci_dev *dev,
++                                  const struct config_field *field,
++                                  unsigned int base_offset)
++{
++      int err = 0;
++      struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++      struct config_field_entry *cfg_entry;
++      void *tmp;
++
++      cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
++      if (!cfg_entry) {
++              err = -ENOMEM;
++              goto out;
++      }
++
++      cfg_entry->data = NULL;
++      cfg_entry->field = field;
++      cfg_entry->base_offset = base_offset;
++
++      /* silently ignore duplicate fields */
++      /* NOTE(review): the duplicate result is returned to the caller
++       * unchanged, and the pciback_config_add_fields*() helpers stop on
++       * any non-zero return - confirm that "silently" is really what
++       * callers get. */
++      err = pciback_field_is_dup(dev,OFFSET(cfg_entry));
++      if (err)
++              goto out;
++
++      if (field->init) {
++              tmp = field->init(dev, OFFSET(cfg_entry));
++
++              if (IS_ERR(tmp)) {
++                      err = PTR_ERR(tmp);
++                      goto out;
++              }
++
++              cfg_entry->data = tmp;
++      }
++
++      dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
++              OFFSET(cfg_entry));
++      list_add_tail(&cfg_entry->list, &dev_data->config_fields);
++
++      out:
++      /* on any failure the entry was never linked, so just free it
++       * (cfg_entry is NULL here if the allocation itself failed) */
++      if (err)
++              kfree(cfg_entry);
++
++      return err;
++}
++
++/*
++ * Set up the device's virtual configuration space so that certain
++ * registers (such as the base address registers, BARs) are tracked here
++ * and the client cannot manipulate them directly.
++ *
++ * Header fields are registered first, then capability fields, then quirks;
++ * the first non-zero status is returned.
++ */
++int pciback_config_init_dev(struct pci_dev *dev)
++{
++      struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++      int rc;
++
++      dev_dbg(&dev->dev, "initializing virtual configuration space\n");
++
++      INIT_LIST_HEAD(&dev_data->config_fields);
++
++      rc = pciback_config_header_add_fields(dev);
++      if (rc)
++              return rc;
++
++      rc = pciback_config_capability_add_fields(dev);
++      if (rc)
++              return rc;
++
++      return pciback_config_quirks_init(dev);
++}
++
++/* Module-level setup: delegates to pciback_config_capability_init() and
++ * returns its result. */
++int pciback_config_init(void)
++{
++      return pciback_config_capability_init();
++}
diff --cc drivers/xen/pciback/conf_space.h

index 0000000,0000000..fe746ef

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/conf_space.h
@@@ -1,0 -1,0 +1,126 @@@
++/*
++ * PCI Backend - Common data structures for overriding the configuration space
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#ifndef __XEN_PCIBACK_CONF_SPACE_H__
++#define __XEN_PCIBACK_CONF_SPACE_H__
++
++#include <linux/list.h>
++#include <linux/err.h>
++
++/* Per-field lifecycle hooks.  Each receives the device and the field's
++ * absolute config-space offset; reset/free also get the private data
++ * pointer that init returned.
++ */
++/* conf_field_init can return an errno in a ptr with ERR_PTR() */
++typedef void *(*conf_field_init) (struct pci_dev * dev, int offset);
++typedef void (*conf_field_reset) (struct pci_dev * dev, int offset, void *data);
++typedef void (*conf_field_free) (struct pci_dev * dev, int offset, void *data);
++
++/* Access handlers, one read/write pair per field width (dword, word,
++ * byte).  @data is the field's private data pointer.
++ */
++typedef int (*conf_dword_write) (struct pci_dev * dev, int offset, u32 value,
++                               void *data);
++typedef int (*conf_word_write) (struct pci_dev * dev, int offset, u16 value,
++                              void *data);
++typedef int (*conf_byte_write) (struct pci_dev * dev, int offset, u8 value,
++                              void *data);
++typedef int (*conf_dword_read) (struct pci_dev * dev, int offset, u32 * value,
++                              void *data);
++typedef int (*conf_word_read) (struct pci_dev * dev, int offset, u16 * value,
++                             void *data);
++typedef int (*conf_byte_read) (struct pci_dev * dev, int offset, u8 * value,
++                             void *data);
++
++/* These are the fields within the configuration space which we
++ * are interested in intercepting reads/writes to and changing their
++ * values.
++ */
++struct config_field {
++      /* offset of the field, relative to the base offset it is
++       * registered with (see OFFSET()) */
++      unsigned int offset;
++      /* width in bytes; size == 0 terminates a field array */
++      unsigned int size;
++      /* NOTE(review): not referenced in the code visible here */
++      unsigned int mask;
++      /* optional lifecycle hooks; any of them may be NULL */
++      conf_field_init init;
++      conf_field_reset reset;
++      conf_field_free release;
++      /* if set, marks the field as dynamically allocated: it is called by
++       * pciback_config_free_dyn_fields() before the entry is dropped */
++      void (*clean) (struct config_field * field);
++      /* access handlers; the member matching the field width is used by
++       * the field tables (.u.b for 1, .u.w for 2, .u.dw for 4 bytes) */
++      union {
++              struct {
++                      conf_dword_write write;
++                      conf_dword_read read;
++              } dw;
++              struct {
++                      conf_word_write write;
++                      conf_word_read read;
++              } w;
++              struct {
++                      conf_byte_write write;
++                      conf_byte_read read;
++              } b;
++      } u;
++      /* NOTE(review): list linkage; its user is not visible here */
++      struct list_head list;
++};
++
++/* One registered instance of a config_field on a particular device. */
++struct config_field_entry {
++      /* linkage on the device's config_fields list */
++      struct list_head list;
++      /* descriptor this entry instantiates */
++      const struct config_field *field;
++      /* added to field->offset to get the absolute offset */
++      unsigned int base_offset;
++      /* private data returned by the field's ->init hook, or NULL */
++      void *data;
++};
++
++/* Absolute config-space offset of a config_field_entry.  The argument is
++ * evaluated twice, so it must not have side effects. */
++#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
++
++/* Add fields to a device - the add_fields macro expects to get a pointer to
++ * the first entry in an array (of which the ending is marked by size==0)
++ */
++int pciback_config_add_field_offset(struct pci_dev *dev,
++                                  const struct config_field *field,
++                                  unsigned int offset);
++
++/* Register a single field at its own offset (base offset 0). */
++static inline int pciback_config_add_field(struct pci_dev *dev,
++                                         const struct config_field *field)
++{
++      return pciback_config_add_field_offset(dev, field, 0);
++}
++
++/* Register each element of a size==0 terminated field array at base offset
++ * 0.  Stops at, and returns, the first non-zero status; 0 otherwise.
++ */
++static inline int pciback_config_add_fields(struct pci_dev *dev,
++                                          const struct config_field *field)
++{
++      const struct config_field *cur;
++      int err;
++
++      for (cur = field; cur->size != 0; cur++) {
++              err = pciback_config_add_field(dev, cur);
++              if (err)
++                      return err;
++      }
++
++      return 0;
++}
++
++/* Register each element of a size==0 terminated field array relative to
++ * base @offset.  Stops at, and returns, the first non-zero status; 0
++ * otherwise.
++ */
++static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
++                                                 const struct config_field *field,
++                                                 unsigned int offset)
++{
++      const struct config_field *cur;
++      int err;
++
++      for (cur = field; cur->size != 0; cur++) {
++              err = pciback_config_add_field_offset(dev, cur, offset);
++              if (err)
++                      return err;
++      }
++
++      return 0;
++}
++
++/* Read/Write the real configuration space */
++int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 * value,
++                           void *data);
++int pciback_read_config_word(struct pci_dev *dev, int offset, u16 * value,
++                           void *data);
++int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 * value,
++                            void *data);
++int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
++                            void *data);
++int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
++                            void *data);
++int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
++                             void *data);
++
++int pciback_config_capability_init(void);
++
++int pciback_config_header_add_fields(struct pci_dev *dev);
++int pciback_config_capability_add_fields(struct pci_dev *dev);
++
++#endif                                /* __XEN_PCIBACK_CONF_SPACE_H__ */
diff --cc drivers/xen/pciback/conf_space_capability.c

index 0000000,0000000..50efca4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability.c
@@@ -1,0 -1,0 +1,69 @@@
++/*
++ * PCI Backend - Handles the virtual fields found on the capability lists
++ *               in the configuration space.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_capability.h"
++
++static LIST_HEAD(capabilities);
++
++/* Two-byte header found at the start of every capability: readable from
++ * the real device, no write handler installed. */
++static const struct config_field caplist_header[] = {
++      {
++       .offset    = PCI_CAP_LIST_ID,
++       .size      = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
++       .u.w.read  = pciback_read_config_word,
++       .u.w.write = NULL,
++      },
++      {}
++};
++
++/* Append @cap to the file-local list of capabilities that get overlaid. */
++static inline void register_capability(struct pciback_config_capability *cap)
++{
++      list_add_tail(&cap->cap_list, &capabilities);
++}
++
++/*
++ * For every registered capability that @dev actually implements, register
++ * the generic capability header fields followed by the capability's own
++ * fields, both relative to the capability's position in config space.
++ *
++ * Returns 0 on success or the first non-zero status from registration.
++ */
++int pciback_config_capability_add_fields(struct pci_dev *dev)
++{
++      struct pciback_config_capability *cap;
++      int pos;
++      int err;
++
++      list_for_each_entry(cap, &capabilities, cap_list) {
++              pos = pci_find_capability(dev, cap->capability);
++              if (!pos)
++                      continue;       /* device lacks this capability */
++
++              dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
++                      cap->capability, pos);
++
++              err = pciback_config_add_fields_offset(dev, caplist_header,
++                                                     pos);
++              if (err)
++                      return err;
++
++              err = pciback_config_add_fields_offset(dev, cap->fields, pos);
++              if (err)
++                      return err;
++      }
++
++      return 0;
++}
++
++/* Capability descriptors defined in conf_space_capability_vpd.c and
++ * conf_space_capability_pm.c. */
++extern struct pciback_config_capability pciback_config_capability_vpd;
++extern struct pciback_config_capability pciback_config_capability_pm;
++
++/* Register the VPD and PM capability overlays.  Always returns 0. */
++int pciback_config_capability_init(void)
++{
++      register_capability(&pciback_config_capability_vpd);
++      register_capability(&pciback_config_capability_pm);
++
++      return 0;
++}
diff --cc drivers/xen/pciback/conf_space_capability.h

index 0000000,0000000..823392e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability.h
@@@ -1,0 -1,0 +1,23 @@@
++/*
++ * PCI Backend - Data structures for special overlays for structures on
++ *               the capability list.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
++#define __PCIBACK_CONFIG_CAPABILITY_H__
++
++#include <linux/pci.h>
++#include <linux/list.h>
++
++/* Describes the virtual fields to overlay on one PCI capability. */
++struct pciback_config_capability {
++      /* linkage on the list of registered capabilities */
++      struct list_head cap_list;
++
++      /* capability ID passed to pci_find_capability() */
++      int capability;
++
++      /* If the device has the capability found above, add these fields */
++      const struct config_field *fields;
++};
++
++#endif
diff --cc drivers/xen/pciback/conf_space_capability_msi.c

index 0000000,0000000..600d888

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_msi.c
@@@ -1,0 -1,0 +1,78 @@@
++/*
++ * PCI Backend -- Configuration overlay for MSI capability
++ */
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_capability.h"
++#include <xen/interface/io/pciif.h>
++
++/*
++ * Enable MSI on @dev on behalf of the guest behind @pdev.
++ *
++ * On success op->value is set to dev->irq and 0 is returned.  On failure
++ * op->value is set to 0, the error is logged together with the other end's
++ * id, and XEN_PCI_ERR_op_failed is returned.
++ */
++int pciback_enable_msi(struct pciback_device *pdev,
++              struct pci_dev *dev, struct xen_pci_op *op)
++{
++      int otherend = pdev->xdev->otherend_id;
++      int status;
++
++      status = pci_enable_msi(dev);
++
++      if (status) {
++              pr_err("error enable msi for guest %x status %x\n",
++                     otherend, status);
++              op->value = 0;
++              return XEN_PCI_ERR_op_failed;
++      }
++
++      op->value = dev->irq;
++      return 0;
++}
++
++/* Disable MSI on @dev and report dev->irq as it stands afterwards in
++ * op->value.  @pdev is unused.  Always returns 0. */
++int pciback_disable_msi(struct pciback_device *pdev,
++              struct pci_dev *dev, struct xen_pci_op *op)
++{
++      pci_disable_msi(dev);
++
++      op->value = dev->irq;
++      return 0;
++}
++
++/*
++ * Enable MSI-X on @dev with the op->value entries listed in
++ * op->msix_entries.
++ *
++ * The entries are copied into a temporary msix_entry array, passed to
++ * pci_enable_msix(), and copied back into @op afterwards regardless of the
++ * result.  op->value is overwritten with the pci_enable_msix() result,
++ * which is also the return value.
++ *
++ * Returns -EINVAL if more than SH_INFO_MAX_VEC entries are requested and
++ * -ENOMEM if the temporary array cannot be allocated; in both cases @op is
++ * left untouched.  @pdev is unused.
++ */
++int pciback_enable_msix(struct pciback_device *pdev,
++              struct pci_dev *dev, struct xen_pci_op *op)
++{
++      int i, result;
++      struct msix_entry *entries;
++
++      /* bounding the count also bounds the allocation size below */
++      if (op->value > SH_INFO_MAX_VEC)
++              return -EINVAL;
++
++      entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
++      if (entries == NULL)
++              return -ENOMEM;
++
++      for (i = 0; i < op->value; i++) {
++              entries[i].entry = op->msix_entries[i].entry;
++              entries[i].vector = op->msix_entries[i].vector;
++      }
++
++      result = pci_enable_msix(dev, entries, op->value);
++
++      /* NOTE(review): copied back even when result != 0 - confirm the
++       * frontend ignores the entries in that case */
++      for (i = 0; i < op->value; i++) {
++              op->msix_entries[i].entry = entries[i].entry;
++              op->msix_entries[i].vector = entries[i].vector;
++      }
++
++      kfree(entries);
++
++      op->value = result;
++
++      return result;
++}
++
++/* Disable MSI-X on @dev and report dev->irq as it stands afterwards in
++ * op->value.  @pdev is unused.  Always returns 0. */
++int pciback_disable_msix(struct pciback_device *pdev,
++              struct pci_dev *dev, struct xen_pci_op *op)
++{
++
++      pci_disable_msix(dev);
++
++      op->value = dev->irq;
++      return 0;
++}
++
diff --cc drivers/xen/pciback/conf_space_capability_pm.c

index 0000000,0000000..e2f99c7

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_pm.c
@@@ -1,0 -1,0 +1,126 @@@
++/*
++ * PCI Backend - Configuration space overlay for power management
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/pci.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
++
++/* Read the PM capabilities word from the device and hand it out with the
++ * PME support bits (PCI_PM_CAP_PME_MASK) cleared.  On a read error *value
++ * is left untouched and the error is returned.  @data is unused.
++ */
++static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
++                      void *data)
++{
++      u16 caps;
++      int err = pci_read_config_word(dev, offset, &caps);
++
++      if (!err)
++              *value = caps & ~PCI_PM_CAP_PME_MASK;
++
++      return err;
++}
++
++/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
++ * Can't allow driver domain to enable PMEs - they're shared */
++#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
++
++/*
++ * Write handler for the PM control/status register.
++ *
++ * Only the PM_OK_BITS of @new_value are written to the register directly
++ * (and only if they differ from the current ones); the requested power
++ * state is applied through pci_set_power_state().  After a D3 -> D0
++ * transition on a device without NO_SOFT_RESET the BARs are restored.
++ *
++ * Returns 0 on success, the config-access error code, or
++ * PCIBIOS_SET_FAILED if the power state change fails.  @data is unused.
++ */
++static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
++                       void *data)
++{
++      int err;
++      u16 old_value;
++      pci_power_t new_state, old_state;
++
++      err = pci_read_config_word(dev, offset, &old_value);
++      if (err)
++              goto out;
++
++      /* extract the states before new_value is masked below */
++      old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
++      new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
++
++      new_value &= PM_OK_BITS;
++      if ((old_value & PM_OK_BITS) != new_value) {
++              /* keep every bit outside PM_OK_BITS as the device has it */
++              new_value = (old_value & ~PM_OK_BITS) | new_value;
++              err = pci_write_config_word(dev, offset, new_value);
++              if (err)
++                      goto out;
++      }
++
++      /* Let pci core handle the power management change */
++      dev_dbg(&dev->dev, "set power state to %x\n", new_state);
++      err = pci_set_power_state(dev, new_state);
++      if (err) {
++              err = PCIBIOS_SET_FAILED;
++              goto out;
++      }
++
++      /*
++       * Device may lose PCI config info on D3->D0 transition. This
++       * is a problem for some guests which will not reset BARs. Even
++       * those that have a go will be foiled by our BAR-write handler
++       * which will discard the write! Since Linux won't re-init
++       * the config space automatically in all cases, we do it here.
++       * Future: Should we re-initialise all first 64 bytes of config space?
++       */
++      if (new_state == PCI_D0 &&
++          (old_state == PCI_D3hot || old_state == PCI_D3cold) &&
++          !(old_value & PCI_PM_CTRL_NO_SOFT_RESET))
++              pci_restore_bars(dev);
++
++ out:
++      return err;
++}
++
++/*
++ * Ensure PMEs are disabled.
++ *
++ * Init hook for the PM control/status field: clears PCI_PM_CTRL_PME_ENABLE
++ * if it is set.  The field keeps no private data, so NULL is returned on
++ * success.
++ *
++ * On failure an ERR_PTR() carrying a negative errno is returned, as the
++ * conf_field_init contract requires.  The config accessors report
++ * PCIBIOS_* codes rather than errnos, and wrapping such a code in
++ * ERR_PTR() directly yields a pointer that IS_ERR() does not recognise,
++ * so the failure would be stored as private data instead of being
++ * reported.  Hence the conversion through pcibios_err_to_errno(); as a
++ * result a failed access now makes registration of this field fail.
++ */
++static void *pm_ctrl_init(struct pci_dev *dev, int offset)
++{
++      int err;
++      u16 value;
++
++      err = pci_read_config_word(dev, offset, &value);
++      if (err)
++              goto out;
++
++      if (value & PCI_PM_CTRL_PME_ENABLE) {
++              value &= ~PCI_PM_CTRL_PME_ENABLE;
++              err = pci_write_config_word(dev, offset, value);
++      }
++
++      out:
++      /* ERR_PTR(0) is NULL, i.e. "success, no private data" */
++      return ERR_PTR(pcibios_err_to_errno(err));
++}
++
++/* Virtual fields overlaid on the power management capability.  Entries
++ * without a write handler get none installed. */
++static const struct config_field caplist_pm[] = {
++      {
++              /* capabilities word, PME support bits hidden */
++              .offset     = PCI_PM_PMC,
++              .size       = 2,
++              .u.w.read   = pm_caps_read,
++      },
++      {
++              /* control/status: PMEs forced off at init, writes filtered */
++              .offset     = PCI_PM_CTRL,
++              .size       = 2,
++              .init       = pm_ctrl_init,
++              .u.w.read   = pciback_read_config_word,
++              .u.w.write  = pm_ctrl_write,
++      },
++      {
++              .offset     = PCI_PM_PPB_EXTENSIONS,
++              .size       = 1,
++              .u.b.read   = pciback_read_config_byte,
++      },
++      {
++              .offset     = PCI_PM_DATA_REGISTER,
++              .size       = 1,
++              .u.b.read   = pciback_read_config_byte,
++      },
++      {}
++};
++
++/* Capability descriptor for power management; registered by
++ * pciback_config_capability_init(). */
++struct pciback_config_capability pciback_config_capability_pm = {
++      .capability = PCI_CAP_ID_PM,
++      .fields = caplist_pm,
++};
diff --cc drivers/xen/pciback/conf_space_capability_vpd.c

index 0000000,0000000..920cb4a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_capability_vpd.c
@@@ -1,0 -1,0 +1,40 @@@
++/*
++ * PCI Backend - Configuration space overlay for Vital Product Data
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/pci.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
++
++/* Write handler for the VPD address register.  A value with the
++ * PCI_VPD_ADDR_F flag set is refused with PCIBIOS_SET_FAILED; any other
++ * value is passed through to the device.  @data is unused.
++ */
++static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
++                           void *data)
++{
++      /* Disallow writes to the vital product data */
++      if (value & PCI_VPD_ADDR_F)
++              return PCIBIOS_SET_FAILED;
++
++      return pci_write_config_word(dev, offset, value);
++}
++
++/* Virtual fields overlaid on the Vital Product Data capability. */
++static const struct config_field caplist_vpd[] = {
++      {
++       /* address register: writes filtered by vpd_address_write() */
++       .offset    = PCI_VPD_ADDR,
++       .size      = 2,
++       .u.w.read  = pciback_read_config_word,
++       .u.w.write = vpd_address_write,
++       },
++      {
++       /* data register: readable, no write handler */
++       .offset     = PCI_VPD_DATA,
++       .size       = 4,
++       .u.dw.read  = pciback_read_config_dword,
++       .u.dw.write = NULL,
++       },
++      {}
++};
++ 
++/* Capability descriptor for Vital Product Data; registered by
++ * pciback_config_capability_init(). */
++struct pciback_config_capability pciback_config_capability_vpd = {
++      .capability = PCI_CAP_ID_VPD,
++      .fields = caplist_vpd,
++};
diff --cc drivers/xen/pciback/conf_space_header.c

index 0000000,0000000..d7aab4e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_header.c
@@@ -1,0 -1,0 +1,378 @@@
++/*
++ * PCI Backend - Handles the virtual fields in the configuration space headers.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++
++/* Per-BAR private data of the virtual BAR / expansion-ROM fields. */
++struct pci_bar_info {
++      /* value returned by a normal read */
++      u32 val;
++      /* value returned by a read that follows a sizing write */
++      u32 len_val;
++      /* selects what bar_read() returns: 0 = val, non-zero = len_val */
++      int which;
++};
++
++/* Non-zero if the command value has memory or I/O decoding enabled. */
++#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
++/* Non-zero if the command value has bus mastering enabled. */
++#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
++
++/*
++ * Read handler for PCI_COMMAND.
++ *
++ * Returns the real register value; if the device is enabled, the I/O and
++ * memory decode bits are additionally reported as set for every resource
++ * type found among the BAR resources (indices below PCI_ROM_RESOURCE).
++ * The return value is that of the underlying config read.
++ */
++static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
++{
++      int i;
++      int ret;
++
++      ret = pciback_read_config_word(dev, offset, value, data);
++      if (!pci_is_enabled(dev))
++              return ret;
++
++      /* NOTE(review): *value is OR-ed into even when ret reports a failed
++       * read - confirm that is acceptable */
++      for (i = 0; i < PCI_ROM_RESOURCE; i++) {
++              if (dev->resource[i].flags & IORESOURCE_IO)
++                      *value |= PCI_COMMAND_IO;
++              if (dev->resource[i].flags & IORESOURCE_MEM)
++                      *value |= PCI_COMMAND_MEMORY;
++      }
++
++      return ret;
++}
++
++/*
++ * Write handler for PCI_COMMAND.
++ *
++ * Translates the decode-enable, bus-master and memory-write-invalidate
++ * bits of @value into the corresponding PCI core calls before writing the
++ * value to the device.  If MWI cannot be enabled the bit is dropped from
++ * the value that gets written.
++ *
++ * Returns the pci_enable_device() error if enabling fails, otherwise the
++ * result of the final config write.  @data is unused.
++ */
++static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
++{
++      int err;
++
++      if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
++              if (unlikely(verbose_request))
++                      printk(KERN_DEBUG "pciback: %s: enable\n",
++                             pci_name(dev));
++              err = pci_enable_device(dev);
++              if (err)
++                      return err;
++      } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
++              if (unlikely(verbose_request))
++                      printk(KERN_DEBUG "pciback: %s: disable\n",
++                             pci_name(dev));
++              pci_disable_device(dev);
++      }
++
++      /* bus mastering is only ever switched on here, never off */
++      if (!dev->is_busmaster && is_master_cmd(value)) {
++              if (unlikely(verbose_request))
++                      printk(KERN_DEBUG "pciback: %s: set bus master\n",
++                             pci_name(dev));
++              pci_set_master(dev);
++      }
++
++      if (value & PCI_COMMAND_INVALIDATE) {
++              if (unlikely(verbose_request))
++                      printk(KERN_DEBUG
++                             "pciback: %s: enable memory-write-invalidate\n",
++                             pci_name(dev));
++              err = pci_set_mwi(dev);
++              if (err) {
++                      pr_warning("pciback: %s: cannot enable"
++                                 " memory-write-invalidate (%d)\n",
++                                 pci_name(dev), err);
++                      /* not fatal: just don't claim MWI in the register */
++                      value &= ~PCI_COMMAND_INVALIDATE;
++              }
++      }
++
++      return pci_write_config_word(dev, offset, value);
++}
++
++/*
++ * Write handler for the expansion ROM address register.
++ *
++ * A write of ~PCI_ROM_ADDRESS_ENABLE is treated as a sizing probe: it only
++ * switches the next read to the length value.  Any other write switches
++ * reads back to the address value and reaches the device only if it
++ * restores the recorded value while the register currently holds
++ * something else.
++ *
++ * Returns XEN_PCI_ERR_op_failed if the field has no private data, else 0.
++ */
++static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
++{
++      struct pci_bar_info *bar = data;
++
++      if (unlikely(!bar)) {
++              pr_warning("pciback: driver data not found for %s\n",
++                         pci_name(dev));
++              return XEN_PCI_ERR_op_failed;
++      }
++
++      /* A write to obtain the length must happen as a 32-bit write.
++       * This does not (yet) support writing individual bytes
++       */
++      if (value == ~PCI_ROM_ADDRESS_ENABLE)
++              bar->which = 1;
++      else {
++              u32 tmpval;
++              /* the results of the config accesses below are not checked */
++              pci_read_config_dword(dev, offset, &tmpval);
++              if (tmpval != bar->val && value == bar->val) {
++                      /* Allow restoration of bar value. */
++                      pci_write_config_dword(dev, offset, bar->val);
++              }
++              bar->which = 0;
++      }
++
++      /* Do we need to support enabling/disabling the rom address here? */
++
++      return 0;
++}
++
++/* For the BARs, only allow writes which write ~0 or
++ * the correct resource information
++ * (Needed for when the driver probes the resource usage)
++ *
++ * A write of ~0 only switches the next read to the length value.  Any
++ * other write switches reads back to the address value and reaches the
++ * device only if it restores the recorded value while the register
++ * currently holds something else.
++ *
++ * Returns XEN_PCI_ERR_op_failed if the field has no private data, else 0.
++ */
++static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
++{
++      struct pci_bar_info *bar = data;
++
++      if (unlikely(!bar)) {
++              pr_warning("pciback: driver data not found for %s\n",
++                         pci_name(dev));
++              return XEN_PCI_ERR_op_failed;
++      }
++
++      /* A write to obtain the length must happen as a 32-bit write.
++       * This does not (yet) support writing individual bytes
++       */
++      if (value == ~0)
++              bar->which = 1;
++      else {
++              u32 tmpval;
++              /* the results of the config accesses below are not checked */
++              pci_read_config_dword(dev, offset, &tmpval);
++              if (tmpval != bar->val && value == bar->val) {
++                      /* Allow restoration of bar value. */
++                      pci_write_config_dword(dev, offset, bar->val);
++              }
++              bar->which = 0;
++      }
++
++      return 0;
++}
++
++/* Read handler shared by the BAR and ROM fields: hands out the recorded
++ * length value if the last write was a sizing probe, the recorded address
++ * value otherwise.  The device itself is not accessed.
++ *
++ * Returns XEN_PCI_ERR_op_failed if the field has no private data, else 0.
++ */
++static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
++{
++      struct pci_bar_info *info = data;
++
++      if (unlikely(!info)) {
++              pr_warning("pciback: driver data not found for %s\n",
++                         pci_name(dev));
++              return XEN_PCI_ERR_op_failed;
++      }
++
++      if (info->which)
++              *value = info->len_val;
++      else
++              *value = info->val;
++
++      return 0;
++}
++
++/*
++ * Fill @bar_info for the BAR (or expansion ROM register) at config offset
++ * @offset from the device's resource table:
++ *
++ *   ->val      value returned by a normal read of the register
++ *   ->len_val  value returned by a read that follows a sizing write
++ *
++ * A sizing read has to return every writable address bit as 1, i.e. the
++ * two's complement of the region size (plus the read-only flag bits in
++ * the low half), not the size itself; the upper half of a 64-bit BAR
++ * returns the upper 32 bits of that mask.  A register without a resource
++ * (flags == 0) reads back as 0 when sized.
++ *
++ * All 64-bit arithmetic is done on u64 so that shifting by 32 stays
++ * defined when resource_size_t is only 32 bits wide.
++ *
++ * @len_mask is kept for the callers' sake and is not used.
++ */
++static inline void read_dev_bar(struct pci_dev *dev,
++                              struct pci_bar_info *bar_info, int offset,
++                              u32 len_mask)
++{
++      int     pos;
++      u64     size;
++      struct resource *res = dev->resource;
++
++      if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1)
++              pos = PCI_ROM_RESOURCE;
++      else {
++              pos = (offset - PCI_BASE_ADDRESS_0) / 4;
++              /* Is this register the upper half of the 64-bit memory BAR
++               * described by the previous resource? */
++              if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE |
++                              PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
++                         (PCI_BASE_ADDRESS_SPACE_MEMORY |
++                              PCI_BASE_ADDRESS_MEM_TYPE_64))) {
++                      size = (u64)res[pos - 1].end -
++                             (u64)res[pos - 1].start + 1;
++                      bar_info->val = (u32)((u64)res[pos - 1].start >> 32);
++                      bar_info->len_val = (u32)((0 - size) >> 32);
++                      return;
++              }
++      }
++
++      bar_info->val = res[pos].start |
++                      (res[pos].flags & PCI_REGION_FLAG_MASK);
++
++      if (!res[pos].flags) {
++              /* unimplemented register: no writable bits at all */
++              bar_info->len_val = 0;
++              return;
++      }
++
++      size = (u64)res[pos].end - (u64)res[pos].start + 1;
++      bar_info->len_val = (u32)(0 - size) |
++                          (res[pos].flags & PCI_REGION_FLAG_MASK);
++}
++
++/* Init hook of the BAR fields: allocates the private pci_bar_info, fills
++ * it from the device's resources and starts in "return address" mode.
++ * Returns the new object, or ERR_PTR(-ENOMEM).
++ */
++static void *bar_init(struct pci_dev *dev, int offset)
++{
++      struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
++
++      if (!bar)
++              return ERR_PTR(-ENOMEM);
++
++      read_dev_bar(dev, bar, offset, ~0);
++      bar->which = 0;
++
++      return bar;
++}
++
++/* Init hook of the expansion ROM field: same as bar_init() except for the
++ * length mask handed to read_dev_bar().
++ * Returns the new object, or ERR_PTR(-ENOMEM).
++ */
++static void *rom_init(struct pci_dev *dev, int offset)
++{
++      struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
++
++      if (!bar)
++              return ERR_PTR(-ENOMEM);
++
++      read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
++      bar->which = 0;
++
++      return bar;
++}
++
++/* Reset hook: switch the field back to returning the address value.
++ * @data is dereferenced without a NULL check. */
++static void bar_reset(struct pci_dev *dev, int offset, void *data)
++{
++      struct pci_bar_info *bar = data;
++
++      bar->which = 0;
++}
++
++/* Release hook: free the pci_bar_info allocated by bar_init()/rom_init(). */
++static void bar_release(struct pci_dev *dev, int offset, void *data)
++{
++      kfree(data);
++}
++
++/* Report the vendor ID cached in struct pci_dev instead of reading the
++ * device.  Always returns 0. */
++static int pciback_read_vendor(struct pci_dev *dev, int offset,
++                             u16 *value, void *data)
++{
++      *value = dev->vendor;
++
++      return 0;
++}
++
++/* Report the device ID cached in struct pci_dev instead of reading the
++ * device.  Always returns 0. */
++static int pciback_read_device(struct pci_dev *dev, int offset,
++                             u16 *value, void *data)
++{
++      *value = dev->device;
++
++      return 0;
++}
++
++/* Report dev->irq, truncated to a byte, as the interrupt line.
++ * Always returns 0. */
++static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
++                        void *data)
++{
++      *value = (u8) dev->irq;
++
++      return 0;
++}
++
++/*
++ * Write handler for PCI_BIST.
++ *
++ * The write is forwarded to the device only if it leaves every bit other
++ * than PCI_BIST_START as the device currently has it, or if the value is
++ * exactly PCI_BIST_START.  Any other write is dropped without an error.
++ *
++ * Returns the config-access status, or 0 for a dropped write.
++ */
++static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
++{
++      u8 current_bist;
++      int err = pci_read_config_byte(dev, offset, &current_bist);
++
++      if (err)
++              return err;
++
++      if (value != PCI_BIST_START &&
++          (current_bist & ~PCI_BIST_START) != (value & ~PCI_BIST_START))
++              return 0;
++
++      return pci_write_config_byte(dev, offset, value);
++}
++
++/* Virtual fields common to every supported header type.  Entries without
++ * a write handler get none installed. */
++static const struct config_field header_common[] = {
++      {
++       /* answered from struct pci_dev, not from the device */
++       .offset    = PCI_VENDOR_ID,
++       .size      = 2,
++       .u.w.read  = pciback_read_vendor,
++      },
++      {
++       /* answered from struct pci_dev, not from the device */
++       .offset    = PCI_DEVICE_ID,
++       .size      = 2,
++       .u.w.read  = pciback_read_device,
++      },
++      {
++       .offset    = PCI_COMMAND,
++       .size      = 2,
++       .u.w.read  = command_read,
++       .u.w.write = command_write,
++      },
++      {
++       /* reports dev->irq */
++       .offset    = PCI_INTERRUPT_LINE,
++       .size      = 1,
++       .u.b.read  = interrupt_read,
++      },
++      {
++       .offset    = PCI_INTERRUPT_PIN,
++       .size      = 1,
++       .u.b.read  = pciback_read_config_byte,
++      },
++      {
++       /* Any side effects of letting driver domain control cache line? */
++       .offset    = PCI_CACHE_LINE_SIZE,
++       .size      = 1,
++       .u.b.read  = pciback_read_config_byte,
++       .u.b.write = pciback_write_config_byte,
++      },
++      {
++       .offset    = PCI_LATENCY_TIMER,
++       .size      = 1,
++       .u.b.read  = pciback_read_config_byte,
++      },
++      {
++       /* writes filtered by bist_write() */
++       .offset    = PCI_BIST,
++       .size      = 1,
++       .u.b.read  = pciback_read_config_byte,
++       .u.b.write = bist_write,
++      },
++      {}
++};
++
++/* Initializer for a virtual 32-bit BAR field at reg_offset. */
++#define CFG_FIELD_BAR(reg_offset)                     \
++      {                                               \
++       .offset     = reg_offset,                      \
++       .size       = 4,                               \
++       .init       = bar_init,                        \
++       .reset      = bar_reset,                       \
++       .release    = bar_release,                     \
++       .u.dw.read  = bar_read,                        \
++       .u.dw.write = bar_write,                       \
++       }
++
++/* Initializer for the virtual expansion ROM address field at reg_offset;
++ * differs from CFG_FIELD_BAR in its init and write handlers. */
++#define CFG_FIELD_ROM(reg_offset)                     \
++      {                                               \
++       .offset     = reg_offset,                      \
++       .size       = 4,                               \
++       .init       = rom_init,                        \
++       .reset      = bar_reset,                       \
++       .release    = bar_release,                     \
++       .u.dw.read  = bar_read,                        \
++       .u.dw.write = rom_write,                       \
++       }
++
++/* Fields added for PCI_HEADER_TYPE_NORMAL: six BARs and the ROM register. */
++static const struct config_field header_0[] = {
++      CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
++      CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
++      CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
++      CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
++      CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
++      CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
++      CFG_FIELD_ROM(PCI_ROM_ADDRESS),
++      {}
++};
++
++/* Fields added for PCI_HEADER_TYPE_BRIDGE: two BARs and the ROM register
++ * at its bridge-header offset. */
++static const struct config_field header_1[] = {
++      CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
++      CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
++      CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
++      {}
++};
++
++/*
++ * Register the virtual header fields of @dev: the common ones first, then
++ * the set matching the device's header type.
++ *
++ * Returns 0 on success, the first non-zero registration status, or
++ * -EINVAL (after logging) for a header type that is neither normal nor
++ * bridge.
++ */
++int pciback_config_header_add_fields(struct pci_dev *dev)
++{
++      const struct config_field *type_fields;
++      int err;
++
++      err = pciback_config_add_fields(dev, header_common);
++      if (err)
++              return err;
++
++      switch (dev->hdr_type) {
++      case PCI_HEADER_TYPE_NORMAL:
++              type_fields = header_0;
++              break;
++
++      case PCI_HEADER_TYPE_BRIDGE:
++              type_fields = header_1;
++              break;
++
++      default:
++              pr_err("pciback: %s: Unsupported header type %d!\n",
++                     pci_name(dev), dev->hdr_type);
++              return -EINVAL;
++      }
++
++      return pciback_config_add_fields(dev, type_fields);
++}
diff --cc drivers/xen/pciback/conf_space_quirks.c

index 0000000,0000000..244a438

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_quirks.c
@@@ -1,0 -1,0 +1,138 @@@
++/*
++ * PCI Backend - Handle special overlays for broken devices.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
++
++LIST_HEAD(pciback_quirks);
++
++/*
++ * Check a single pci_device_id against dev.  Each ID field matches when it
++ * is PCI_ANY_ID or equals the device's value; the class matches when it
++ * agrees with the device class on every bit set in class_mask.
++ *
++ * Returns id on a match, NULL otherwise.
++ */
++static inline const struct pci_device_id *
++match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
++{
++      if (id->vendor != PCI_ANY_ID && id->vendor != dev->vendor)
++              return NULL;
++      if (id->device != PCI_ANY_ID && id->device != dev->device)
++              return NULL;
++      if (id->subvendor != PCI_ANY_ID &&
++          id->subvendor != dev->subsystem_vendor)
++              return NULL;
++      if (id->subdevice != PCI_ANY_ID &&
++          id->subdevice != dev->subsystem_device)
++              return NULL;
++      if ((id->class ^ dev->class) & id->class_mask)
++              return NULL;
++
++      return id;
++}
++
++/*
++ * Look up the first entry on pciback_quirks whose device ID matches dev.
++ * Returns the entry, or NULL (after a debug message) when nothing matches.
++ */
++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
++{
++      struct pciback_config_quirk *entry;
++
++      list_for_each_entry(entry, &pciback_quirks, quirks_list) {
++              if (match_one_device(&entry->devid, dev))
++                      return entry;
++      }
++
++      printk(KERN_DEBUG
++             "quirk didn't match any device pciback knows about\n");
++      return NULL;
++}
++
++/* Append quirk to the tail of the global pciback_quirks list. */
++static inline void register_quirk(struct pciback_config_quirk *quirk)
++{
++      list_add_tail(&quirk->quirks_list, &pciback_quirks);
++}
++
++/*
++ * Report whether dev already has a config field whose OFFSET() equals reg.
++ * Returns 1 if such a field exists on the device's config_fields list,
++ * 0 otherwise.
++ */
++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg)
++{
++      struct pciback_dev_data *data = pci_get_drvdata(dev);
++      struct config_field_entry *entry;
++
++      list_for_each_entry(entry, &data->config_fields, list)
++              if (OFFSET(entry) == reg)
++                      return 1;
++
++      return 0;
++}
++
++/*
++ * Add a quirk-defined config field to dev.
++ *
++ * The field's read/write handlers are set to the default pciback accessors
++ * for its size (1, 2 or 4 bytes) before it is registered.
++ *
++ * Returns -EINVAL for any other size, otherwise the result of
++ * pciback_config_add_field() so that a failed registration is no longer
++ * reported as success.
++ */
++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
++                                  *field)
++{
++      switch (field->size) {
++      case 1:
++              field->u.b.read = pciback_read_config_byte;
++              field->u.b.write = pciback_write_config_byte;
++              break;
++      case 2:
++              field->u.w.read = pciback_read_config_word;
++              field->u.w.write = pciback_write_config_word;
++              break;
++      case 4:
++              field->u.dw.read = pciback_read_config_dword;
++              field->u.dw.write = pciback_write_config_dword;
++              break;
++      default:
++              return -EINVAL;
++      }
++
++      /* Propagate registration errors to the caller. */
++      return pciback_config_add_field(dev, field);
++}
++
++/*
++ * Allocate a quirk entry for dev and register it on pciback_quirks.
++ *
++ * The entry's device ID is filled with dev's exact vendor, device,
++ * subsystem vendor and subsystem device; class and class_mask are left at
++ * 0.  Returns 0 on success or -ENOMEM if the allocation fails.
++ */
++int pciback_config_quirks_init(struct pci_dev *dev)
++{
++      struct pciback_config_quirk *entry;
++
++      entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
++      if (!entry)
++              return -ENOMEM;
++
++      entry->pdev = dev;
++
++      entry->devid.vendor = dev->vendor;
++      entry->devid.device = dev->device;
++      entry->devid.subvendor = dev->subsystem_vendor;
++      entry->devid.subdevice = dev->subsystem_device;
++      entry->devid.class = 0;
++      entry->devid.class_mask = 0;
++      entry->devid.driver_data = 0UL;
++
++      register_quirk(entry);
++      return 0;
++}
++
++/* Release a config_field by passing it to kfree(). */
++void pciback_config_field_free(struct config_field *field)
++{
++      kfree(field);
++}
++
++/*
++ * Remove and free the quirk entry matching dev.
++ * Returns 0 on success or -ENXIO when pciback_find_quirk() finds no entry.
++ */
++int pciback_config_quirk_release(struct pci_dev *dev)
++{
++      struct pciback_config_quirk *entry = pciback_find_quirk(dev);
++
++      if (!entry)
++              return -ENXIO;
++
++      list_del(&entry->quirks_list);
++      kfree(entry);
++      return 0;
++}
diff --cc drivers/xen/pciback/conf_space_quirks.h

index 0000000,0000000..acd0e1a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/conf_space_quirks.h
@@@ -1,0 -1,0 +1,35 @@@
++/*
++ * PCI Backend - Data structures for special overlays for broken devices.
++ *
++ * Ryan Wilson <hap9@epoch.ncsc.mil>
++ * Chris Bookholt <hap10@epoch.ncsc.mil>
++ */
++
++#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
++#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
++
++#include <linux/pci.h>
++#include <linux/list.h>
++
++/* One registered quirk entry, linked on the global pciback_quirks list. */
++struct pciback_config_quirk {
++      struct list_head quirks_list;   /* linkage on pciback_quirks */
++      struct pci_device_id devid;     /* IDs compared by pciback_find_quirk() */
++      struct pci_dev *pdev;           /* device the entry was created for */
++};
++
++/* Return the registered quirk entry matching dev, or NULL if none matches. */
++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
++
++/*
++ * Install the default pciback read/write handlers for field->size
++ * (1, 2 or 4 bytes) and add the field to dev; -EINVAL for other sizes.
++ */
++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
++                                  *field);
++
++/* NOTE(review): no definition is visible in conf_space_quirks.c - confirm. */
++int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
++
++/* Allocate and register a quirk entry for dev; 0 or -ENOMEM. */
++int pciback_config_quirks_init(struct pci_dev *dev);
++
++/* kfree() the given field. */
++void pciback_config_field_free(struct config_field *field);
++
++/* Unregister and free the quirk entry for dev; 0 or -ENXIO if not found. */
++int pciback_config_quirk_release(struct pci_dev *dev);
++
++/* Return 1 if dev already has a config field at offset reg, else 0. */
++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg);
++
++#endif
diff --cc drivers/xen/pciback/controller.c

index 0000000,0000000..294e48f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/controller.c
@@@ -1,0 -1,0 +1,443 @@@
++/*
++ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
++ *      Alex Williamson <alex.williamson@hp.com>
++ *
++ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI
++ * controllers.  Devices under the same PCI controller are exposed on the
++ * same virtual domain:bus.  Within a bus, device slots are virtualized
++ * to compact the bus.
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ */
++
++#include <linux/acpi.h>
++#include <linux/list.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
++
++/* Limits applied to the virtual bus and slot numbers handed out below. */
++#define PCI_MAX_BUSSES        255
++#define PCI_MAX_SLOTS 32
++
++/* One exported device on a virtual bus. */
++struct controller_dev_entry {
++      struct list_head list;          /* linkage on controller_list_entry.dev_list */
++      struct pci_dev *dev;            /* the real device */
++      unsigned int devfn;             /* virtual devfn assigned when added */
++};
++
++/* One PCI controller and the virtual domain:bus assigned to it. */
++struct controller_list_entry {
++      struct list_head list;          /* linkage on controller_dev_data.list */
++      struct pci_controller *controller;
++      unsigned int domain;            /* virtual domain */
++      unsigned int bus;               /* virtual bus */
++      unsigned int next_devfn;        /* devfn given to the next added device */
++      struct list_head dev_list;      /* controller_dev_entry items */
++};
++
++/* Per-pciback_device state, stored in pdev->pci_dev_data. */
++struct controller_dev_data {
++      struct list_head list;          /* controller_list_entry items */
++      unsigned int next_domain;       /* next virtual domain to hand out */
++      unsigned int next_bus;          /* next virtual bus to hand out */
++      spinlock_t lock;                /* taken around accesses to the lists */
++};
++
++/* Context passed to write_xenbus_resource() through acpi_walk_resources(). */
++struct walk_info {
++      struct pciback_device *pdev;
++      int resource_count;             /* resources written so far for this root */
++      int root_num;                   /* index used in the "root-%d-..." keys */
++};
++
++/*
++ * Translate a virtual domain:bus:devfn back to the exported pci_dev.
++ * Returns the device of the first matching entry, or NULL when no entry
++ * carries that virtual address.
++ */
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++                                  unsigned int domain, unsigned int bus,
++                                  unsigned int devfn)
++{
++      struct controller_dev_data *data = pdev->pci_dev_data;
++      struct controller_list_entry *ctrl;
++      struct controller_dev_entry *ent;
++      struct pci_dev *match = NULL;
++      unsigned long irqflags;
++      int done = 0;
++
++      spin_lock_irqsave(&data->lock, irqflags);
++
++      list_for_each_entry(ctrl, &data->list, list) {
++              /* Only search controllers on the requested virtual bus. */
++              if (ctrl->domain == domain && ctrl->bus == bus) {
++                      list_for_each_entry(ent, &ctrl->dev_list, list) {
++                              if (ent->devfn == devfn) {
++                                      match = ent->dev;
++                                      done = 1;
++                                      break;
++                              }
++                      }
++              }
++
++              if (done)
++                      break;
++      }
++
++      spin_unlock_irqrestore(&data->lock, irqflags);
++
++      return match;
++}
++
++/*
++ * Export dev on the virtual bus belonging to its PCI controller.
++ *
++ * Each distinct pci_controller gets its own virtual domain:bus, allocated
++ * the first time a device under it is added.  Every added device receives
++ * the controller's next virtual slot (function 0).
++ *
++ * Returns 0 on success, -ENOMEM if an allocation fails, or -ENOSPC (after
++ * reporting a fatal xenbus error) once PCI_MAX_SLOTS devices have been
++ * assigned on the virtual bus.
++ *
++ * devid and publish_cb are not used yet; see the TODO at the end.
++ */
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++                      int devid, publish_pci_dev_cb publish_cb)
++{
++      struct controller_dev_data *dev_data = pdev->pci_dev_data;
++      struct controller_dev_entry *dev_entry;
++      struct controller_list_entry *cntrl_entry;
++      struct pci_controller *dev_controller = PCI_CONTROLLER(dev);
++      unsigned long flags;
++      int ret = 0, found = 0;
++
++      spin_lock_irqsave(&dev_data->lock, flags);
++
++      /* Look to see if we already have a domain:bus for this controller */
++      list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++              if (cntrl_entry->controller == dev_controller) {
++                      found = 1;
++                      break;
++              }
++      }
++
++      if (!found) {
++              cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC);
++              if (!cntrl_entry) {
++                      ret =  -ENOMEM;
++                      goto out;
++              }
++
++              cntrl_entry->controller = dev_controller;
++              cntrl_entry->next_devfn = PCI_DEVFN(0, 0);
++
++              /* Hand out the next virtual bus, rolling into a new domain. */
++              cntrl_entry->domain = dev_data->next_domain;
++              cntrl_entry->bus = dev_data->next_bus++;
++              if (dev_data->next_bus > PCI_MAX_BUSSES) {
++                      dev_data->next_domain++;
++                      dev_data->next_bus = 0;
++              }
++
++              INIT_LIST_HEAD(&cntrl_entry->dev_list);
++
++              list_add_tail(&cntrl_entry->list, &dev_data->list);
++      }
++
++      /*
++       * next_devfn grows by one slot per device and is never masked, so
++       * test the raw slot number.  PCI_SLOT() masks its result to 5 bits
++       * and can therefore never exceed PCI_MAX_SLOTS; with it the check
++       * never fired and the 33rd device wrapped around to slot 0.
++       */
++      if ((cntrl_entry->next_devfn >> 3) >= PCI_MAX_SLOTS) {
++              /*
++               * While it seems unlikely, this can actually happen if
++               * a controller has P2P bridges under it.
++               */
++              xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x "
++                               "is full, no room to export %04x:%02x:%02x.%x",
++                               cntrl_entry->domain, cntrl_entry->bus,
++                               pci_domain_nr(dev->bus), dev->bus->number,
++                               PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
++              ret = -ENOSPC;
++              goto out;
++      }
++
++      dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC);
++      if (!dev_entry) {
++              /* Do not leave behind a controller entry with no devices. */
++              if (list_empty(&cntrl_entry->dev_list)) {
++                      list_del(&cntrl_entry->list);
++                      kfree(cntrl_entry);
++              }
++              ret = -ENOMEM;
++              goto out;
++      }
++
++      dev_entry->dev = dev;
++      dev_entry->devfn = cntrl_entry->next_devfn;
++
++      list_add_tail(&dev_entry->list, &cntrl_entry->dev_list);
++
++      cntrl_entry->next_devfn += PCI_DEVFN(1, 0);
++
++out:
++      spin_unlock_irqrestore(&dev_data->lock, flags);
++
++      /* TODO: Publish virtual domain:bus:slot.func here. */
++
++      return ret;
++}
++
++/*
++ * Stop exporting dev: unlink and free its entry, free the controller entry
++ * if that was its last device, then hand the device back through
++ * pcistub_put_pci_dev().  Does nothing if dev is not currently exported.
++ */
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++      struct controller_dev_data *dev_data = pdev->pci_dev_data;
++      struct controller_list_entry *cntrl_entry;
++      struct controller_dev_entry *dev_entry;
++      struct pci_dev *found_dev = NULL;
++      unsigned long flags;
++
++      spin_lock_irqsave(&dev_data->lock, flags);
++
++      list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++              if (cntrl_entry->controller != PCI_CONTROLLER(dev))
++                      continue;
++
++              list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
++                      if (dev_entry->dev == dev) {
++                              found_dev = dev_entry->dev;
++                              /*
++                               * Leave both loops at once: if the outer
++                               * loop ran to completion, cntrl_entry
++                               * would no longer point at a real entry
++                               * when it is used below.
++                               */
++                              goto found;
++                      }
++              }
++      }
++
++      /* Not exported through this pciback device. */
++      spin_unlock_irqrestore(&dev_data->lock, flags);
++      return;
++
++found:
++      list_del(&dev_entry->list);
++      kfree(dev_entry);
++
++      /* Drop the controller entry once its last device is gone. */
++      if (list_empty(&cntrl_entry->dev_list)) {
++              list_del(&cntrl_entry->list);
++              kfree(cntrl_entry);
++      }
++
++      spin_unlock_irqrestore(&dev_data->lock, flags);
++      pcistub_put_pci_dev(found_dev);
++}
++
++/*
++ * Allocate and initialise the controller bookkeeping for pdev and store it
++ * in pdev->pci_dev_data.  Returns 0 on success or -ENOMEM.
++ */
++int pciback_init_devices(struct pciback_device *pdev)
++{
++      struct controller_dev_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
++
++      if (!data)
++              return -ENOMEM;
++
++      /* Virtual numbering starts at domain 0, bus 0. */
++      data->next_domain = 0;
++      data->next_bus = 0;
++
++      INIT_LIST_HEAD(&data->list);
++      spin_lock_init(&data->lock);
++
++      pdev->pci_dev_data = data;
++      return 0;
++}
++
++/*
++ * acpi_walk_resources() callback: publish one ACPI address resource to
++ * xenbus as a hex string under "root-<root_num>-resource-<n>".
++ *
++ * data points to a struct walk_info; its resource_count is incremented for
++ * each resource written successfully.  Resources that are filtered out, and
++ * all failures, are skipped silently: the function always returns AE_OK.
++ *
++ * Note that res is modified in place (its resource_source is cleared).
++ */
++static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
++{
++      struct walk_info *info = data;
++      struct acpi_resource_address64 addr;
++      acpi_status status;
++      int i, len, err;
++      char str[32], tmp[3];
++      unsigned char *ptr, *buf;
++
++      status = acpi_resource_to_address64(res, &addr);
++
++      /* Do we care about this range?  Let's check. */
++      if (!ACPI_SUCCESS(status) ||
++          !(addr.resource_type == ACPI_MEMORY_RANGE ||
++            addr.resource_type == ACPI_IO_RANGE) ||
++          !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
++              return AE_OK;
++
++      /*
++       * Furthermore, we really only care to tell the guest about
++       * address ranges that require address translation of some sort.
++       */
++      if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
++            addr.info.mem.translation) &&
++          !(addr.resource_type == ACPI_IO_RANGE &&
++            addr.info.io.translation))
++              return AE_OK;
++         
++      /* Store the resource in xenbus for the guest */
++      len = snprintf(str, sizeof(str), "root-%d-resource-%d",
++                     info->root_num, info->resource_count);
++      /* Key did not fit in str: skip this resource. */
++      if (unlikely(len >= (sizeof(str) - 1)))
++              return AE_OK;
++
++      /* Two hex digits per byte of *res, plus the terminating NUL. */
++      buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
++      if (!buf)
++              return AE_OK;
++
++      /* Clean out resource_source */
++      res->data.address64.resource_source.index = 0xFF;
++      res->data.address64.resource_source.string_length = 0;
++      res->data.address64.resource_source.string_ptr = NULL;
++
++      ptr = (unsigned char *)res;
++
++      /* Turn the acpi_resource into an ASCII byte stream */
++      for (i = 0; i < sizeof(*res); i++) {
++              snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
++              strncat(buf, tmp, 2);
++      }
++
++      err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
++                          str, "%s", buf);
++
++      /* Only count resources that actually reached xenbus. */
++      if (!err)
++              info->resource_count++;
++
++      kfree(buf);
++
++      return AE_OK;
++}
++
++/*
++ * Publish every virtual domain:bus through publish_root_cb and, for each
++ * one, write the controller's translated ACPI resources to xenbus.
++ *
++ * For each controller the function publishes the root, reads "root_num"
++ * and the "root-%d" keys back to find the index just published, walks the
++ * controller's _CRS with write_xenbus_resource(), and stores the number of
++ * resources written under "root-%d-resources".  Finally it writes
++ * "root-resource-magic".
++ *
++ * Returns the value left in err by the last call made; see the review
++ * notes below for paths where that value is not a negative errno.
++ *
++ * NOTE(review): the lock is taken with spin_lock() here while the other
++ * functions in this file use spin_lock_irqsave(), and the resource
++ * callback allocates with GFP_KERNEL while the lock is held - confirm
++ * that this is safe.
++ */
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++                            publish_pci_root_cb publish_root_cb)
++{
++      struct controller_dev_data *dev_data = pdev->pci_dev_data;
++      struct controller_list_entry *cntrl_entry;
++      int i, root_num, len, err = 0;
++      unsigned int domain, bus;
++      char str[64];
++      struct walk_info info;
++
++      spin_lock(&dev_data->lock);
++
++      list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++              /* First publish all the domain:bus info */
++              err = publish_root_cb(pdev, cntrl_entry->domain,
++                                    cntrl_entry->bus);
++              if (err)
++                      goto out;
++
++              /*
++               * Now figure out which root-%d this belongs to
++               * so we can associate resources with it.
++               */
++              err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++                                 "root_num", "%d", &root_num);
++
++              /* Exactly one value is expected from the scan. */
++              if (err != 1)
++                      goto out;
++
++              for (i = 0; i < root_num; i++) {
++                      len = snprintf(str, sizeof(str), "root-%d", i);
++                      if (unlikely(len >= (sizeof(str) - 1))) {
++                              err = -ENOMEM;
++                              goto out;
++                      }
++
++                      err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++                                         str, "%x:%x", &domain, &bus);
++                      if (err != 2)
++                              goto out;
++
++                      /* Is this the one we just published? */
++                      if (domain == cntrl_entry->domain &&
++                          bus == cntrl_entry->bus)
++                              break;
++              }
++
++              /*
++               * The root we just published was not found.
++               * NOTE(review): err still appears to hold the positive
++               * result of the last xenbus_scanf() here, so a positive
++               * value is returned - confirm callers treat it as failure.
++               */
++              if (i == root_num)
++                      goto out;
++
++              info.pdev = pdev;
++              info.resource_count = 0;
++              info.root_num = i;
++
++              /* Let ACPI do the heavy lifting on decoding resources */
++              acpi_walk_resources(cntrl_entry->controller->acpi_handle,
++                                  METHOD_NAME__CRS, write_xenbus_resource,
++                                  &info);
++
++              /* No resources.  OK.  On to the next one */
++              if (!info.resource_count)
++                      continue;
++
++              /* Store the number of resources we wrote for this root-%d */
++              len = snprintf(str, sizeof(str), "root-%d-resources", i);
++              if (unlikely(len >= (sizeof(str) - 1))) {
++                      err = -ENOMEM;
++                      goto out;
++              }
++
++              err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++                                  "%d", info.resource_count);
++              if (err)
++                      goto out;
++      }
++
++      /* Finally, write some magic to synchronize with the guest. */
++      len = snprintf(str, sizeof(str), "root-resource-magic");
++      if (unlikely(len >= (sizeof(str) - 1))) {
++              err = -ENOMEM;
++              goto out;
++      }
++
++      /* The value equals the buffer size used per encoded resource. */
++      err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++                          "%lx", (sizeof(struct acpi_resource) * 2) + 1);
++
++out:
++      spin_unlock(&dev_data->lock);
++
++      return err;
++}
++
++/*
++ * Tear down everything pdev exports: return every device through
++ * pcistub_put_pci_dev(), free all device and controller entries, then free
++ * the bookkeeping structure and clear pdev->pci_dev_data.
++ */
++void pciback_release_devices(struct pciback_device *pdev)
++{
++      struct controller_dev_data *data = pdev->pci_dev_data;
++      struct controller_list_entry *ctrl;
++      struct controller_dev_entry *ent;
++
++      /* Pop controllers from the head until none are left. */
++      while (!list_empty(&data->list)) {
++              ctrl = list_entry(data->list.next,
++                                struct controller_list_entry, list);
++
++              /* Same for the devices under this controller. */
++              while (!list_empty(&ctrl->dev_list)) {
++                      ent = list_entry(ctrl->dev_list.next,
++                                       struct controller_dev_entry, list);
++                      list_del(&ent->list);
++                      pcistub_put_pci_dev(ent->dev);
++                      kfree(ent);
++              }
++
++              list_del(&ctrl->list);
++              kfree(ctrl);
++      }
++
++      kfree(data);
++      pdev->pci_dev_data = NULL;
++}
++
++/*
++ * Find the virtual address under which pcidev is exported by pdev.
++ *
++ * An entry matches when its device has the same bus number, devfn and PCI
++ * domain as pcidev.  On a match the virtual domain, bus and devfn are
++ * stored through the output pointers and 1 is returned; otherwise the
++ * outputs are left untouched and 0 is returned.
++ */
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++              struct pciback_device *pdev,
++              unsigned int *domain, unsigned int *bus, unsigned int *devfn)
++{
++      struct controller_dev_data *data = pdev->pci_dev_data;
++      struct controller_list_entry *ctrl;
++      struct controller_dev_entry *ent;
++      unsigned long irqflags;
++      int found = 0;
++
++      spin_lock_irqsave(&data->lock, irqflags);
++
++      list_for_each_entry(ctrl, &data->list, list) {
++              list_for_each_entry(ent, &ctrl->dev_list, list) {
++                      if (ent->dev->bus->number != pcidev->bus->number)
++                              continue;
++                      if (ent->dev->devfn != pcidev->devfn)
++                              continue;
++                      if (pci_domain_nr(ent->dev->bus) !=
++                          pci_domain_nr(pcidev->bus))
++                              continue;
++
++                      found = 1;
++                      *domain = ctrl->domain;
++                      *bus = ctrl->bus;
++                      *devfn = ent->devfn;
++                      goto unlock;
++              }
++      }
++
++unlock:
++      spin_unlock_irqrestore(&data->lock, irqflags);
++      return found;
++}
++
diff --cc drivers/xen/pciback/passthrough.c

index 0000000,0000000..9e7a0c4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/passthrough.c
@@@ -1,0 -1,0 +1,176 @@@
++/*
++ * PCI Backend - Provides restricted access to the real PCI bus topology
++ *               to the frontend
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/list.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
++
++/* Per-pciback_device state, stored in pdev->pci_dev_data. */
++struct passthrough_dev_data {
++      /* Access to dev_list must be protected by lock */
++      struct list_head dev_list;      /* struct pci_dev_entry items */
++      spinlock_t lock;
++};
++
++/*
++ * Look up an exported device by its real domain:bus:devfn.
++ * Returns the first matching pci_dev on the export list, or NULL.
++ */
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++                                  unsigned int domain, unsigned int bus,
++                                  unsigned int devfn)
++{
++      struct passthrough_dev_data *data = pdev->pci_dev_data;
++      struct pci_dev_entry *ent;
++      struct pci_dev *match = NULL;
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&data->lock, irqflags);
++
++      list_for_each_entry(ent, &data->dev_list, list) {
++              if (domain != (unsigned int)pci_domain_nr(ent->dev->bus))
++                      continue;
++              if (bus != (unsigned int)ent->dev->bus->number)
++                      continue;
++              if (devfn != ent->dev->devfn)
++                      continue;
++
++              match = ent->dev;
++              break;
++      }
++
++      spin_unlock_irqrestore(&data->lock, irqflags);
++
++      return match;
++}
++
++/*
++ * Add dev to pdev's export list and publish it under its real
++ * domain:bus:devfn through publish_cb.
++ *
++ * Returns -ENOMEM if the list entry cannot be allocated, otherwise the
++ * value returned by publish_cb.  The entry stays on the list even when
++ * publish_cb reports an error.
++ */
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++                      int devid, publish_pci_dev_cb publish_cb)
++{
++      struct passthrough_dev_data *data = pdev->pci_dev_data;
++      struct pci_dev_entry *ent;
++      unsigned long irqflags;
++      unsigned int dom, busnr, fn;
++
++      ent = kmalloc(sizeof(*ent), GFP_KERNEL);
++      if (!ent)
++              return -ENOMEM;
++      ent->dev = dev;
++
++      spin_lock_irqsave(&data->lock, irqflags);
++      list_add_tail(&ent->list, &data->dev_list);
++      spin_unlock_irqrestore(&data->lock, irqflags);
++
++      /* Publish the device under its real address. */
++      dom = (unsigned int)pci_domain_nr(dev->bus);
++      busnr = (unsigned int)dev->bus->number;
++      fn = dev->devfn;
++
++      return publish_cb(pdev, dom, busnr, fn, devid);
++}
++
++/*
++ * Remove every export-list entry that refers to dev and, if at least one
++ * was removed, return the device through pcistub_put_pci_dev().
++ */
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++      struct passthrough_dev_data *data = pdev->pci_dev_data;
++      struct pci_dev_entry *ent, *next;
++      struct pci_dev *released = NULL;
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&data->lock, irqflags);
++
++      list_for_each_entry_safe(ent, next, &data->dev_list, list) {
++              if (ent->dev != dev)
++                      continue;
++
++              released = ent->dev;
++              list_del(&ent->list);
++              kfree(ent);
++      }
++
++      spin_unlock_irqrestore(&data->lock, irqflags);
++
++      if (!released)
++              return;
++
++      pcistub_put_pci_dev(released);
++}
++
++/*
++ * Allocate and initialise the export list for pdev and store it in
++ * pdev->pci_dev_data.  Returns 0 on success or -ENOMEM.
++ */
++int pciback_init_devices(struct pciback_device *pdev)
++{
++      struct passthrough_dev_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
++
++      if (!data)
++              return -ENOMEM;
++
++      INIT_LIST_HEAD(&data->dev_list);
++      spin_lock_init(&data->lock);
++
++      pdev->pci_dev_data = data;
++      return 0;
++}
++
++/*
++ * Publish the real domain:bus of every exported device that has no
++ * exported parent bridge.
++ *
++ * For each device the chain of upstream bridges (bus->self) is walked; if
++ * any of them is itself on the export list the device is skipped.
++ * Returns 0, or the first non-zero value returned by publish_root_cb, at
++ * which point publishing stops.
++ */
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++                            publish_pci_root_cb publish_root_cb)
++{
++      struct passthrough_dev_data *data = pdev->pci_dev_data;
++      struct pci_dev_entry *ent, *other;
++      int err = 0;
++
++      spin_lock(&data->lock);
++
++      list_for_each_entry(ent, &data->dev_list, list) {
++              struct pci_dev *bridge = ent->dev->bus->self;
++              int parent_exported = 0;
++
++              /* Climb towards the root looking for an exported bridge. */
++              while (!parent_exported && bridge != NULL) {
++                      list_for_each_entry(other, &data->dev_list, list) {
++                              if (other->dev == bridge) {
++                                      parent_exported = 1;
++                                      break;
++                              }
++                      }
++                      bridge = bridge->bus->self;
++              }
++
++              if (parent_exported)
++                      continue;
++
++              err = publish_root_cb(pdev,
++                              (unsigned int)pci_domain_nr(ent->dev->bus),
++                              (unsigned int)ent->dev->bus->number);
++              if (err)
++                      break;
++      }
++
++      spin_unlock(&data->lock);
++
++      return err;
++}
++
++/*
++ * Return every exported device through pcistub_put_pci_dev(), free the
++ * list entries and the bookkeeping structure, and clear
++ * pdev->pci_dev_data.
++ */
++void pciback_release_devices(struct pciback_device *pdev)
++{
++      struct passthrough_dev_data *data = pdev->pci_dev_data;
++      struct pci_dev_entry *ent;
++
++      /* Pop entries from the head until the list is empty. */
++      while (!list_empty(&data->dev_list)) {
++              ent = list_entry(data->dev_list.next,
++                               struct pci_dev_entry, list);
++              list_del(&ent->list);
++              pcistub_put_pci_dev(ent->dev);
++              kfree(ent);
++      }
++
++      kfree(data);
++      pdev->pci_dev_data = NULL;
++}
++
++/*
++ * Report the address under which pcidev is seen by the frontend.  In this
++ * backend that is the device's own domain, bus number and devfn; pdev is
++ * not consulted and the function always returns 1.
++ */
++int pciback_get_pcifront_dev(struct pci_dev *pcidev, struct pciback_device *pdev, 
++              unsigned int *domain, unsigned int *bus, unsigned int *devfn)
++
++{
++      *domain = pci_domain_nr(pcidev->bus);
++      *bus = pcidev->bus->number;
++      *devfn = pcidev->devfn;
++      return 1;
++}
diff --cc drivers/xen/pciback/pci_stub.c

index 0000000,0000000..aa5ea31

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/pci_stub.c
@@@ -1,0 -1,0 +1,1325 @@@
++/*
++ * PCI Stub Driver - Grabs devices in backend to be exported later
++ *
++ * Ryan Wilson <hap9@epoch.ncsc.mil>
++ * Chris Bookholt <hap10@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/gfp.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/rwsem.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/kref.h>
++#include <linux/pci.h>
++#include <linux/wait.h>
++#include <asm/atomic.h>
++#include <xen/evtchn.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
++
++/* String supplied through the "hide" module parameter (mode 0444) */
++static char *pci_devs_to_hide = NULL;
++/* common_process() sleeps on this queue until _XEN_PCIB_active is cleared
++ * in the shared info flags or the wait times out.
++ */
++wait_queue_head_t aer_wait_queue;
++/* Semaphore synchronizing AER handling with pciback remove/reconfigure
++ * operations: we want to avoid pciback devices being removed in the
++ * middle of AER processing.
++ */
++static DECLARE_RWSEM(pcistub_sem);
++module_param_named(hide, pci_devs_to_hide, charp, 0444);
++
++/* One PCI slot that pciback has been asked to seize */
++struct pcistub_device_id {
++      /* link in pcistub_device_ids */
++      struct list_head slot_list;
++      /* compared against pci_domain_nr() of a candidate device's bus */
++      int domain;
++      /* compared against the candidate's bus->number */
++      unsigned char bus;
++      /* PCI_DEVFN(slot, func) of the requested device */
++      unsigned int devfn;
++};
++/* List of requested slots; modified and walked under device_ids_lock */
++static LIST_HEAD(pcistub_device_ids);
++static DEFINE_SPINLOCK(device_ids_lock);
++
++/* A PCI device seized by the stub driver, reference-counted via kref */
++struct pcistub_device {
++      /* released through pcistub_device_release() on the final put */
++      struct kref kref;
++      /* link in either seized_devices or pcistub_devices */
++      struct list_head dev_list;
++      /* protects the pdev field */
++      spinlock_t lock;
++
++      /* underlying PCI device; a reference is held while this exists */
++      struct pci_dev *dev;
++      struct pciback_device *pdev;    /* non-NULL if struct pci_dev is in use */
++};
++
++/* Access to pcistub_devices & seized_devices lists and the initialize_devices
++ * flag must be locked with pcistub_devices_lock
++ */
++static DEFINE_SPINLOCK(pcistub_devices_lock);
++/* Fully initialized stub devices */
++static LIST_HEAD(pcistub_devices);
++
++/* wait for device_initcall before initializing our devices
++ * (see pcistub_init_devices_late)
++ */
++static int initialize_devices = 0;
++/* Devices seized before initialize_devices was set; they are initialized
++ * and moved to pcistub_devices by pcistub_init_devices_late().
++ */
++static LIST_HEAD(seized_devices);
++
++/* Allocate a stub wrapper around @dev. A reference on @dev is taken and the
++ * wrapper's kref and spinlock are initialized. Returns the new wrapper, or
++ * NULL if the allocation or the device reference could not be obtained.
++ */
++static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
++{
++      struct pcistub_device *stub;
++
++      dev_dbg(&dev->dev, "pcistub_device_alloc\n");
++
++      stub = kzalloc(sizeof(*stub), GFP_ATOMIC);
++      if (stub == NULL)
++              return NULL;
++
++      stub->dev = pci_dev_get(dev);
++      if (stub->dev == NULL)
++              goto free_stub;
++
++      kref_init(&stub->kref);
++      spin_lock_init(&stub->lock);
++      return stub;
++
++free_stub:
++      kfree(stub);
++      return NULL;
++}
++
++/* kref release callback: tears down a stub device once its last reference
++ * is dropped. Don't call this directly as it's called by pcistub_device_put.
++ */
++static void pcistub_device_release(struct kref *kref)
++{
++      struct pcistub_device *psdev;
++
++      psdev = container_of(kref, struct pcistub_device, kref);
++
++      dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
++
++      /* Clean-up the device */
++      pciback_reset_device(psdev->dev);
++      pciback_config_free_dyn_fields(psdev->dev);
++      pciback_config_free_dev(psdev->dev);
++      /* free the driver data attached to the device and clear the pointer */
++      kfree(pci_get_drvdata(psdev->dev));
++      pci_set_drvdata(psdev->dev, NULL);
++
++      /* drop the reference taken by pci_dev_get() in pcistub_device_alloc */
++      pci_dev_put(psdev->dev);
++
++      kfree(psdev);
++}
++
++/* Take an additional reference on @psdev */
++static inline void pcistub_device_get(struct pcistub_device *psdev)
++{
++      kref_get(&psdev->kref);
++}
++
++/* Drop a reference on @psdev; the final put runs pcistub_device_release() */
++static inline void pcistub_device_put(struct pcistub_device *psdev)
++{
++      kref_put(&psdev->kref, pcistub_device_release);
++}
++
++/* Search pcistub_devices for the device at domain:bus:slot.func.
++ * On a match a reference is taken on the stub device and it is returned;
++ * the caller must drop that reference with pcistub_device_put().
++ * Returns NULL if no matching device is on the list.
++ */
++static struct pcistub_device *pcistub_device_find(int domain, int bus,
++                                                int slot, int func)
++{
++      struct pcistub_device *cur, *match = NULL;
++      unsigned long flags;
++
++      spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++      list_for_each_entry(cur, &pcistub_devices, dev_list) {
++              if (cur->dev == NULL)
++                      continue;
++              if (domain != pci_domain_nr(cur->dev->bus))
++                      continue;
++              if (bus != cur->dev->bus->number)
++                      continue;
++              if (PCI_DEVFN(slot, func) != cur->dev->devfn)
++                      continue;
++
++              pcistub_device_get(cur);
++              match = cur;
++              break;
++      }
++
++      spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++      return match;
++}
++
++/* Try to claim @psdev for @pdev. If the stub device has no owner yet, @pdev
++ * becomes its owner and the underlying pci_dev is returned with a reference
++ * held on @psdev. If it is already owned, NULL is returned and no reference
++ * is kept.
++ */
++static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
++                                                struct pcistub_device *psdev)
++{
++      struct pci_dev *claimed = NULL;
++      unsigned long flags;
++
++      pcistub_device_get(psdev);
++
++      spin_lock_irqsave(&psdev->lock, flags);
++      if (psdev->pdev == NULL) {
++              psdev->pdev = pdev;
++              claimed = psdev->dev;
++      }
++      spin_unlock_irqrestore(&psdev->lock, flags);
++
++      if (claimed != NULL)
++              return claimed;
++
++      /* already in use: give back the reference taken above */
++      pcistub_device_put(psdev);
++      return NULL;
++}
++
++/* Find the stub device at domain:bus:slot.func and try to claim it for
++ * @pdev. Returns the pci_dev on success, or NULL when no such stub device
++ * exists or the first match is already owned.
++ */
++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
++                                          int domain, int bus,
++                                          int slot, int func)
++{
++      struct pcistub_device *cur;
++      struct pci_dev *claimed = NULL;
++      unsigned long flags;
++
++      spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++      list_for_each_entry(cur, &pcistub_devices, dev_list) {
++              if (cur->dev == NULL)
++                      continue;
++              if (domain != pci_domain_nr(cur->dev->bus)
++                  || bus != cur->dev->bus->number
++                  || PCI_DEVFN(slot, func) != cur->dev->devfn)
++                      continue;
++
++              /* only the first matching entry is considered */
++              claimed = pcistub_device_get_pci_dev(pdev, cur);
++              break;
++      }
++
++      spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++      return claimed;
++}
++
++/* Find the stub device wrapping @dev and try to claim it for @pdev.
++ * Returns @dev on success, or NULL when @dev is not on pcistub_devices or
++ * is already owned.
++ */
++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
++                                  struct pci_dev *dev)
++{
++      struct pcistub_device *cur;
++      struct pci_dev *claimed = NULL;
++      unsigned long flags;
++
++      spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++      list_for_each_entry(cur, &pcistub_devices, dev_list) {
++              if (cur->dev != dev)
++                      continue;
++
++              claimed = pcistub_device_get_pci_dev(pdev, cur);
++              break;
++      }
++
++      spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++      return claimed;
++}
++
++/* Hand @dev back to the stub driver: reset the device and its config-space
++ * state so it is ready for the next domain, clear its owner and drop the
++ * reference that was taken when it was claimed (see
++ * pcistub_device_get_pci_dev). If @dev is not on pcistub_devices a warning
++ * is emitted and nothing else is done.
++ */
++void pcistub_put_pci_dev(struct pci_dev *dev)
++{
++      struct pcistub_device *psdev, *found_psdev = NULL;
++      unsigned long flags;
++
++      spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++      list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++              if (psdev->dev == dev) {
++                      found_psdev = psdev;
++                      break;
++              }
++      }
++
++      spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++      /* The search can come up empty (pcistub_remove() unlinks devices
++       * from the list); never dereference a NULL stub device below.
++       */
++      if (WARN_ON(!found_psdev))
++              return;
++
++      /* Hold this lock to avoid breaking the link between
++       * pcistub and pciback while AER is being processed
++       */
++      down_write(&pcistub_sem);
++      /* Cleanup our device
++       * (so it's ready for the next domain)
++       */
++      pciback_reset_device(found_psdev->dev);
++      pciback_config_free_dyn_fields(found_psdev->dev);
++      pciback_config_reset_dev(found_psdev->dev);
++
++      spin_lock_irqsave(&found_psdev->lock, flags);
++      found_psdev->pdev = NULL;
++      spin_unlock_irqrestore(&found_psdev->lock, flags);
++
++      pcistub_device_put(found_psdev);
++      up_write(&pcistub_sem);
++}
++
++/* Return 1 if @dev, or any bridge above it, sits at the domain/bus/devfn
++ * recorded in @pdev_id; otherwise return 0.
++ */
++static int __devinit pcistub_match_one(struct pci_dev *dev,
++                                     struct pcistub_device_id *pdev_id)
++{
++      /* Walk from the device up through its parent bridges */
++      while (dev != NULL) {
++              if (pci_domain_nr(dev->bus) == pdev_id->domain
++                  && dev->bus->number == pdev_id->bus
++                  && dev->devfn == pdev_id->devfn)
++                      return 1;
++
++              /* Sometimes topmost bridge links to itself. */
++              if (dev->bus->self == dev)
++                      break;
++
++              dev = dev->bus->self;
++      }
++
++      return 0;
++}
++
++/* Return 1 if @dev (or one of its parent bridges) matches any entry of
++ * pcistub_device_ids, 0 otherwise. The list is walked under
++ * device_ids_lock.
++ */
++static int __devinit pcistub_match(struct pci_dev *dev)
++{
++      struct pcistub_device_id *id;
++      unsigned long flags;
++      int matched = 0;
++
++      spin_lock_irqsave(&device_ids_lock, flags);
++      list_for_each_entry(id, &pcistub_device_ids, slot_list) {
++              if (!pcistub_match_one(dev, id))
++                      continue;
++
++              matched = 1;
++              break;
++      }
++      spin_unlock_irqrestore(&device_ids_lock, flags);
++
++      return matched;
++}
++
++/* Prepare @dev for export: allocate and attach its pciback_dev_data,
++ * initialize its config-space state, enable the device once and then
++ * reset it. Returns 0 on success or a negative errno; on failure the
++ * driver data is detached and freed again.
++ */
++static int __devinit pcistub_init_device(struct pci_dev *dev)
++{
++      struct pciback_dev_data *dev_data;
++      int err = 0;
++
++      dev_dbg(&dev->dev, "initializing...\n");
++
++      /* The PCI backend is not intended to be a module (or to work with
++       * removable PCI devices (yet)). If it were, pciback_config_free()
++       * would need to be called somewhere to free the memory allocated
++       * here and then to call kfree(pci_get_drvdata(psdev->dev)).
++       */
++      dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC);
++      if (!dev_data) {
++              err = -ENOMEM;
++              goto out;
++      }
++      pci_set_drvdata(dev, dev_data);
++
++      dev_dbg(&dev->dev, "initializing config\n");
++
++      /* NOTE(review): aer_wait_queue is a single global queue, yet it is
++       * (re)initialized on every device initialization - confirm this is
++       * intended.
++       */
++      init_waitqueue_head(&aer_wait_queue);
++      err = pciback_config_init_dev(dev);
++      if (err)
++              goto out;
++
++      /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
++       * must do this here because pcibios_enable_device may specify
++       * the pci device's true irq (and possibly its other resources)
++       * if they differ from what's in the configuration space.
++       * This makes the assumption that the device's resources won't
++       * change after this point (otherwise this code may break!)
++       */
++      dev_dbg(&dev->dev, "enabling device\n");
++      err = pci_enable_device(dev);
++      if (err)
++              goto config_release;
++
++      /* Now disable the device (this also ensures some private device
++       * data is setup before we export)
++       */
++      dev_dbg(&dev->dev, "reset device\n");
++      pciback_reset_device(dev);
++
++      return 0;
++
++      /* undo pciback_config_init_dev() */
++      config_release:
++      pciback_config_free_dev(dev);
++
++      /* detach and free the driver data (dev_data may be NULL here) */
++      out:
++      pci_set_drvdata(dev, NULL);
++      kfree(dev_data);
++      return err;
++}
++
++/*
++ * Because some initialization still happens on
++ * devices during fs_initcall, we need to defer
++ * full initialization of our devices until
++ * device_initcall.
++ *
++ * Drains seized_devices: each entry is initialized with the list lock
++ * dropped and, on success, appended to pcistub_devices. Entries that fail
++ * to initialize are logged and discarded. Afterwards initialize_devices is
++ * set so that pcistub_seize() initializes newly seized devices right away.
++ * Always returns 0.
++ */
++static int __init pcistub_init_devices_late(void)
++{
++      struct pcistub_device *psdev;
++      unsigned long flags;
++      int err = 0;
++
++      pr_debug("pciback: pcistub_init_devices_late\n");
++
++      spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++      while (!list_empty(&seized_devices)) {
++              psdev = container_of(seized_devices.next,
++                                   struct pcistub_device, dev_list);
++              list_del(&psdev->dev_list);
++
++              /* pcistub_init_device() must not run under the spinlock */
++              spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++              err = pcistub_init_device(psdev->dev);
++              if (err) {
++                      dev_err(&psdev->dev->dev,
++                              "error %d initializing device\n", err);
++                      /* Drop the reference pcistub_device_alloc() took
++                       * with pci_dev_get(); freeing the wrapper alone
++                       * would leak it.
++                       */
++                      pci_dev_put(psdev->dev);
++                      kfree(psdev);
++                      psdev = NULL;
++              }
++
++              spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++              if (psdev)
++                      list_add_tail(&psdev->dev_list, &pcistub_devices);
++      }
++
++      initialize_devices = 1;
++
++      spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++      return 0;
++}
++
++/* Wrap @dev in a pcistub_device and take ownership of it. If
++ * initialize_devices is already set the device is initialized immediately
++ * and added to pcistub_devices; otherwise it is queued on seized_devices
++ * for pcistub_init_devices_late(). Returns 0 on success, -ENOMEM if the
++ * wrapper cannot be allocated, or the error from pcistub_init_device().
++ */
++static int __devinit pcistub_seize(struct pci_dev *dev)
++{
++      struct pcistub_device *psdev;
++      unsigned long flags;
++      int err = 0;
++
++      psdev = pcistub_device_alloc(dev);
++      if (!psdev)
++              return -ENOMEM;
++
++      spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++      if (initialize_devices) {
++              spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++              /* don't want irqs disabled when calling pcistub_init_device */
++              err = pcistub_init_device(psdev->dev);
++
++              spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++              if (!err)
++                      list_add(&psdev->dev_list, &pcistub_devices);
++      } else {
++              dev_dbg(&dev->dev, "deferring initialization\n");
++              list_add(&psdev->dev_list, &seized_devices);
++      }
++
++      spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++      /* initialization failed: the wrapper was never put on a list */
++      if (err)
++              pcistub_device_put(psdev);
++
++      return err;
++}
++
++/* PCI driver probe callback. Seizes @dev when it (or a parent bridge)
++ * matches the list of requested slots, or - with CONFIG_PCI_GUESTDEV -
++ * when it is a normal-header device for which pci_is_guestdev() is true.
++ * Returns 0 if the device was seized, -ENODEV if it is not wanted or has
++ * an unsupported header type, or the error from pcistub_seize().
++ */
++static int __devinit pcistub_probe(struct pci_dev *dev,
++                                 const struct pci_device_id *id)
++{
++      int err = 0;
++
++      dev_dbg(&dev->dev, "probing...\n");
++
++      if (pcistub_match(dev)) {
++
++              /* only header types 0 (normal) and 1 (bridge) can be exported */
++              if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
++                  && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
++                      dev_err(&dev->dev, "can't export pci devices that "
++                              "don't have a normal (0) or bridge (1) "
++                              "header type!\n");
++                      err = -ENODEV;
++                      goto out;
++              }
++
++              dev_info(&dev->dev, "seizing device\n");
++              err = pcistub_seize(dev);
++#ifdef CONFIG_PCI_GUESTDEV
++      } else if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
++              /* not on the slot list: fall back to the guestdev check */
++              if (!pci_is_guestdev(dev)) {
++                      err = -ENODEV;
++                      goto out;
++              }
++
++              dev_info(&dev->dev, "seizing device\n");
++              err = pcistub_seize(dev);
++#endif /* CONFIG_PCI_GUESTDEV */
++      } else
++              /* Didn't find the device */
++              err = -ENODEV;
++
++      out:
++      return err;
++}
++
++/* PCI driver remove callback. Releases the config-space quirks of @dev,
++ * and if @dev is on pcistub_devices: warns and releases it from its
++ * pciback_device when it is still in use, unlinks it from the list and
++ * drops the list's reference.
++ */
++static void pcistub_remove(struct pci_dev *dev)
++{
++      struct pcistub_device *psdev, *found_psdev = NULL;
++      unsigned long flags;
++
++      dev_dbg(&dev->dev, "removing\n");
++
++      spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++      pciback_config_quirk_release(dev);
++
++      list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++              if (psdev->dev == dev) {
++                      found_psdev = psdev;
++                      break;
++              }
++      }
++
++      spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++      if (found_psdev) {
++              dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
++                      found_psdev->pdev);
++
++              /* a non-NULL pdev means a pciback_device still owns it */
++              if (found_psdev->pdev) {
++                      pr_warning("pciback: ****** removing device %s"
++                                 " while still in-use! ******\n",
++                                 pci_name(found_psdev->dev));
++                      pr_warning("pciback: ****** driver domain may still"
++                                 " access this device's i/o resources!\n");
++                      pr_warning("pciback: ****** shutdown driver "
++                                 "domain before binding device\n");
++                      pr_warning("pciback: ****** to other drivers "
++                                 "or domains\n");
++
++                      pciback_release_pci_dev(found_psdev->pdev,
++                                              found_psdev->dev);
++              }
++
++              spin_lock_irqsave(&pcistub_devices_lock, flags);
++              list_del(&found_psdev->dev_list);
++              spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++              /* the final put for releasing from the list */
++              pcistub_device_put(found_psdev);
++      }
++}
++
++/* Wildcard ID table: every vendor/device/subsystem ID is PCI_ANY_ID, so
++ * the table itself does no filtering; pcistub_probe() decides which
++ * devices are actually seized.
++ */
++static DEFINE_PCI_DEVICE_TABLE(pcistub_ids) = {
++      {
++       .vendor = PCI_ANY_ID,
++       .device = PCI_ANY_ID,
++       .subvendor = PCI_ANY_ID,
++       .subdevice = PCI_ANY_ID,
++       },
++      /* terminating all-zero entry */
++      {0,},
++};
++
++/* Flag an AER failure for the domain that owns @psdev by writing
++ * "aerfail" to the "aerState" key under the backend's xenstore node
++ * (/local/domain/0/backend/pci/<otherend_id>/0). The write is retried
++ * while the transaction ends with -EAGAIN.
++ *
++ * @psdev must have a non-NULL pdev; a NULL @psdev is reported and ignored.
++ */
++static void kill_domain_by_device(struct pcistub_device *psdev)
++{
++      struct xenbus_transaction xbt;
++      int err;
++      char *nodename;
++
++      if (!psdev) {
++              /* There is no device to attribute the message to here, so
++               * dev_err(&psdev->dev->dev, ...) would dereference NULL.
++               */
++              pr_err("pciback: device is NULL when do AER "
++                     "recovery/kill_domain\n");
++              return;
++      }
++
++      nodename = kasprintf(GFP_KERNEL,
++                           "/local/domain/0/backend/pci/%d/0",
++                           psdev->pdev->xdev->otherend_id);
++      if (!nodename) {
++              dev_err(&psdev->dev->dev,
++                      "not enough memory\n");
++              return;
++      }
++
++      do {
++              err = xenbus_transaction_start(&xbt);
++              if (err) {
++                      dev_err(&psdev->dev->dev,
++                              "error %d starting xenbus transaction\n", err);
++                      break;
++              }
++
++              /* PV AER handlers will set this flag */
++              xenbus_printf(xbt, nodename, "aerState" , "aerfail" );
++
++              err = xenbus_transaction_end(xbt, 0);
++              switch (err) {
++              default:
++                      dev_err(&psdev->dev->dev,
++                              "error %d ending xenbus transaction\n", err);
++                      break;
++              case 0:
++              case -EAGAIN:
++                      break;
++              }
++      } while (err == -EAGAIN);
++      kfree(nodename);
++}
++
++/* For each AER recovery step (error_detected, mmio_enabled, etc.) the
++ * frontend and backend need to cooperate. In pciback those steps all do a
++ * similar job: send a service request and wait for the frontend's response.
++ *
++ * @psdev:   stub device the error was reported for; the code dereferences
++ *           psdev->pdev and psdev->pdev->sh_info without checking them
++ * @state:   value stored in aer_op->err for the frontend
++ * @aer_cmd: value stored in aer_op->cmd
++ * @result:  value returned if the frontend does not respond in time
++ *
++ * Returns the frontend's answer read back from aer_op->err,
++ * PCI_ERS_RESULT_NONE if the frontend device address cannot be determined,
++ * or @result on timeout.
++ */
++static pci_ers_result_t common_process(struct pcistub_device *psdev, 
++              pci_channel_state_t state, int aer_cmd, pci_ers_result_t result)
++{
++      pci_ers_result_t res = result;
++      struct xen_pcie_aer_op *aer_op;
++      int ret;
++
++      /*with PV AER drivers*/
++      aer_op = &(psdev->pdev->sh_info->aer_op);
++      aer_op->cmd = aer_cmd ;
++      /*useful for error_detected callback*/
++      aer_op->err = state;
++      /*pcifront_end BDF*/
++      ret = pciback_get_pcifront_dev(psdev->dev, psdev->pdev,
++              &aer_op->domain, &aer_op->bus, &aer_op->devfn);
++      if (!ret) {
++              dev_err(&psdev->dev->dev,
++                      "pciback: failed to get pcifront device\n");
++              return PCI_ERS_RESULT_NONE; 
++      }
++      /* the request fields are written before the flags are raised below */
++      wmb();
++
++      dev_dbg(&psdev->dev->dev, 
++                      "pciback: aer_op %x dom %x bus %x devfn %x\n",  
++                      aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
++      /* Local flag marking that an AER request is pending; the pciback
++       * callback uses this flag to decide whether it needs to check for
++       * pcifront's AER service ack signal
++       */
++      set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
++
++      /* It is possible that a pcifront conf_read_write ops request invokes
++       * the callback, which causes a spurious execution of wake_up.
++       * Yet it is harmless and better than a spinlock here
++       */
++      set_bit(_XEN_PCIB_active, 
++              (unsigned long *)&psdev->pdev->sh_info->flags);
++      wmb();
++      notify_remote_via_irq(psdev->pdev->evtchn_irq);
++
++      /* sleep until _XEN_PCIB_active is cleared, for at most 300 seconds */
++      ret = wait_event_timeout(aer_wait_queue, !(test_bit(_XEN_PCIB_active,
++                (unsigned long *)&psdev->pdev->sh_info->flags)), 300*HZ);
++
++      if (!ret) {
++              if (test_bit(_XEN_PCIB_active, 
++                      (unsigned long *)&psdev->pdev->sh_info->flags)) {
++                      dev_err(&psdev->dev->dev, 
++                              "pcifront aer process not responding!\n");
++                      clear_bit(_XEN_PCIB_active,
++                        (unsigned long *)&psdev->pdev->sh_info->flags);
++                      aer_op->err = PCI_ERS_RESULT_NONE;
++                      /* NOTE(review): this path returns the caller's
++                       * @result and leaves _PCIB_op_pending set - confirm
++                       * that this is intended.
++                       */
++                      return res;
++              }
++      }
++      clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
++
++      /* a frontend request is flagged as active: schedule its processing */
++      if ( test_bit( _XEN_PCIF_active,
++              (unsigned long*)&psdev->pdev->sh_info->flags)) {
++              dev_dbg(&psdev->dev->dev, 
++                      "schedule pci_conf service in pciback \n");
++              test_and_schedule_op(psdev->pdev);
++      }
++
++      res = (pci_ers_result_t)aer_op->err;
++      return res;
++} 
++
++/*
++ * pciback_slot_reset: send the slot_reset request to pcifront, in case the
++ * guest's device driver provides this service, and wait for pcifront's ack.
++ * @dev: pointer to the PCI device
++ *
++ * Returns PCI_ERS_RESULT_RECOVERED unless common_process() is reached, in
++ * which case its result is returned. The return value is used by the
++ * aer_core do_recovery policy.
++ */
++static pci_ers_result_t pciback_slot_reset(struct pci_dev *dev)
++{
++      struct pcistub_device *psdev;
++      pci_ers_result_t result;
++
++      result = PCI_ERS_RESULT_RECOVERED;
++      dev_dbg(&dev->dev, "pciback_slot_reset(bus:%x,devfn:%x)\n",
++              dev->bus->number, dev->devfn);
++
++      down_write(&pcistub_sem);
++      /* a successful lookup takes a reference, dropped at "end" */
++      psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++                              dev->bus->number,
++                              PCI_SLOT(dev->devfn),
++                              PCI_FUNC(dev->devfn));
++
++      if (!psdev || !psdev->pdev) {
++              dev_err(&dev->dev,
++                      "pciback device is not found/assigned\n");
++              goto end;
++      }
++
++      if (!psdev->pdev->sh_info) {
++              dev_err(&dev->dev, "pciback device is not connected or owned"
++                      " by HVM, kill it\n");
++              kill_domain_by_device(psdev);
++              goto end;
++      }
++
++      if (!test_bit(_XEN_PCIB_AERHANDLER,
++              (unsigned long *)&psdev->pdev->sh_info->flags)) {
++              dev_err(&dev->dev,
++                      "guest with no AER driver should have been killed\n");
++              goto end;
++      }
++      result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
++
++      if (result == PCI_ERS_RESULT_NONE ||
++              result == PCI_ERS_RESULT_DISCONNECT) {
++              dev_dbg(&dev->dev,
++                      "No AER slot_reset service or disconnected!\n");
++              kill_domain_by_device(psdev);
++      }
++end:
++      /* psdev can be non-NULL even when it has no pdev assigned, so the
++       * lookup reference must be dropped on every path that found it.
++       */
++      if (psdev)
++              pcistub_device_put(psdev);
++      up_write(&pcistub_sem);
++      return result;
++}
++
++
++/*
++ * pciback_mmio_enabled: send the mmio_enabled request to pcifront, in case
++ * the guest's device driver provides this service, and wait for pcifront's
++ * ack.
++ * @dev: pointer to the PCI device
++ *
++ * Returns PCI_ERS_RESULT_RECOVERED unless common_process() is reached, in
++ * which case its result is returned. The return value is used by the
++ * aer_core do_recovery policy.
++ */
++static pci_ers_result_t pciback_mmio_enabled(struct pci_dev *dev)
++{
++      struct pcistub_device *psdev;
++      pci_ers_result_t result;
++
++      result = PCI_ERS_RESULT_RECOVERED;
++      dev_dbg(&dev->dev, "pciback_mmio_enabled(bus:%x,devfn:%x)\n",
++              dev->bus->number, dev->devfn);
++
++      down_write(&pcistub_sem);
++      /* a successful lookup takes a reference, dropped at "end" */
++      psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++                              dev->bus->number,
++                              PCI_SLOT(dev->devfn),
++                              PCI_FUNC(dev->devfn));
++
++      if (!psdev || !psdev->pdev) {
++              dev_err(&dev->dev,
++                      "pciback device is not found/assigned\n");
++              goto end;
++      }
++
++      if (!psdev->pdev->sh_info) {
++              dev_err(&dev->dev, "pciback device is not connected or owned"
++                      " by HVM, kill it\n");
++              kill_domain_by_device(psdev);
++              goto end;
++      }
++
++      if (!test_bit(_XEN_PCIB_AERHANDLER,
++              (unsigned long *)&psdev->pdev->sh_info->flags)) {
++              dev_err(&dev->dev,
++                      "guest with no AER driver should have been killed\n");
++              goto end;
++      }
++      result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
++
++      if (result == PCI_ERS_RESULT_NONE ||
++              result == PCI_ERS_RESULT_DISCONNECT) {
++              dev_dbg(&dev->dev,
++                      "No AER mmio_enabled service or disconnected!\n");
++              kill_domain_by_device(psdev);
++      }
++end:
++      /* psdev can be non-NULL even when it has no pdev assigned, so the
++       * lookup reference must be dropped on every path that found it.
++       */
++      if (psdev)
++              pcistub_device_put(psdev);
++      up_write(&pcistub_sem);
++      return result;
++}
++
++/*
++ * pciback_error_detected: send the error_detected request to pcifront, in
++ * case the guest's device driver provides this service, and wait for
++ * pcifront's ack.
++ * @dev: pointer to the PCI device
++ * @error: the current PCI connection state
++ *
++ * Returns PCI_ERS_RESULT_CAN_RECOVER unless common_process() is reached,
++ * in which case its result is returned. The return value is used by the
++ * aer_core do_recovery policy.
++ */
++static pci_ers_result_t pciback_error_detected(struct pci_dev *dev,
++      pci_channel_state_t error)
++{
++      struct pcistub_device *psdev;
++      pci_ers_result_t result;
++
++      result = PCI_ERS_RESULT_CAN_RECOVER;
++      dev_dbg(&dev->dev, "pciback_error_detected(bus:%x,devfn:%x)\n",
++              dev->bus->number, dev->devfn);
++
++      down_write(&pcistub_sem);
++      /* a successful lookup takes a reference, dropped at "end" */
++      psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++                              dev->bus->number,
++                              PCI_SLOT(dev->devfn),
++                              PCI_FUNC(dev->devfn));
++
++      if (!psdev || !psdev->pdev) {
++              dev_err(&dev->dev,
++                      "pciback device is not found/assigned\n");
++              goto end;
++      }
++
++      if (!psdev->pdev->sh_info) {
++              dev_err(&dev->dev, "pciback device is not connected or owned"
++                      " by HVM, kill it\n");
++              kill_domain_by_device(psdev);
++              goto end;
++      }
++
++      /* Guest owns the device yet no AER handler is registered: kill guest */
++      if (!test_bit(_XEN_PCIB_AERHANDLER,
++              (unsigned long *)&psdev->pdev->sh_info->flags)) {
++              dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
++              kill_domain_by_device(psdev);
++              goto end;
++      }
++      result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
++
++      if (result == PCI_ERS_RESULT_NONE ||
++              result == PCI_ERS_RESULT_DISCONNECT) {
++              dev_dbg(&dev->dev,
++                      "No AER error_detected service or disconnected!\n");
++              kill_domain_by_device(psdev);
++      }
++end:
++      /* psdev can be non-NULL even when it has no pdev assigned, so the
++       * lookup reference must be dropped on every path that found it.
++       */
++      if (psdev)
++              pcistub_device_put(psdev);
++      up_write(&pcistub_sem);
++      return result;
++}
++
++/*
++ * pciback_error_resume: send the error_resume request to pcifront, in case
++ * the guest's device driver provides this service, and wait for pcifront's
++ * ack.
++ * @dev: pointer to the PCI device
++ */
++static void pciback_error_resume(struct pci_dev *dev)
++{
++      struct pcistub_device *psdev;
++
++      dev_dbg(&dev->dev, "pciback_error_resume(bus:%x,devfn:%x)\n",
++              dev->bus->number, dev->devfn);
++
++      down_write(&pcistub_sem);
++      /* a successful lookup takes a reference, dropped at "end" */
++      psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++                              dev->bus->number,
++                              PCI_SLOT(dev->devfn),
++                              PCI_FUNC(dev->devfn));
++
++      if (!psdev || !psdev->pdev) {
++              dev_err(&dev->dev,
++                      "pciback device is not found/assigned\n");
++              goto end;
++      }
++
++      if (!psdev->pdev->sh_info) {
++              dev_err(&dev->dev, "pciback device is not connected or owned"
++                      " by HVM, kill it\n");
++              kill_domain_by_device(psdev);
++              goto end;
++      }
++
++      if (!test_bit(_XEN_PCIB_AERHANDLER,
++              (unsigned long *)&psdev->pdev->sh_info->flags)) {
++              dev_err(&dev->dev,
++                      "guest with no AER driver should have been killed\n");
++              kill_domain_by_device(psdev);
++              goto end;
++      }
++      common_process(psdev, 1, XEN_PCI_OP_aer_resume, PCI_ERS_RESULT_RECOVERED);
++end:
++      /* psdev can be non-NULL even when it has no pdev assigned, so the
++       * lookup reference must be dropped on every path that found it.
++       */
++      if (psdev)
++              pcistub_device_put(psdev);
++      up_write(&pcistub_sem);
++      return;
++}
++
++/* pciback AER handling: callbacks for each PCI error recovery step, hooked
++ * into the driver through pciback_pci_driver.err_handler
++ */
++static struct pci_error_handlers pciback_error_handler = {
++      .error_detected = pciback_error_detected,
++      .mmio_enabled = pciback_mmio_enabled,
++      .slot_reset = pciback_slot_reset,
++      .resume = pciback_error_resume,
++};
++
++/*
++ * The "pciback" PCI driver: wildcard ID table, stub probe/remove callbacks
++ * and the AER error handlers.
++ *
++ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
++ * for a normal device. I don't want it to be loaded automatically.
++ */
++
++static struct pci_driver pciback_pci_driver = {
++      .name = "pciback",
++      .id_table = pcistub_ids,
++      .probe = pcistub_probe,
++      .remove = pcistub_remove,
++      .err_handler = &pciback_error_handler,
++};
++
++/* Parse a PCI slot specification from @buf. Two hexadecimal forms are
++ * accepted: "domain:bus:slot.func" and "bus:slot.func" (domain is then
++ * set to 0). Returns 0 on success and -EINVAL otherwise.
++ */
++static inline int str_to_slot(const char *buf, int *domain, int *bus,
++                            int *slot, int *func)
++{
++      int nfields = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
++
++      if (nfields == 4)
++              return 0;
++      if (nfields < 0)
++              return -EINVAL;
++
++      /* try again without domain */
++      *domain = 0;
++      nfields = sscanf(buf, " %x:%x.%x", bus, slot, func);
++
++      return (nfields == 3) ? 0 : -EINVAL;
++}
++
++/* Parse a quirk specification of the hexadecimal form
++ * "domain:bus:slot.func-reg:size:mask" from @buf. Returns 0 only when all
++ * seven fields were converted, -EINVAL otherwise.
++ */
++static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
++                             *slot, int *func, int *reg, int *size, int *mask)
++{
++      int nfields;
++
++      nfields = sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x",
++                       domain, bus, slot, func, reg, size, mask);
++
++      return (nfields == 7) ? 0 : -EINVAL;
++}
++
++/* Append domain:bus:slot.func to the list of slots pciback wants to seize.
++ * Returns 0 on success or -ENOMEM if the list entry cannot be allocated.
++ */
++static int pcistub_device_id_add(int domain, int bus, int slot, int func)
++{
++      struct pcistub_device_id *id;
++      unsigned long flags;
++
++      id = kmalloc(sizeof(*id), GFP_KERNEL);
++      if (id == NULL)
++              return -ENOMEM;
++
++      id->devfn = PCI_DEVFN(slot, func);
++      id->bus = bus;
++      id->domain = domain;
++
++      pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
++               domain, bus, slot, func);
++
++      spin_lock_irqsave(&device_ids_lock, flags);
++      list_add_tail(&id->slot_list, &pcistub_device_ids);
++      spin_unlock_irqrestore(&device_ids_lock, flags);
++
++      return 0;
++}
++
++/* Remove every entry for domain:bus:slot.func from the list of slots to
++ * seize. Returns 0 if at least one entry was removed, -ENOENT otherwise.
++ */
++static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
++{
++      struct pcistub_device_id *id, *next;
++      int devfn = PCI_DEVFN(slot, func);
++      int err = -ENOENT;
++      unsigned long flags;
++
++      spin_lock_irqsave(&device_ids_lock, flags);
++      list_for_each_entry_safe(id, next, &pcistub_device_ids, slot_list) {
++              if (id->domain != domain || id->bus != bus
++                  || id->devfn != devfn)
++                      continue;
++
++              /* Keep scanning after a hit: the same slot could be in
++               * the list more than once
++               */
++              list_del(&id->slot_list);
++              kfree(id);
++              err = 0;
++
++              pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
++                       "seize list\n", domain, bus, slot, func);
++      }
++      spin_unlock_irqrestore(&device_ids_lock, flags);
++
++      return err;
++}
++
++/* Register a quirk config-space field (offset @reg, width @size, mask
++ * @mask) for the stub device at domain:bus:slot.func.
++ *
++ * Returns 0 on success, -ENODEV if no such stub device exists, -ENOMEM if
++ * the field cannot be allocated, or the error returned by
++ * pciback_config_quirks_add_field().
++ */
++static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
++                         int size, int mask)
++{
++      int err = 0;
++      struct pcistub_device *psdev;
++      struct pci_dev *dev;
++      struct config_field *field;
++
++      /* a successful lookup takes a reference, dropped at "out" */
++      psdev = pcistub_device_find(domain, bus, slot, func);
++      if (!psdev || !psdev->dev) {
++              err = -ENODEV;
++              goto out;
++      }
++      dev = psdev->dev;
++
++      field = kzalloc(sizeof(*field), GFP_ATOMIC);
++      if (!field) {
++              err = -ENOMEM;
++              goto out;
++      }
++
++      field->offset = reg;
++      field->size = size;
++      field->mask = mask;
++      field->init = NULL;
++      field->reset = NULL;
++      field->release = NULL;
++      field->clean = pciback_config_field_free;
++
++      err = pciback_config_quirks_add_field(dev, field);
++      if (err)
++              kfree(field);
++      out:
++      /* pcistub_device_find() returned a referenced device; without this
++       * put every call would leak one reference on it.
++       */
++      if (psdev)
++              pcistub_device_put(psdev);
++      return err;
++}
++
++/*
++ * sysfs store handler for "new_slot": parse buf with str_to_slot() and add
++ * the resulting slot to the list of device IDs to seize.
++ *
++ * Returns count on success, otherwise the error from parsing or adding.
++ */
++static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
++                              size_t count)
++{
++      int domain, bus, slot, func;
++      int rc;
++
++      rc = str_to_slot(buf, &domain, &bus, &slot, &func);
++      if (rc == 0)
++              rc = pcistub_device_id_add(domain, bus, slot, func);
++      if (rc == 0)
++              rc = count;
++
++      return rc;
++}
++static DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
++
++/*
++ * sysfs store handler for "remove_slot": parse buf with str_to_slot() and
++ * remove the matching entries from the list of device IDs to seize.
++ *
++ * Returns count on success, otherwise the error from parsing or removal.
++ */
++static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
++                                 size_t count)
++{
++      int domain, bus, slot, func;
++      int rc;
++
++      rc = str_to_slot(buf, &domain, &bus, &slot, &func);
++      if (rc == 0)
++              rc = pcistub_device_id_remove(domain, bus, slot, func);
++      if (rc == 0)
++              rc = count;
++
++      return rc;
++}
++static DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
++
++/*
++ * sysfs show handler for "slots": print one "domain:bus:slot.func" line per
++ * entry of the seize list, stopping once PAGE_SIZE bytes have been written.
++ *
++ * Returns the number of bytes placed in buf.
++ */
++static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
++{
++      struct pcistub_device_id *entry;
++      unsigned long irq_flags;
++      size_t len = 0;
++
++      spin_lock_irqsave(&device_ids_lock, irq_flags);
++      list_for_each_entry(entry, &pcistub_device_ids, slot_list) {
++              /* Never write past the single page sysfs hands us. */
++              if (len >= PAGE_SIZE)
++                      break;
++
++              len += scnprintf(buf + len, PAGE_SIZE - len,
++                               "%04x:%02x:%02x.%01x\n",
++                               entry->domain, entry->bus,
++                               PCI_SLOT(entry->devfn),
++                               PCI_FUNC(entry->devfn));
++      }
++      spin_unlock_irqrestore(&device_ids_lock, irq_flags);
++
++      return len;
++}
++static DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
++
++/*
++ * sysfs store handler for "quirks": parse buf with str_to_quirk() and
++ * register the described config field through pcistub_reg_add().
++ *
++ * Returns count on success, otherwise the error from parsing or adding.
++ */
++static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
++                               size_t count)
++{
++      int domain, bus, slot, func, reg, size, mask;
++      int rc;
++
++      rc = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
++                        &mask);
++      if (rc == 0)
++              rc = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
++      if (rc == 0)
++              rc = count;
++
++      return rc;
++}
++
++/*
++ * sysfs show handler for "quirks": for every entry on pciback_quirks print
++ * the device's bus:slot.func and vendor/device/subvendor/subdevice IDs,
++ * followed by one "offset:size:mask" line per config field of that device.
++ * Output stops once PAGE_SIZE bytes have been produced.
++ *
++ * NOTE(review): the result of pci_get_drvdata() is dereferenced without a
++ * NULL check - confirm it can never be NULL for a device on the quirk list.
++ */
++static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
++{
++      extern struct list_head pciback_quirks;
++      struct pciback_config_quirk *quirk;
++      struct pciback_dev_data *dev_data;
++      const struct config_field_entry *entry;
++      const struct config_field *field;
++      unsigned long irq_flags;
++      int len = 0;
++
++      spin_lock_irqsave(&device_ids_lock, irq_flags);
++      list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
++              if (len >= PAGE_SIZE)
++                      break;
++
++              len += scnprintf(buf + len, PAGE_SIZE - len,
++                               "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
++                               quirk->pdev->bus->number,
++                               PCI_SLOT(quirk->pdev->devfn),
++                               PCI_FUNC(quirk->pdev->devfn),
++                               quirk->devid.vendor, quirk->devid.device,
++                               quirk->devid.subvendor,
++                               quirk->devid.subdevice);
++
++              dev_data = pci_get_drvdata(quirk->pdev);
++
++              list_for_each_entry(entry, &dev_data->config_fields, list) {
++                      field = entry->field;
++                      /* Buffer full: abandon both loops. */
++                      if (len >= PAGE_SIZE)
++                              goto unlock;
++
++                      len += scnprintf(buf + len, PAGE_SIZE - len,
++                                       "\t\t%08x:%01x:%08x\n",
++                                       entry->base_offset + field->offset,
++                                       field->size, field->mask);
++              }
++      }
++
++      unlock:
++      spin_unlock_irqrestore(&device_ids_lock, irq_flags);
++
++      return len;
++}
++static DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
++
++/*
++ * sysfs store handler for "permissive": parse a slot from buf and set the
++ * permissive flag in the driver data of the matching stub device, warning
++ * the first time the flag is set.
++ *
++ * Returns count on success, the str_to_slot() error on a parse failure,
++ * -ENODEV if no stub device (or no PCI device behind it) exists, or -ENXIO
++ * if the device has no driver data.
++ */
++static ssize_t permissive_add(struct device_driver *drv, const char *buf,
++                            size_t count)
++{
++      struct pciback_dev_data *dev_data;
++      struct pcistub_device *stub;
++      int domain, bus, slot, func;
++      int rc;
++
++      rc = str_to_slot(buf, &domain, &bus, &slot, &func);
++      if (rc)
++              goto done;
++
++      stub = pcistub_device_find(domain, bus, slot, func);
++      if (stub == NULL) {
++              rc = -ENODEV;
++              goto done;
++      }
++      if (stub->dev == NULL) {
++              rc = -ENODEV;
++              goto put_stub;
++      }
++
++      /* the driver data for a device should never be null at this point */
++      dev_data = pci_get_drvdata(stub->dev);
++      if (dev_data == NULL) {
++              rc = -ENXIO;
++              goto put_stub;
++      }
++
++      if (!dev_data->permissive) {
++              dev_data->permissive = 1;
++              /* Let user know that what they're doing could be unsafe */
++              dev_warn(&stub->dev->dev,
++                       "enabling permissive mode configuration space accesses!\n");
++              dev_warn(&stub->dev->dev,
++                       "permissive mode is potentially unsafe!\n");
++      }
++
++      put_stub:
++      pcistub_device_put(stub);
++      done:
++      if (rc == 0)
++              rc = count;
++      return rc;
++}
++
++/*
++ * sysfs show handler for "permissive": print the PCI name of every stub
++ * device whose driver data has the permissive flag set, one per line,
++ * stopping once PAGE_SIZE bytes have been written.
++ *
++ * Returns the number of bytes placed in buf.
++ */
++static ssize_t permissive_show(struct device_driver *drv, char *buf)
++{
++      struct pciback_dev_data *dev_data;
++      struct pcistub_device *stub;
++      unsigned long irq_flags;
++      size_t len = 0;
++
++      spin_lock_irqsave(&pcistub_devices_lock, irq_flags);
++      list_for_each_entry(stub, &pcistub_devices, dev_list) {
++              if (len >= PAGE_SIZE)
++                      break;
++              if (!stub->dev)
++                      continue;
++
++              dev_data = pci_get_drvdata(stub->dev);
++              if (dev_data && dev_data->permissive)
++                      len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
++                                       pci_name(stub->dev));
++      }
++      spin_unlock_irqrestore(&pcistub_devices_lock, irq_flags);
++
++      return len;
++}
++static DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
++
++#ifdef CONFIG_PCI_MSI
++
++/*
++ * Look up the stub device for dev and return the otherend_id of the xenbus
++ * device belonging to the pciback instance recorded in it, or -1 when there
++ * is no stub device or no pciback instance attached.
++ *
++ * NOTE(review): permissive_add() in this file pairs pcistub_device_find()
++ * with pcistub_device_put(); no put happens here - confirm whether a
++ * reference is being leaked on every call.
++ */
++int pciback_get_owner(struct pci_dev *dev)
++{
++      struct pcistub_device *stub =
++              pcistub_device_find(pci_domain_nr(dev->bus), dev->bus->number,
++                                  PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
++
++      if (stub == NULL || stub->pdev == NULL)
++              return -1;
++
++      return stub->pdev->xdev->otherend_id;
++}
++#endif
++
++/*
++ * Undo pcistub_init(): remove the driver's sysfs attribute files, unregister
++ * the PCI driver and unregister the MSI owner callback.
++ *
++ * Also used as the error path of pcistub_init() and pciback_init().
++ *
++ * NOTE(review): pciback_get_owner is only defined under CONFIG_PCI_MSI in
++ * this file but is referenced here unconditionally - presumably
++ * unregister_msi_get_owner() discards its argument otherwise; confirm.
++ */
++static void pcistub_exit(void)
++{
++      driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
++      driver_remove_file(&pciback_pci_driver.driver,
++                         &driver_attr_remove_slot);
++      driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
++      driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
++      driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
++
++      pci_unregister_driver(&pciback_pci_driver);
++      /* A non-zero return from the unregister call triggers a warning. */
++      WARN_ON(unregister_msi_get_owner(pciback_get_owner));
++}
++
++/*
++ * Parse the pci_devs_to_hide string, queue every listed device on the seize
++ * list, then register the PCI driver, its sysfs attribute files and the MSI
++ * owner callback.
++ *
++ * pci_devs_to_hide is a sequence of "(domain:bus:slot.func)" or
++ * "(bus:slot.func)" items (hex numbers); the short form implies domain 0.
++ *
++ * Returns 0 on success, -EINVAL if the string cannot be parsed, or the
++ * error from the failing registration step.
++ */
++static int __init pcistub_init(void)
++{
++      int pos = 0;
++      int err = 0;
++      int domain, bus, slot, func;
++      int parsed;
++
++      if (pci_devs_to_hide && *pci_devs_to_hide) {
++              do {
++                      /* %n only stores a value when the whole item,
++                       * including the closing ')', matched.
++                       */
++                      parsed = 0;
++
++                      err = sscanf(pci_devs_to_hide + pos,
++                                   " (%x:%x:%x.%x) %n",
++                                   &domain, &bus, &slot, &func, &parsed);
++                      if (err != 4) {
++                              /* Retry with the short form, domain 0 */
++                              domain = 0;
++                              err = sscanf(pci_devs_to_hide + pos,
++                                           " (%x:%x.%x) %n",
++                                           &bus, &slot, &func, &parsed);
++                              if (err != 3)
++                                      goto parse_error;
++                      }
++
++                      err = pcistub_device_id_add(domain, bus, slot, func);
++                      if (err)
++                              goto out;
++
++                      /* if parsed<=0, we've reached the end of the string */
++                      pos += parsed;
++              } while (parsed > 0 && pci_devs_to_hide[pos]);
++      }
++
++      /* If we're the first PCI Device Driver to register, we're the
++       * first one to get offered PCI devices as they become
++       * available (and thus we can be the first to grab them)
++       */
++      err = pci_register_driver(&pciback_pci_driver);
++      if (err < 0)
++              goto out;
++
++      /* Create the attribute files one by one, stopping at the first
++       * failure.
++       */
++      err = driver_create_file(&pciback_pci_driver.driver,
++                               &driver_attr_new_slot);
++      if (!err)
++              err = driver_create_file(&pciback_pci_driver.driver,
++                                       &driver_attr_remove_slot);
++      if (!err)
++              err = driver_create_file(&pciback_pci_driver.driver,
++                                       &driver_attr_slots);
++      if (!err)
++              err = driver_create_file(&pciback_pci_driver.driver,
++                                       &driver_attr_quirks);
++      if (!err)
++              err = driver_create_file(&pciback_pci_driver.driver,
++                                       &driver_attr_permissive);
++
++      if (!err)
++              err = register_msi_get_owner(pciback_get_owner);
++      /* Any failure after driver registration tears everything down,
++       * including files that may not have been created.
++       */
++      if (err)
++              pcistub_exit();
++
++      out:
++      return err;
++
++      parse_error:
++      pr_err("pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
++             pci_devs_to_hide + pos);
++      return -EINVAL;
++}
++
++#ifndef MODULE
++/*
++ * fs_initcall happens before device_initcall
++ * so pciback *should* get called first (b/c we 
++ * want to suck up any device before other drivers
++ * get a chance by being the first pci device
++ * driver to register)
++ */
++fs_initcall(pcistub_init);
++#endif
++
++/*
++ * Module entry point: initialise the config-space layer, (for modular
++ * builds) the PCI stub, then late stub device setup and the xenbus backend.
++ *
++ * Returns 0 on success or the error of the first failing step.
++ */
++static int __init pciback_init(void)
++{
++      int err;
++
++      err = pciback_config_init();
++      if (err)
++              return err;
++
++      /* Built-in kernels run pcistub_init() earlier via fs_initcall. */
++#ifdef MODULE
++      err = pcistub_init();
++      if (err < 0)
++              return err;
++#endif
++
++      pcistub_init_devices_late();
++      err = pciback_xenbus_register();
++      /* Without the xenbus backend the stub is torn down again. */
++      if (err)
++              pcistub_exit();
++
++      return err;
++}
++
++/* Module exit: unregister the xenbus backend, then tear down the PCI stub. */
++static void __exit pciback_cleanup(void)
++{
++      pciback_xenbus_unregister();
++      pcistub_exit();
++}
++
++module_init(pciback_init);
++module_exit(pciback_cleanup);
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/pciback/pciback.h

index 0000000,0000000..45c83c0

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/pciback.h
@@@ -1,0 -1,0 +1,127 @@@
++/*
++ * PCI Backend Common Data Structures & Function Declarations
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#ifndef __XEN_PCIBACK_H__
++#define __XEN_PCIBACK_H__
++
++#include <linux/slab.h>
++#include <linux/pci.h>
++#include <linux/interrupt.h>
++#include <xen/xenbus.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/workqueue.h>
++#include <asm/atomic.h>
++#include <xen/interface/io/pciif.h>
++
++/* List node that carries a pointer to one PCI device. */
++struct pci_dev_entry {
++      struct list_head list;
++      struct pci_dev *dev;
++};
++
++#define _PDEVF_op_active      (0)
++#define PDEVF_op_active       (1<<(_PDEVF_op_active))
++#define _PCIB_op_pending      (1)
++#define PCIB_op_pending               (1<<(_PCIB_op_pending))
++
++/* Per-frontend state of one PCI backend instance. */
++struct pciback_device {
++      /* Private data of the virtual bus implementation (slot.c, vpci.c) */
++      void *pci_dev_data;
++      spinlock_t dev_lock;
++
++      struct xenbus_device *xdev;
++
++      struct xenbus_watch be_watch;
++      u8 be_watching;
++
++      /* IRQ used to notify the other end (see pciback_do_op) */
++      int evtchn_irq;
++
++      struct vm_struct *sh_area;
++      /* Holds the flags and the op exchanged with the frontend */
++      struct xen_pci_sharedinfo *sh_info;
++
++      /* Bit field using _PDEVF_op_active and _PCIB_op_pending */
++      unsigned long flags;
++
++      /* Queued by test_and_schedule_op() to run a frontend request */
++      struct work_struct op_work;
++};
++
++/* Driver data attached to each PCI device handled by pciback. */
++struct pciback_dev_data {
++      /* List of config field entries for this device */
++      struct list_head config_fields;
++      /* Non-zero once permissive mode was enabled through sysfs */
++      int permissive;
++      /* presumably set after the first write warning - TODO confirm */
++      int warned_on_write;
++};
++
++/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
++                                          int domain, int bus,
++                                          int slot, int func);
++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
++                                  struct pci_dev *dev);
++void pcistub_put_pci_dev(struct pci_dev *dev);
++
++/* Ensure a device is turned off or reset */
++void pciback_reset_device(struct pci_dev *pdev);
++
++/* Access a virtual configuration space for a PCI device */
++int pciback_config_init(void);
++int pciback_config_init_dev(struct pci_dev *dev);
++void pciback_config_free_dyn_fields(struct pci_dev *dev);
++void pciback_config_reset_dev(struct pci_dev *dev);
++void pciback_config_free_dev(struct pci_dev *dev);
++int pciback_config_read(struct pci_dev *dev, int offset, int size,
++                      u32 * ret_val);
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
++
++/* Handle requests for specific devices from the frontend */
++typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev,
++                                 unsigned int domain, unsigned int bus,
++                                 unsigned int devfn, unsigned int devid);
++typedef int (*publish_pci_root_cb) (struct pciback_device * pdev,
++                                  unsigned int domain, unsigned int bus);
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++                      int devid, publish_pci_dev_cb publish_cb);
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++                                  unsigned int domain, unsigned int bus,
++                                  unsigned int devfn);
++
++/**
++* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in pciback
++* before sending aer request to pcifront, so that guest could identify
++* device, cooperate with pciback to finish aer recovery job if device driver
++* has the capability
++*/
++
++int pciback_get_pcifront_dev(struct pci_dev *pcidev, struct pciback_device *pdev, 
++                              unsigned int *domain, unsigned int *bus, unsigned int *devfn);
++int pciback_init_devices(struct pciback_device *pdev);
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++                            publish_pci_root_cb cb);
++void pciback_release_devices(struct pciback_device *pdev);
++
++/* Handles events from front-end */
++irqreturn_t pciback_handle_event(int irq, void *dev_id);
++void pciback_do_op(struct work_struct *work);
++
++int pciback_xenbus_register(void);
++void pciback_xenbus_unregister(void);
++
++#ifdef CONFIG_PCI_MSI
++int pciback_enable_msi(struct pciback_device *pdev,
++                       struct pci_dev *dev, struct xen_pci_op *op);
++
++int pciback_disable_msi(struct pciback_device *pdev,
++                         struct pci_dev *dev, struct xen_pci_op *op);
++
++
++int pciback_enable_msix(struct pciback_device *pdev,
++                        struct pci_dev *dev, struct xen_pci_op *op);
++
++int pciback_disable_msix(struct pciback_device *pdev,
++                        struct pci_dev *dev, struct xen_pci_op *op);
++#endif
++extern int verbose_request;
++
++void test_and_schedule_op(struct pciback_device *pdev);
++#endif
++
diff --cc drivers/xen/pciback/pciback_ops.c

index 0000000,0000000..9e2ee04

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/pciback_ops.c
@@@ -1,0 -1,0 +1,142 @@@
++/*
++ * PCI Backend Operations - respond to PCI requests from Frontend
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/wait.h>
++#include <asm/bitops.h>
++#include <xen/evtchn.h>
++#include "pciback.h"
++
++int verbose_request = 0;
++module_param(verbose_request, int, 0644);
++
++/* Ensure a device is "turned off" and ready to be exported.
++ * (Also see pciback_config_reset_dev to ensure virtual configuration space
++ * is ready to be re-exported)
++ *
++ * Devices with a normal header get MSI/MSI-X disabled, are disabled and
++ * have their command register cleared; for any other header type only the
++ * PCI_COMMAND_INVALIDATE bit is cleared.
++ */
++void pciback_reset_device(struct pci_dev *dev)
++{
++      u16 cmd;
++
++      /* Disable devices (but not bridges) */
++      if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
++#ifdef CONFIG_PCI_MSI
++              /* The guest could have been abruptly killed without
++               * disabling MSI/MSI-X interrupts.*/
++              if (dev->msix_enabled)
++                      pci_disable_msix(dev);
++              if (dev->msi_enabled)
++                      pci_disable_msi(dev);
++#endif
++              pci_disable_device(dev);
++
++              pci_write_config_word(dev, PCI_COMMAND, 0);
++
++              /* Force the enable count back to zero regardless of how
++               * many enables were outstanding.
++               */
++              atomic_set(&dev->enable_cnt, 0);
++              dev->is_busmaster = 0;
++      } else {
++              pci_read_config_word(dev, PCI_COMMAND, &cmd);
++              if (cmd & (PCI_COMMAND_INVALIDATE)) {
++                      cmd &= ~(PCI_COMMAND_INVALIDATE);
++                      pci_write_config_word(dev, PCI_COMMAND, cmd);
++
++                      dev->is_busmaster = 0;
++              }
++      }
++}
++extern wait_queue_head_t aer_wait_queue;
++extern struct workqueue_struct *pciback_wq;
++/*
++ * The same evtchn carries both pcifront conf_read_write requests and the
++ * PCIe AER front-end ack. A dedicated workqueue is used to schedule the
++ * pciback conf_read_write service, avoiding conflict with the aer_core
++ * do_recovery job, which also uses the system default workqueue.
++ */
++void test_and_schedule_op(struct pciback_device *pdev)
++{
++      unsigned long *shared_flags = (unsigned long *)&pdev->sh_info->flags;
++
++      /* Queue the op only when the frontend is requesting one and we are
++       * not already processing a request.
++       */
++      if (test_bit(_XEN_PCIF_active, shared_flags)
++          && !test_and_set_bit(_PDEVF_op_active, &pdev->flags))
++              queue_work(pciback_wq, &pdev->op_work);
++
++      /* _XEN_PCIB_active should have been cleared by pcifront; also make
++       * sure pciback is actually waiting for the ack (_PCIB_op_pending).
++       */
++      if (!test_bit(_XEN_PCIB_active, shared_flags)
++          && test_bit(_PCIB_op_pending, &pdev->flags))
++              wake_up(&aer_wait_queue);
++}
++
++/* Performing the configuration space reads/writes must not be done in atomic
++ * context because some of the pci_* functions can sleep (mostly due to ACPI
++ * use of semaphores). This function is intended to be called from a work
++ * queue in process context taking a struct pciback_device as a parameter */
++void pciback_do_op(struct work_struct *work)
++{
++      struct pciback_device *pdev = container_of(work, struct pciback_device, op_work);
++      struct pci_dev *dev;
++      struct xen_pci_op *op = &pdev->sh_info->op;
++
++      dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
++
++      if (dev == NULL)
++              op->err = XEN_PCI_ERR_dev_not_found;
++      else
++      {
++              switch (op->cmd)
++              {
++                      case XEN_PCI_OP_conf_read:
++                              op->err = pciback_config_read(dev,
++                                        op->offset, op->size, &op->value);
++                              break;
++                      case XEN_PCI_OP_conf_write:
++                              op->err = pciback_config_write(dev,
++                                        op->offset, op->size, op->value);
++                              break;
++#ifdef CONFIG_PCI_MSI
++                      case XEN_PCI_OP_enable_msi:
++                              op->err = pciback_enable_msi(pdev, dev, op);
++                              break;
++                      case XEN_PCI_OP_disable_msi:
++                              op->err = pciback_disable_msi(pdev, dev, op);
++                              break;
++                      case XEN_PCI_OP_enable_msix:
++                              op->err = pciback_enable_msix(pdev, dev, op);
++                              break;
++                      case XEN_PCI_OP_disable_msix:
++                              op->err = pciback_disable_msix(pdev, dev, op);
++                              break;
++#endif
++                      default:
++                              op->err = XEN_PCI_ERR_not_implemented;
++                              break;
++              }
++      }
++      /* Tell the driver domain that we're done. */ 
++      wmb();
++      clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
++      notify_remote_via_irq(pdev->evtchn_irq);
++
++      /* Mark that we're done. */
++      smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
++      clear_bit(_PDEVF_op_active, &pdev->flags);
++      smp_mb__after_clear_bit(); /* /before/ final check for work */
++
++      /* Check to see if the driver domain tried to start another request in
++       * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. 
++      */
++      test_and_schedule_op(pdev);
++}
++
++/*
++ * Interrupt handler for the frontend event channel: dev_id is the
++ * struct pciback_device; schedule any pending operation for it.
++ */
++irqreturn_t pciback_handle_event(int irq, void *dev_id)
++{
++      test_and_schedule_op((struct pciback_device *)dev_id);
++      return IRQ_HANDLED;
++}
diff --cc drivers/xen/pciback/slot.c

index 0000000,0000000..0e46f90

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/slot.c
@@@ -1,0 -1,0 +1,183 @@@
++/*
++ * PCI Backend - Provides a Virtual PCI bus (with real devices)
++ *               to the frontend
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
++ *   Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
++ */
++
++#include "pciback.h"
++
++/* There are at most 32 slots in a pci bus.  */
++#define PCI_SLOT_MAX 32
++
++#define PCI_BUS_NBR 2
++
++/* Per-backend table of exported devices, indexed by virtual bus and slot. */
++struct slot_dev_data {
++      /* Access to slots must be protected by lock */
++      struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
++      spinlock_t lock;
++};
++
++/*
++ * Translate a virtual domain/bus/devfn into the real PCI device stored in
++ * that virtual slot.
++ *
++ * Returns NULL for a non-zero domain or function, for an out-of-range bus
++ * or slot, or when the slot is empty.
++ */
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++                                  unsigned int domain, unsigned int bus,
++                                  unsigned int devfn)
++{
++      struct slot_dev_data *data = pdev->pci_dev_data;
++      struct pci_dev *found;
++      unsigned long irq_flags;
++
++      if (domain != 0 || PCI_FUNC(devfn) != 0)
++              return NULL;
++      if (bus >= PCI_BUS_NBR || PCI_SLOT(devfn) >= PCI_SLOT_MAX)
++              return NULL;
++
++      spin_lock_irqsave(&data->lock, irq_flags);
++      found = data->slots[bus][PCI_SLOT(devfn)];
++      spin_unlock_irqrestore(&data->lock, irq_flags);
++
++      return found;
++}
++
++/*
++ * Place dev in the first free slot of the virtual PCI buses and publish it
++ * to the frontend through publish_cb.
++ *
++ * Bridges are refused with -EFAULT; -ENOMEM is returned when every slot on
++ * every virtual bus is taken. Otherwise the result of publish_cb is
++ * returned.
++ */
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++                      int devid, publish_pci_dev_cb publish_cb)
++{
++      int err = 0, slot, bus;
++      struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++      unsigned long flags;
++
++      if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
++              err = -EFAULT;
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Can't export bridges on the virtual PCI bus");
++              goto out;
++      }
++
++      spin_lock_irqsave(&slot_dev->lock, flags);
++
++      /* Assign to a new slot on the virtual PCI bus */
++      for (bus = 0; bus < PCI_BUS_NBR; bus++)
++              for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++                      if (slot_dev->slots[bus][slot] == NULL) {
++                              pr_info("pciback: slot: %s: assign to"
++                                      " virtual slot %d, bus %d\n",
++                                      pci_name(dev), slot, bus);
++                              slot_dev->slots[bus][slot] = dev;
++                              goto unlock;
++                      }
++              }
++
++      err = -ENOMEM;
++      xenbus_dev_fatal(pdev->xdev, err,
++                       "No more space on root virtual PCI bus");
++
++      unlock:
++      spin_unlock_irqrestore(&slot_dev->lock, flags);
++
++      /* Publish this device. bus and slot still hold the position chosen
++       * by the search above; the bus must be passed on (not hard-coded to
++       * 0) so the published address matches what pciback_get_pci_dev()
++       * and pciback_get_pcifront_dev() use for devices on bus 1.
++       */
++      if(!err)
++              err = publish_cb(pdev, 0, bus, PCI_DEVFN(slot, 0), devid);
++
++      out:
++      return err;
++}
++
++/*
++ * Remove dev from the first virtual slot that holds it and hand it back to
++ * the stub with pcistub_put_pci_dev(). Nothing happens if dev is not
++ * exported.
++ */
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++      struct slot_dev_data *data = pdev->pci_dev_data;
++      struct pci_dev *released = NULL;
++      unsigned long irq_flags;
++      int b, s;
++
++      spin_lock_irqsave(&data->lock, irq_flags);
++      for (b = 0; b < PCI_BUS_NBR; b++) {
++              for (s = 0; s < PCI_SLOT_MAX; s++) {
++                      if (data->slots[b][s] != dev)
++                              continue;
++
++                      data->slots[b][s] = NULL;
++                      released = dev;
++                      goto unlock;
++              }
++      }
++
++      unlock:
++      spin_unlock_irqrestore(&data->lock, irq_flags);
++
++      /* The put is done only after the lock has been dropped. */
++      if (released)
++              pcistub_put_pci_dev(released);
++}
++
++/*
++ * Allocate the slot table for pdev with every virtual slot empty and store
++ * it in pdev->pci_dev_data.
++ *
++ * Returns 0 on success or -ENOMEM.
++ */
++int pciback_init_devices(struct pciback_device *pdev)
++{
++      struct slot_dev_data *data;
++      int b, s;
++
++      data = kmalloc(sizeof(*data), GFP_KERNEL);
++      if (data == NULL)
++              return -ENOMEM;
++
++      /* kmalloc does not zero memory: mark every slot as free. */
++      for (b = 0; b < PCI_BUS_NBR; b++) {
++              for (s = 0; s < PCI_SLOT_MAX; s++)
++                      data->slots[b][s] = NULL;
++      }
++      spin_lock_init(&data->lock);
++
++      pdev->pci_dev_data = data;
++
++      return 0;
++}
++
++/*
++ * Publish the roots of the virtual PCI topology through publish_cb and
++ * return the callback's result.
++ */
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++                            publish_pci_root_cb publish_cb)
++{
++      /* The Virtual PCI bus has only one root */
++      return publish_cb(pdev, 0, 0);
++}
++
++/*
++ * Hand every exported device back to the stub, free the slot table and
++ * clear pdev->pci_dev_data. The table lock is not taken here.
++ */
++void pciback_release_devices(struct pciback_device *pdev)
++{
++      struct slot_dev_data *data = pdev->pci_dev_data;
++      int b, s;
++
++      for (b = 0; b < PCI_BUS_NBR; b++) {
++              for (s = 0; s < PCI_SLOT_MAX; s++) {
++                      struct pci_dev *occupant = data->slots[b][s];
++
++                      if (occupant)
++                              pcistub_put_pci_dev(occupant);
++              }
++      }
++
++      kfree(data);
++      pdev->pci_dev_data = NULL;
++}
++
++/*
++ * Find the virtual location under which the real device pcidev is exported
++ * to the frontend.
++ *
++ * On a match *domain, *bus and *devfn receive the virtual address (domain
++ * 0, the virtual bus index, function 0 of the virtual slot) and 1 is
++ * returned; otherwise the outputs are left untouched and 0 is returned.
++ */
++int pciback_get_pcifront_dev(struct pci_dev *pcidev, struct pciback_device *pdev, 
++              unsigned int *domain, unsigned int *bus, unsigned int *devfn)
++{
++      int slot, busnr;
++      struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++      struct pci_dev *dev;
++      int found = 0;
++      unsigned long flags;
++
++      spin_lock_irqsave(&slot_dev->lock, flags);
++
++      /* Step the loop counter busnr, not the bus output pointer:
++       * advancing the pointer never ends the loop and makes the store
++       * below hit the wrong address.
++       */
++      for (busnr = 0; busnr < PCI_BUS_NBR; busnr++)
++              for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++                      dev = slot_dev->slots[busnr][slot];
++                      if (dev && dev->bus->number == pcidev->bus->number
++                              && dev->devfn == pcidev->devfn
++                              && pci_domain_nr(dev->bus) == pci_domain_nr(pcidev->bus)) {
++                              found = 1;
++                              *domain = 0;
++                              *bus = busnr;
++                              *devfn = PCI_DEVFN(slot,0);
++                              goto out;
++                      }
++              }
++out:
++      spin_unlock_irqrestore(&slot_dev->lock, flags);
++      return found;
++
++}
diff --cc drivers/xen/pciback/vpci.c

index 0000000,0000000..7a855b3

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/vpci.c
@@@ -1,0 -1,0 +1,238 @@@
++/*
++ * PCI Backend - Provides a Virtual PCI bus (with real devices)
++ *               to the frontend
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include "pciback.h"
++
++/* Number of slots on the virtual bus; also the size of dev_list[] below */
++#define PCI_SLOT_MAX 32
++
++/*
++ * Per-backend state of the virtual PCI bus: one list of exported devices
++ * (struct pci_dev_entry nodes) for each virtual slot.
++ */
++struct vpci_dev_data {
++      /* Access to dev_list must be protected by lock */
++      struct list_head dev_list[PCI_SLOT_MAX];
++      spinlock_t lock;
++};
++
++/* Return the node that follows head (the first entry of a non-empty list). */
++static inline struct list_head *list_first(struct list_head *head)
++{
++      struct list_head *first = head->next;
++
++      return first;
++}
++
++/*
++ * Resolve a virtual (domain, bus, devfn) address to the exported device.
++ *
++ * Only domain 0 / bus 0 exist on the virtual bus; anything else yields NULL.
++ * The slot's list is searched under vpci lock for an entry whose real
++ * function number equals PCI_FUNC(devfn). Returns the device or NULL.
++ */
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++                                  unsigned int domain, unsigned int bus,
++                                  unsigned int devfn)
++{
++      struct vpci_dev_data *vbus = pdev->pci_dev_data;
++      struct pci_dev_entry *pos;
++      struct pci_dev *match = NULL;
++      unsigned long irqflags;
++
++      if (domain != 0 || bus != 0)
++              return NULL;
++
++      if (!(PCI_SLOT(devfn) < PCI_SLOT_MAX))
++              return NULL;
++
++      spin_lock_irqsave(&vbus->lock, irqflags);
++
++      list_for_each_entry(pos, &vbus->dev_list[PCI_SLOT(devfn)], list) {
++              if (PCI_FUNC(pos->dev->devfn) != PCI_FUNC(devfn))
++                      continue;
++              match = pos->dev;
++              break;
++      }
++
++      spin_unlock_irqrestore(&vbus->lock, irqflags);
++
++      return match;
++}
++
++/*
++ * Return 1 when l and r sit in the same physical slot: same PCI domain,
++ * same bus object and same slot number; 0 otherwise.
++ */
++static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
++{
++      if (pci_domain_nr(l->bus) != pci_domain_nr(r->bus))
++              return 0;
++      if (l->bus != r->bus)
++              return 0;
++
++      return PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn);
++}
++
++/*
++ * Place a device on the virtual PCI bus and publish its virtual address.
++ *
++ * Bridges are rejected with -EFAULT. A function of a device that already
++ * has a sibling on the virtual bus joins that sibling's slot; otherwise the
++ * first empty slot is used. The function number is always the device's real
++ * one. On success publish_cb() is invoked with the virtual devfn and devid
++ * and its result is returned.
++ *
++ * Returns 0 on success, -ENOMEM if the list entry cannot be allocated or no
++ * slot is free, -EFAULT for bridges, or whatever publish_cb() returns.
++ */
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++                      int devid, publish_pci_dev_cb publish_cb)
++{
++      int err = 0, slot, func = 0;
++      struct pci_dev_entry *t, *dev_entry;
++      struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++      unsigned long flags;
++
++      if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
++              err = -EFAULT;
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Can't export bridges on the virtual PCI bus");
++              goto out;
++      }
++
++      dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
++      if (!dev_entry) {
++              err = -ENOMEM;
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error adding entry to virtual PCI bus");
++              goto out;
++      }
++
++      dev_entry->dev = dev;
++
++      spin_lock_irqsave(&vpci_dev->lock, flags);
++
++      /* Keep multi-function devices together on the virtual PCI bus */
++      for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++              if (!list_empty(&vpci_dev->dev_list[slot])) {
++                      t = list_entry(list_first(&vpci_dev->dev_list[slot]),
++                                     struct pci_dev_entry, list);
++
++                      if (match_slot(dev, t->dev)) {
++                              pr_info("pciback: vpci: %s: "
++                                      "assign to virtual slot %d func %d\n",
++                                      pci_name(dev), slot,
++                                      PCI_FUNC(dev->devfn));
++                              list_add_tail(&dev_entry->list,
++                                            &vpci_dev->dev_list[slot]);
++                              func = PCI_FUNC(dev->devfn);
++                              goto unlock;
++                      }
++              }
++      }
++
++      /* Assign to a new slot on the virtual PCI bus */
++      for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++              if (list_empty(&vpci_dev->dev_list[slot])) {
++                      pr_info("pciback: vpci: %s:"
++                              " assign to virtual slot %d\n",
++                              pci_name(dev), slot);
++                      list_add_tail(&dev_entry->list,
++                                    &vpci_dev->dev_list[slot]);
++                      func = PCI_FUNC(dev->devfn);
++                      goto unlock;
++              }
++      }
++
++      err = -ENOMEM;
++      xenbus_dev_fatal(pdev->xdev, err,
++                       "No more space on root virtual PCI bus");
++
++      unlock:
++      spin_unlock_irqrestore(&vpci_dev->lock, flags);
++
++      /* Publish this device. */
++      if (!err)
++              err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
++      else
++              /* Entry was never linked into a slot list: don't leak it */
++              kfree(dev_entry);
++
++      out:
++      return err;
++}
++
++/*
++ * Remove dev from the virtual bus. The first list entry that refers to dev
++ * is unlinked and freed under the vpci lock; the device itself is handed
++ * to pcistub_put_pci_dev() after the lock has been dropped.
++ */
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++      struct vpci_dev_data *vbus = pdev->pci_dev_data;
++      struct pci_dev *released = NULL;
++      unsigned long irqflags;
++      int vslot;
++      int done = 0;
++
++      spin_lock_irqsave(&vbus->lock, irqflags);
++
++      for (vslot = 0; vslot < PCI_SLOT_MAX && !done; vslot++) {
++              struct pci_dev_entry *pos, *next;
++
++              list_for_each_entry_safe(pos, next, &vbus->dev_list[vslot],
++                                       list) {
++                      if (pos->dev != dev)
++                              continue;
++
++                      list_del(&pos->list);
++                      released = pos->dev;
++                      kfree(pos);
++                      done = 1;
++                      break;
++              }
++      }
++
++      spin_unlock_irqrestore(&vbus->lock, irqflags);
++
++      if (released)
++              pcistub_put_pci_dev(released);
++}
++
++/*
++ * Allocate the virtual-bus state for pdev, initialise its lock and every
++ * per-slot list, and store it in pdev->pci_dev_data.
++ * Returns 0, or -ENOMEM when the allocation fails.
++ */
++int pciback_init_devices(struct pciback_device *pdev)
++{
++      struct vpci_dev_data *vbus;
++      int i;
++
++      vbus = kmalloc(sizeof(*vbus), GFP_KERNEL);
++      if (vbus == NULL)
++              return -ENOMEM;
++
++      spin_lock_init(&vbus->lock);
++
++      for (i = 0; i < PCI_SLOT_MAX; i++)
++              INIT_LIST_HEAD(&vbus->dev_list[i]);
++
++      pdev->pci_dev_data = vbus;
++
++      return 0;
++}
++
++/*
++ * Publish the roots of the virtual bus through publish_cb. There is exactly
++ * one: domain 0, bus 0. Returns the callback's result.
++ */
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++                            publish_pci_root_cb publish_cb)
++{
++      int rc;
++
++      rc = publish_cb(pdev, 0, 0);
++
++      return rc;
++}
++
++/*
++ * Empty every slot list: each entry is unlinked, its device handed to
++ * pcistub_put_pci_dev() and the entry freed. Finally the virtual-bus state
++ * itself is freed and pdev->pci_dev_data cleared. No lock is taken here.
++ */
++void pciback_release_devices(struct pciback_device *pdev)
++{
++      struct vpci_dev_data *vbus = pdev->pci_dev_data;
++      int i;
++
++      for (i = 0; i < PCI_SLOT_MAX; i++) {
++              struct list_head *head = &vbus->dev_list[i];
++
++              while (!list_empty(head)) {
++                      struct pci_dev_entry *victim =
++                              list_entry(head->next, struct pci_dev_entry,
++                                         list);
++
++                      list_del(&victim->list);
++                      pcistub_put_pci_dev(victim->dev);
++                      kfree(victim);
++              }
++      }
++
++      kfree(vbus);
++      pdev->pci_dev_data = NULL;
++}
++
++/*
++ * Look up the virtual address under which pcidev is exported.
++ *
++ * Every slot list is scanned under the vpci lock for an entry with the same
++ * domain, bus number and devfn as pcidev. On a match *domain and *bus are
++ * set to 0 and *devfn to the virtual slot combined with pcidev's function
++ * number. Returns 1 if a match was seen, 0 otherwise.
++ */
++int pciback_get_pcifront_dev(struct pci_dev *pcidev, struct pciback_device *pdev,
++              unsigned int *domain, unsigned int *bus, unsigned int *devfn)
++{
++      struct vpci_dev_data *vbus = pdev->pci_dev_data;
++      struct pci_dev_entry *pos;
++      unsigned long irqflags;
++      int vslot;
++      int hit = 0;
++
++      spin_lock_irqsave(&vbus->lock, irqflags);
++
++      for (vslot = 0; vslot < PCI_SLOT_MAX; vslot++) {
++              list_for_each_entry(pos, &vbus->dev_list[vslot], list) {
++                      struct pci_dev *cand = pos->dev;
++
++                      if (!cand)
++                              continue;
++                      if (cand->bus->number != pcidev->bus->number)
++                              continue;
++                      if (pci_domain_nr(cand->bus) != pci_domain_nr(pcidev->bus))
++                              continue;
++                      if (cand->devfn != pcidev->devfn)
++                              continue;
++
++                      /* The scan keeps going; a later match would overwrite */
++                      hit = 1;
++                      *domain = 0;
++                      *bus = 0;
++                      *devfn = PCI_DEVFN(vslot, PCI_FUNC(pcidev->devfn));
++              }
++      }
++
++      spin_unlock_irqrestore(&vbus->lock, irqflags);
++
++      return hit;
++}
diff --cc drivers/xen/pciback/xenbus.c

index 0000000,0000000..ccc6c80

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pciback/xenbus.c
@@@ -1,0 -1,0 +1,708 @@@
++/*
++ * PCI Backend Xenbus Setup - handles setup with frontend and xend
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/list.h>
++#include <linux/vmalloc.h>
++#include <xen/xenbus.h>
++#include <xen/evtchn.h>
++#include "pciback.h"
++
++/* Value held in pdev->evtchn_irq while no event-channel IRQ is bound */
++#define INVALID_EVTCHN_IRQ  (-1)
++/* Driver-wide workqueue; created in pciback_xenbus_register() */
++struct workqueue_struct *pciback_wq;
++
++/*
++ * Allocate and initialise the backend state for xdev.
++ *
++ * The new structure is stored as xdev's driver data, its lock and work item
++ * are initialised, the connection fields are reset and the device container
++ * is set up through pciback_init_devices().
++ *
++ * Returns the new pciback_device, or NULL if the allocation or
++ * pciback_init_devices() fails.
++ */
++static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
++{
++      struct pciback_device *pdev;
++
++      pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
++      if (pdev == NULL)
++              goto out;
++      dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
++
++      pdev->xdev = xdev;
++      dev_set_drvdata(&xdev->dev, pdev);
++
++      spin_lock_init(&pdev->dev_lock);
++
++      pdev->sh_area = NULL;
++      pdev->sh_info = NULL;
++      pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
++      pdev->be_watching = 0;
++
++      INIT_WORK(&pdev->op_work, pciback_do_op);
++
++      if (pciback_init_devices(pdev)) {
++              /* Don't leave drvdata pointing at memory about to be freed */
++              dev_set_drvdata(&xdev->dev, NULL);
++              kfree(pdev);
++              pdev = NULL;
++      }
++      out:
++      return pdev;
++}
++
++/*
++ * Tear down the connection to the frontend: unbind the event-channel IRQ if
++ * one is bound, flush pciback_wq, then unmap the shared ring if it is
++ * mapped. All of this runs with pdev->dev_lock held.
++ * NOTE(review): flush_workqueue() may sleep yet is called under a spinlock
++ * here - confirm this is acceptable in this tree.
++ */
++static void pciback_disconnect(struct pciback_device *pdev)
++{
++      spin_lock(&pdev->dev_lock);
++
++      /* Ensure the guest can't trigger our handler before removing devices */
++      if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
++              unbind_from_irqhandler(pdev->evtchn_irq, pdev);
++              pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
++      }
++
++      /* If the driver domain started an op, make sure we complete it
++       * before releasing the shared memory */
++      flush_workqueue(pciback_wq);
++
++      if (pdev->sh_info != NULL) {
++              xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_area);
++              pdev->sh_info = NULL;
++      }
++
++      spin_unlock(&pdev->dev_lock);
++}
++
++/*
++ * Release everything owned by pdev and free it: the backend watch (only if
++ * it was registered), the frontend connection, the exported devices and the
++ * driver-data link on the xenbus device.
++ */
++static void free_pdev(struct pciback_device *pdev)
++{
++      if (pdev->be_watching)
++              unregister_xenbus_watch(&pdev->be_watch);
++
++      pciback_disconnect(pdev);
++
++      pciback_release_devices(pdev);
++
++      /* Clear the back-pointer so nobody can reach the freed structure */
++      dev_set_drvdata(&pdev->xdev->dev, NULL);
++      pdev->xdev = NULL;
++
++      kfree(pdev);
++}
++
++/*
++ * Map the frontend's shared page (gnt_ref) and bind its event channel
++ * (remote_evtchn) to pciback_handle_event().
++ *
++ * On success the mapping is recorded in pdev->sh_area / pdev->sh_info and
++ * the IRQ in pdev->evtchn_irq, and 0 is returned. A failed mapping returns
++ * its error silently; a failed bind is also reported via xenbus_dev_fatal().
++ */
++static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
++                           int remote_evtchn)
++{
++      struct vm_struct *ring;
++      int irq;
++
++      dev_dbg(&pdev->xdev->dev,
++              "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
++              gnt_ref, remote_evtchn);
++
++      ring = xenbus_map_ring_valloc(pdev->xdev, gnt_ref);
++      if (IS_ERR(ring))
++              return PTR_ERR(ring);
++
++      pdev->sh_area = ring;
++      pdev->sh_info = ring->addr;
++
++      irq = bind_interdomain_evtchn_to_irqhandler(
++              pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
++              IRQF_SAMPLE_RANDOM, "pciback", pdev);
++      if (irq < 0) {
++              xenbus_dev_fatal(pdev->xdev, irq,
++                               "Error binding event channel to IRQ");
++              return irq;
++      }
++      pdev->evtchn_irq = irq;
++
++      dev_dbg(&pdev->xdev->dev, "Attached!\n");
++
++      return 0;
++}
++
++/*
++ * Connect to the frontend once both ends are in XenbusStateInitialised.
++ *
++ * Reads the grant reference, event channel and protocol magic from the
++ * frontend's xenstore node, verifies the magic against XEN_PCI_MAGIC,
++ * attaches to the shared resources and switches to XenbusStateConnected.
++ * Runs under pdev->dev_lock.
++ *
++ * Returns 0 on success or when either end is not yet in the expected
++ * state; a negative errno otherwise (-EFAULT for a magic mismatch).
++ */
++static int pciback_attach(struct pciback_device *pdev)
++{
++      int err = 0;
++      int gnt_ref, remote_evtchn;
++      char *magic = NULL;
++
++      spin_lock(&pdev->dev_lock);
++
++      /* Make sure we only do this setup once */
++      if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++          XenbusStateInitialised)
++              goto out;
++
++      /* Wait for frontend to state that it has published the configuration */
++      if (xenbus_read_driver_state(pdev->xdev->otherend) !=
++          XenbusStateInitialised)
++              goto out;
++
++      dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
++
++      err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
++                          "pci-op-ref", "%u", &gnt_ref,
++                          "event-channel", "%u", &remote_evtchn,
++                          "magic", NULL, &magic, NULL);
++      if (err) {
++              /* If configuration didn't get read correctly, wait longer */
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error reading configuration from frontend");
++              goto out;
++      }
++
++      if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
++              /* Report the mismatch to the caller too, not only to xenbus */
++              err = -EFAULT;
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "version mismatch (%s/%s) with pcifront - "
++                               "halting pciback",
++                               magic, XEN_PCI_MAGIC);
++              goto out;
++      }
++
++      err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
++      if (err)
++              goto out;
++
++      dev_dbg(&pdev->xdev->dev, "Connecting...\n");
++
++      err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
++      if (err)
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error switching to connected state!");
++
++      dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
++      out:
++      spin_unlock(&pdev->dev_lock);
++
++      /* kfree(NULL) is a no-op, so no guard is needed */
++      kfree(magic);
++
++      return err;
++}
++
++/*
++ * Write the virtual address of device devid to this backend's xenstore
++ * node, under the key "vdev-<devid>".
++ * Returns the result of xenbus_printf(), or -ENOMEM if the key does not fit
++ * in the local buffer.
++ */
++static int pciback_publish_pci_dev(struct pciback_device *pdev,
++                                 unsigned int domain, unsigned int bus,
++                                 unsigned int devfn, unsigned int devid)
++{
++      char key[64];
++      int n;
++
++      n = snprintf(key, sizeof(key), "vdev-%d", devid);
++      if (unlikely(n >= (sizeof(key) - 1)))
++              return -ENOMEM;
++
++      return xenbus_printf(XBT_NIL, pdev->xdev->nodename, key,
++                           "%04x:%02x:%02x.%02x", domain, bus,
++                           PCI_SLOT(devfn), PCI_FUNC(devfn));
++}
++
++/*
++ * Export the physical device domain:bus:slot.func to the frontend as
++ * device number devid.
++ *
++ * The device is obtained from pcistub and added to this backend's device
++ * container, which publishes it through pciback_publish_pci_dev().
++ * Returns 0 on success, -EINVAL if the device cannot be obtained, or the
++ * error from pciback_add_pci_dev().
++ */
++static int pciback_export_device(struct pciback_device *pdev,
++                               int domain, int bus, int slot, int func,
++                               int devid)
++{
++      struct pci_dev *dev;
++
++      dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
++              domain, bus, slot, func);
++
++      dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
++      if (dev == NULL) {
++              xenbus_dev_fatal(pdev->xdev, -EINVAL,
++                               "Couldn't locate PCI device "
++                               "(%04x:%02x:%02x.%01x)! "
++                               "perhaps already in-use?",
++                               domain, bus, slot, func);
++              return -EINVAL;
++      }
++
++      /* TODO: It'd be nice to export a bridge and have all of its children
++       * get exported with it. This may be best done in xend (which will
++       * have to calculate resource usage anyway) but we probably want to
++       * put something in here to ensure that if a bridge gets given to a
++       * driver domain, that all devices under that bridge are not given
++       * to other driver domains (as he who controls the bridge can disable
++       * it and stop the other devices from working).
++       */
++      return pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev);
++}
++
++/*
++ * Stop exporting the device at domain:bus:slot.func. The address is
++ * resolved through pciback_get_pci_dev() and the device released from this
++ * backend. Returns 0, or -EINVAL when no device is found at that address.
++ */
++static int pciback_remove_device(struct pciback_device *pdev,
++                               int domain, int bus, int slot, int func)
++{
++      struct pci_dev *dev;
++
++      dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
++              domain, bus, slot, func);
++
++      dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
++      if (dev == NULL) {
++              dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
++                      "(%04x:%02x:%02x.%01x)! not owned by this domain\n",
++                      domain, bus, slot, func);
++              return -EINVAL;
++      }
++
++      pciback_release_pci_dev(pdev, dev);
++
++      return 0;
++}
++
++/*
++ * Record the PCI root domain:bus in this backend's xenstore node.
++ *
++ * Roots are stored as "root-<n>" keys with their count in "root_num". If
++ * the root is already listed nothing is written; otherwise it is appended
++ * and the count incremented.
++ * Returns 0 on success (or when already published), -ENOMEM if a key does
++ * not fit the local buffer, -EINVAL if an existing entry cannot be parsed,
++ * or the xenbus error.
++ */
++static int pciback_publish_pci_root(struct pciback_device *pdev,
++                                  unsigned int domain, unsigned int bus)
++{
++      unsigned int seen_dom, seen_bus;
++      int idx, count, n, rc;
++      char key[64];
++
++      dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
++
++      rc = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++                        "root_num", "%d", &count);
++      if (rc == 0 || rc == -ENOENT)
++              count = 0;
++      else if (rc < 0)
++              return rc;
++
++      /* Verify that we haven't already published this pci root */
++      for (idx = 0; idx < count; idx++) {
++              n = snprintf(key, sizeof(key), "root-%d", idx);
++              if (unlikely(n >= (sizeof(key) - 1)))
++                      return -ENOMEM;
++
++              rc = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++                                key, "%x:%x", &seen_dom, &seen_bus);
++              if (rc < 0)
++                      return rc;
++              if (rc != 2)
++                      return -EINVAL;
++
++              if (seen_dom == domain && seen_bus == bus)
++                      return 0;
++      }
++
++      n = snprintf(key, sizeof(key), "root-%d", count);
++      if (unlikely(n >= (sizeof(key) - 1)))
++              return -ENOMEM;
++
++      dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
++              count, domain, bus);
++
++      rc = xenbus_printf(XBT_NIL, pdev->xdev->nodename, key,
++                         "%04x:%02x", domain, bus);
++      if (rc)
++              return rc;
++
++      return xenbus_printf(XBT_NIL, pdev->xdev->nodename,
++                           "root_num", "%d", (count + 1));
++}
++
++/*
++ * Handle a hot-plug reconfiguration request.
++ *
++ * Runs only while this backend is in XenbusStateReconfiguring. For each of
++ * the "num_devs" devices the per-device "state-<i>" key is read:
++ *   - XenbusStateInitialising: export "dev-<i>", publish the PCI roots and
++ *     set the substate to XenbusStateInitialised;
++ *   - XenbusStateClosing: remove the device addressed by "vdev-<i>";
++ *   - anything else (including an unreadable key): skip.
++ * Finally the backend switches to XenbusStateReconfigured.
++ *
++ * Runs under pdev->dev_lock. Failures are reported via xenbus_dev_fatal()
++ * and stop processing; the function itself always returns 0.
++ */
++static int pciback_reconfigure(struct pciback_device *pdev)
++{
++      int err = 0;
++      int num_devs;
++      int domain, bus, slot, func;
++      int substate;
++      int i, len;
++      char state_str[64];
++      char dev_str[64];
++
++      spin_lock(&pdev->dev_lock);
++
++      dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
++
++      /* Make sure we only reconfigure once */
++      if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++          XenbusStateReconfiguring)
++              goto out;
++
++      err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
++                         &num_devs);
++      if (err != 1) {
++              if (err >= 0)
++                      err = -EINVAL;
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error reading number of devices");
++              goto out;
++      }
++
++      for (i = 0; i < num_devs; i++) {
++              len = snprintf(state_str, sizeof(state_str), "state-%d", i);
++              if (unlikely(len >= (sizeof(state_str) - 1))) {
++                      err = -ENOMEM;
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "String overflow while reading "
++                                       "configuration");
++                      goto out;
++              }
++              /* A missing or unparsable substate means "nothing to do" */
++              err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
++                                 "%d", &substate);
++              if (err != 1)
++                      substate = XenbusStateUnknown;
++
++              switch (substate) {
++              case XenbusStateInitialising:
++                      dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
++
++                      len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
++                      if (unlikely(len >= (sizeof(dev_str) - 1))) {
++                              err = -ENOMEM;
++                              xenbus_dev_fatal(pdev->xdev, err,
++                                               "String overflow while "
++                                               "reading configuration");
++                              goto out;
++                      }
++                      err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++                                         dev_str, "%x:%x:%x.%x",
++                                         &domain, &bus, &slot, &func);
++                      if (err < 0) {
++                              xenbus_dev_fatal(pdev->xdev, err,
++                                               "Error reading device "
++                                               "configuration");
++                              goto out;
++                      }
++                      if (err != 4) {
++                              err = -EINVAL;
++                              xenbus_dev_fatal(pdev->xdev, err,
++                                               "Error parsing pci device "
++                                               "configuration");
++                              goto out;
++                      }
++
++                      err = pciback_export_device(pdev, domain, bus, slot,
++                                                  func, i);
++                      if (err)
++                              goto out;
++
++                      /* Publish pci roots. */
++                      err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
++                      if (err) {
++                              xenbus_dev_fatal(pdev->xdev, err,
++                                               "Error while publish PCI root "
++                                               "buses for frontend");
++                              goto out;
++                      }
++
++                      err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
++                                          state_str, "%d",
++                                          XenbusStateInitialised);
++                      if (err) {
++                              xenbus_dev_fatal(pdev->xdev, err,
++                                               "Error switching substate of "
++                                               "dev-%d\n", i);
++                              goto out;
++                      }
++                      break;
++
++              case XenbusStateClosing:
++                      dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
++
++                      /* Removal is addressed by the published virtual address */
++                      len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
++                      if (unlikely(len >= (sizeof(dev_str) - 1))) {
++                              err = -ENOMEM;
++                              xenbus_dev_fatal(pdev->xdev, err,
++                                               "String overflow while "
++                                               "reading configuration");
++                              goto out;
++                      }
++                      err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++                                         dev_str, "%x:%x:%x.%x",
++                                         &domain, &bus, &slot, &func);
++                      if (err < 0) {
++                              xenbus_dev_fatal(pdev->xdev, err,
++                                               "Error reading device "
++                                               "configuration");
++                              goto out;
++                      }
++                      if (err != 4) {
++                              err = -EINVAL;
++                              xenbus_dev_fatal(pdev->xdev, err,
++                                               "Error parsing pci device "
++                                               "configuration");
++                              goto out;
++                      }
++
++                      err = pciback_remove_device(pdev, domain, bus, slot,
++                                                  func);
++                      if (err)
++                              goto out;
++
++                      /* TODO: If at some point we implement support for pci
++                       * root hot-remove on pcifront side, we'll need to
++                       * remove unnecessary xenstore nodes of pci roots here.
++                       */
++
++                      break;
++
++              default:
++                      break;
++              }
++      }
++
++      err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
++      if (err) {
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error switching to reconfigured state!");
++              goto out;
++      }
++
++      out:
++      spin_unlock(&pdev->dev_lock);
++
++      return 0;
++}
++
++/*
++ * otherend_changed callback: react to a state change of the frontend.
++ * The return values of the handlers called here are not examined.
++ */
++static void pciback_frontend_changed(struct xenbus_device *xdev,
++                                   enum xenbus_state fe_state)
++{
++      struct pciback_device *pdev = dev_get_drvdata(&xdev->dev);
++
++      dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
++
++      switch (fe_state) {
++      case XenbusStateInitialised:
++              pciback_attach(pdev);
++              break;
++
++      case XenbusStateReconfiguring:
++              pciback_reconfigure(pdev);
++              break;
++
++      case XenbusStateConnected:
++              /* pcifront switched its state from reconfiguring to connected.
++               * Then switch to connected state.
++               */
++              xenbus_switch_state(xdev, XenbusStateConnected);
++              break;
++
++      case XenbusStateClosing:
++              pciback_disconnect(pdev);
++              xenbus_switch_state(xdev, XenbusStateClosing);
++              break;
++
++      case XenbusStateClosed:
++              pciback_disconnect(pdev);
++              xenbus_switch_state(xdev, XenbusStateClosed);
++              /* Keep the device registered while it is still online */
++              if (xenbus_dev_is_online(xdev))
++                      break;
++              /* fall through if not online */
++      case XenbusStateUnknown:
++              dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
++              device_unregister(&xdev->dev);
++              break;
++
++      default:
++              break;
++      }
++}
++
++/*
++ * Initial backend setup, run while this backend is in XenbusStateInitWait.
++ *
++ * Reads "num_devs" from the backend's xenstore node, exports every
++ * "dev-<i>" listed there and marks its "state-<i>" substate as
++ * XenbusStateInitialised, publishes the PCI roots and switches the backend
++ * to XenbusStateInitialised. Runs under pdev->dev_lock.
++ *
++ * If nothing failed (including the case where the backend was not in
++ * InitWait and nothing was done), pciback_attach() is tried afterwards.
++ *
++ * Returns 0 on success or a negative errno; failures are also reported via
++ * xenbus_dev_fatal().
++ */
++static int pciback_setup_backend(struct pciback_device *pdev)
++{
++      /* Get configuration from xend (if available now) */
++      int domain, bus, slot, func;
++      int err = 0;
++      int i, num_devs;
++      char dev_str[64];
++      char state_str[64];
++
++      spin_lock(&pdev->dev_lock);
++
++      /* It's possible we could get the call to setup twice, so make sure
++       * we're not already connected.
++       */
++      if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++          XenbusStateInitWait)
++              goto out;
++
++      dev_dbg(&pdev->xdev->dev, "getting be setup\n");
++
++      err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
++                         &num_devs);
++      if (err != 1) {
++              /* Map "read nothing" to an error code as well */
++              if (err >= 0)
++                      err = -EINVAL;
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error reading number of devices");
++              goto out;
++      }
++
++      for (i = 0; i < num_devs; i++) {
++              int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
++              if (unlikely(l >= (sizeof(dev_str) - 1))) {
++                      err = -ENOMEM;
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "String overflow while reading "
++                                       "configuration");
++                      goto out;
++              }
++
++              err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
++                                 "%x:%x:%x.%x", &domain, &bus, &slot, &func);
++              if (err < 0) {
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "Error reading device configuration");
++                      goto out;
++              }
++              /* All four address fields must have been parsed */
++              if (err != 4) {
++                      err = -EINVAL;
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "Error parsing pci device "
++                                       "configuration");
++                      goto out;
++              }
++
++              err = pciback_export_device(pdev, domain, bus, slot, func, i);
++              if (err)
++                      goto out;
++
++              /* Switch substate of this device. */
++              l = snprintf(state_str, sizeof(state_str), "state-%d", i);
++              if (unlikely(l >= (sizeof(state_str) - 1))) {
++                      err = -ENOMEM;
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "String overflow while reading "
++                                       "configuration");
++                      goto out;
++              }
++              err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
++                                  "%d", XenbusStateInitialised);
++              if (err) {
++                      xenbus_dev_fatal(pdev->xdev, err, "Error switching "
++                                       "substate of dev-%d\n", i);
++                      goto out;
++              }       
++      }
++
++      err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
++      if (err) {
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error while publish PCI root buses "
++                               "for frontend");
++              goto out;
++      }
++
++      err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
++      if (err)
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error switching to initialised state!");
++
++      out:
++      spin_unlock(&pdev->dev_lock);
++
++      if (!err)
++              /* see if pcifront is already configured (if not, we'll wait) */
++              pciback_attach(pdev);
++
++      return err;
++}
++
++/*
++ * Watch callback for the backend's own xenstore node: while the backend is
++ * still in XenbusStateInitWait, attempt the initial setup. Any other state
++ * is ignored. vec and len are unused.
++ */
++static void pciback_be_watch(struct xenbus_watch *watch,
++                           const char **vec, unsigned int len)
++{
++      struct pciback_device *pdev;
++
++      pdev = container_of(watch, struct pciback_device, be_watch);
++
++      if (xenbus_read_driver_state(pdev->xdev->nodename) ==
++          XenbusStateInitWait)
++              pciback_setup_backend(pdev);
++}
++
++/*
++ * Probe callback: allocate the backend state, switch to
++ * XenbusStateInitWait, register a watch on the backend node and run the
++ * watch handler once by hand.
++ * Returns 0 on success, -ENOMEM if the state cannot be allocated, or the
++ * error from xenbus_switch_state() / xenbus_watch_path().
++ * NOTE(review): on the later error paths pdev is not released here -
++ * confirm whether the remove callback runs after a failed probe.
++ */
++static int pciback_xenbus_probe(struct xenbus_device *dev,
++                              const struct xenbus_device_id *id)
++{
++      struct pciback_device *pdev;
++      int rc;
++
++      pdev = alloc_pdev(dev);
++      if (!pdev) {
++              xenbus_dev_fatal(dev, -ENOMEM,
++                               "Error allocating pciback_device struct");
++              return -ENOMEM;
++      }
++
++      /* wait for xend to configure us */
++      rc = xenbus_switch_state(dev, XenbusStateInitWait);
++      if (rc)
++              return rc;
++
++      /* watch the backend node for backend configuration information */
++      rc = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
++                             pciback_be_watch);
++      if (rc)
++              return rc;
++      pdev->be_watching = 1;
++
++      /* We need to force a call to our callback here in case
++       * xend already configured us!
++       */
++      pciback_be_watch(&pdev->be_watch, NULL, 0);
++
++      return 0;
++}
++
++/*
++ * Remove callback: free the backend state attached to dev, if there is any.
++ * Always returns 0.
++ */
++static int pciback_xenbus_remove(struct xenbus_device *dev)
++{
++      struct pciback_device *pdev = dev_get_drvdata(&dev->dev);
++
++      if (!pdev)
++              return 0;
++
++      free_pdev(pdev);
++      return 0;
++}
++
++/* Device types handled by this driver; the list ends with an empty entry */
++static const struct xenbus_device_id xenpci_ids[] = {
++      {"pci"},
++      {{0}},
++};
++
++/* xenbus driver description tying the callbacks in this file together */
++static struct xenbus_driver xenbus_pciback_driver = {
++      .name                   = "pciback",
++      .ids                    = xenpci_ids,
++      .probe                  = pciback_xenbus_probe,
++      .remove                 = pciback_xenbus_remove,
++      .otherend_changed       = pciback_frontend_changed,
++};
++
++/*
++ * Create the driver workqueue and register the pciback xenbus backend.
++ *
++ * Returns 0 on success, -ENODEV when not running on Xen, -EFAULT if the
++ * workqueue cannot be created, or the error from
++ * xenbus_register_backend(). The workqueue is destroyed again if the
++ * registration fails, so nothing is left allocated on error.
++ */
++int __init pciback_xenbus_register(void)
++{
++      int err;
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++      pciback_wq = create_workqueue("pciback_workqueue");
++      if (!pciback_wq) {
++              pr_err("pciback_xenbus_register: create workqueue failed\n");
++              return -EFAULT;
++      }
++
++      err = xenbus_register_backend(&xenbus_pciback_driver);
++      if (err) {
++              /* Don't leak the workqueue when registration fails */
++              destroy_workqueue(pciback_wq);
++              pciback_wq = NULL;
++      }
++      return err;
++}
++
++/*
++ * Unregister the pciback xenbus backend and destroy its workqueue.
++ *
++ * The driver goes first: the remove path (pciback_xenbus_remove() ->
++ * free_pdev() -> pciback_disconnect()) flushes pciback_wq, so the
++ * workqueue must still exist while devices are being removed.
++ */
++void __exit pciback_xenbus_unregister(void)
++{
++      xenbus_unregister_driver(&xenbus_pciback_driver);
++      destroy_workqueue(pciback_wq);
++}
diff --cc drivers/xen/pcifront/Makefile

index 0000000,0000000..4ceb18a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pcifront/Makefile
@@@ -1,0 -1,0 +1,5 @@@
++obj-y += pcifront.o
++
++pcifront-y := pci_op.o xenbus.o pci.o
++
++ccflags-$(CONFIG_XEN_PCIDEV_FE_DEBUG) += -DDEBUG
diff --cc drivers/xen/pcifront/pci.c

index 0000000,0000000..4239f00

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pcifront/pci.c
@@@ -1,0 -1,0 +1,46 @@@
++/*
++ * PCI Frontend Operations - ensure only one PCI frontend runs at a time
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pcifront.h"
++
++/* Held by pcifront_connect()/pcifront_disconnect() around pcifront_dev. */
++DEFINE_SPINLOCK(pcifront_dev_lock);
++/* The one installed PCI frontend, or NULL while none is installed. */
++static struct pcifront_device *pcifront_dev = NULL;
++
++/*
++ * Install @pdev as the single active PCI frontend.
++ *
++ * Returns 0 on success, or -EEXIST when a frontend is already installed
++ * (the existing one is left untouched).
++ */
++int pcifront_connect(struct pcifront_device *pdev)
++{
++      int rc;
++
++      spin_lock(&pcifront_dev_lock);
++
++      if (pcifront_dev) {
++              dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
++              rc = -EEXIST;
++      } else {
++              dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
++              pcifront_dev = pdev;
++              rc = 0;
++      }
++
++      spin_unlock(&pcifront_dev_lock);
++
++      return rc;
++}
++
++/*
++ * Uninstall @pdev if it is the currently installed frontend; a call for
++ * any other device changes nothing.
++ */
++void pcifront_disconnect(struct pcifront_device *pdev)
++{
++      spin_lock(&pcifront_dev_lock);
++
++      /* Only the installed frontend may clear the slot. */
++      if (pcifront_dev != pdev)
++              goto unlock;
++
++      dev_info(&pdev->xdev->dev, "Disconnecting PCI Frontend Buses\n");
++      pcifront_dev = NULL;
++
++      unlock:
++      spin_unlock(&pcifront_dev_lock);
++}
diff --cc drivers/xen/pcifront/pci_op.c

index 0000000,0000000..6f78d9e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pcifront/pci_op.c
@@@ -1,0 -1,0 +1,672 @@@
++/*
++ * PCI Frontend Operations - Communicates with the backend (pciback)
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/version.h>
++#include <linux/init.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include <asm/bitops.h>
++#include <linux/time.h>
++#include <xen/evtchn.h>
++#include "pcifront.h"
++
++/* When non-zero, config-space reads and writes are logged via dev_info(). */
++static int verbose_request = 0;
++/* Exposed as an int module parameter with permissions 0644. */
++module_param(verbose_request, int, 0644);
++
++#ifdef __ia64__
++/*
++ * Initialise the sysdata for a root bus and import the resource windows
++ * the other end published in xenbus for that bus (ia64 only).
++ *
++ * @sd:     sysdata to fill in; zeroed first
++ * @domain: PCI segment of the root bus
++ * @bus:    bus number of the root bus
++ * @pdev:   owning frontend device, stored in sd->platform_data
++ *
++ * A missing or malformed xenbus node ends the import early; there is no
++ * return value, so the caller sees whatever windows were added so far.
++ * The temporary read buffer is released on every path that allocated it.
++ */
++static void pcifront_init_sd(struct pcifront_sd *sd,
++                           unsigned int domain, unsigned int bus,
++                           struct pcifront_device *pdev)
++{
++      int err, i, j, k, len, root_num, res_count;
++      struct acpi_resource res;
++      unsigned int d, b, byte;
++      unsigned long magic;
++      char str[64], tmp[3];
++      unsigned char *buf, *bufp;
++      u8 *ptr;
++
++      memset(sd, 0, sizeof(*sd));
++
++      sd->segment = domain;
++      sd->node = -1;  /* Revisit for NUMA */
++      sd->platform_data = pdev;
++
++      /* Look for resources for this controller in xenbus. */
++      err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "root_num",
++                         "%d", &root_num);
++      if (err != 1)
++              return;
++
++      /* Find the index i of the "root-<i>" node matching domain:bus. */
++      for (i = 0; i < root_num; i++) {
++              len = snprintf(str, sizeof(str), "root-%d", i);
++              if (unlikely(len >= (sizeof(str) - 1)))
++                      return;
++
++              err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
++                                 str, "%x:%x", &d, &b);
++              if (err != 2)
++                      return;
++
++              if (d == domain && b == bus)
++                      break;
++      }
++
++      if (i == root_num)
++              return;
++
++      len = snprintf(str, sizeof(str), "root-resource-magic");
++
++      err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
++                         str, "%lx", &magic);
++
++      if (err != 1)
++              return; /* No resources, nothing to do */
++
++      /*
++       * Each resource is published as two hex digits per byte of
++       * struct acpi_resource plus a terminator; refuse any other layout.
++       */
++      if (magic != (sizeof(res) * 2) + 1) {
++              pr_warning("pcifront: resource magic mismatch\n");
++              return;
++      }
++
++      len = snprintf(str, sizeof(str), "root-%d-resources", i);
++      if (unlikely(len >= (sizeof(str) - 1)))
++              return;
++
++      err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
++                         str, "%d", &res_count);
++
++      if (err != 1)
++              return; /* No resources, nothing to do */
++
++      sd->window = kzalloc(sizeof(*sd->window) * res_count, GFP_KERNEL);
++      if (!sd->window)
++              return;
++
++      /* magic is also the size of the byte stream in xenbus */
++      buf = kmalloc(magic, GFP_KERNEL);
++      if (!buf) {
++              kfree(sd->window);
++              sd->window = NULL;
++              return;
++      }
++
++      /* Read the resources out of xenbus */
++      for (j = 0; j < res_count; j++) {
++              memset(&res, 0, sizeof(res));
++              memset(buf, 0, magic);
++
++              len = snprintf(str, sizeof(str), "root-%d-resource-%d", i, j);
++              if (unlikely(len >= (sizeof(str) - 1)))
++                      break;  /* stop importing, but still free buf below */
++
++              /*
++               * NOTE(review): "%s" puts no bound on what is copied into
++               * buf; this relies on the other end honouring the magic size.
++               */
++              err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
++                                 "%s", buf);
++              if (err != 1) {
++                      pr_warning("pcifront: error reading "
++                                 "resource %d on bus %04x:%02x\n",
++                                 j, domain, bus);
++                      continue;
++              }
++
++              bufp = buf;
++              ptr = (u8 *)&res;
++              memset(tmp, 0, sizeof(tmp));
++
++              /* Copy ASCII byte stream into structure */
++              for (k = 0; k < magic - 1; k += 2) {
++                      memcpy(tmp, bufp, 2);
++                      bufp += 2;
++
++                      sscanf(tmp, "%02x", &byte);
++                      *ptr = byte;
++                      ptr++;
++              }
++
++              xen_add_resource(sd, domain, bus, &res);
++              sd->windows++;
++      }
++      kfree(buf);
++}
++#endif
++
++/*
++ * Translate a XEN_PCI_ERR_* status into the matching PCIBIOS_* code.
++ * Values without a mapping are returned unchanged.
++ */
++static int errno_to_pcibios_err(int errno)
++{
++      if (errno == XEN_PCI_ERR_success)
++              return PCIBIOS_SUCCESSFUL;
++
++      if (errno == XEN_PCI_ERR_dev_not_found)
++              return PCIBIOS_DEVICE_NOT_FOUND;
++
++      /* Both a bad offset and a failed op are reported the same way. */
++      if (errno == XEN_PCI_ERR_invalid_offset ||
++          errno == XEN_PCI_ERR_op_failed)
++              return PCIBIOS_BAD_REGISTER_NUMBER;
++
++      if (errno == XEN_PCI_ERR_not_implemented)
++              return PCIBIOS_FUNC_NOT_SUPPORTED;
++
++      if (errno == XEN_PCI_ERR_access_denied)
++              return PCIBIOS_SET_FAILED;
++
++      return errno;
++}
++
++/*
++ * Queue the AER work item when _XEN_PCIB_active is set in the shared page
++ * and _PDEVB_op_active was not already set in pdev->flags. The second
++ * test also sets that bit, so only one caller queues the work.
++ */
++static inline void schedule_pcifront_aer_op(struct pcifront_device *pdev)
++{
++      /* No request flagged in the shared page: nothing to do. */
++      if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags))
++              return;
++
++      /* The AER job is already marked active. */
++      if (test_and_set_bit(_PDEVB_op_active, &pdev->flags))
++              return;
++
++      dev_dbg(&pdev->xdev->dev, "schedule aer frontend job\n");
++      schedule_work(&pdev->op_work);
++}
++
++/*
++ * Submit one operation through the shared page and wait for the other
++ * end to complete it.
++ *
++ * @pdev: frontend device owning the shared page and event channel
++ * @op:   request on entry; overwritten with the completed operation
++ *        (including any returned value) on success
++ *
++ * Runs with pdev->sh_info_lock held and interrupts disabled for the whole
++ * exchange. Returns op->err as filled in by the other end, or
++ * XEN_PCI_ERR_dev_not_found when no completion is seen within the timeout
++ * (in which case @op is left unmodified).
++ */
++static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
++{
++      int err = 0;
++      struct xen_pci_op *active_op = &pdev->sh_info->op;
++      unsigned long irq_flags;
++      evtchn_port_t port = pdev->evtchn;
++      s64 ns, ns_timeout;
++      struct timeval tv;
++
++      spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
++
++      /* Publish the request in the shared page. */
++      memcpy(active_op, op, sizeof(struct xen_pci_op));
++
++      /* Go: the request must be written before the active flag is set */
++      wmb();
++      set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
++      notify_remote_via_evtchn(port);
++
++      /*
++       * We set a poll timeout of 3 seconds but give up on return after
++       * 2 seconds. It is better to time out too late rather than too early
++       * (in the latter case we end up continually re-executing poll() with a
++       * timeout in the past). 1s difference gives plenty of slack for error.
++       */
++      do_gettimeofday(&tv);
++      ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC;
++
++      clear_evtchn(port);
++
++      /* The flag stays set until the request has been completed. */
++      while (test_bit(_XEN_PCIF_active,
++                      (unsigned long *)&pdev->sh_info->flags)) {
++              if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
++                      BUG();
++              clear_evtchn(port);
++              do_gettimeofday(&tv);
++              ns = timeval_to_ns(&tv);
++              if (ns > ns_timeout) {
++                      dev_err(&pdev->xdev->dev,
++                              "pciback not responding!!!\n");
++                      /* Withdraw the request before giving up. */
++                      clear_bit(_XEN_PCIF_active,
++                                (unsigned long *)&pdev->sh_info->flags);
++                      err = XEN_PCI_ERR_dev_not_found;
++                      goto out;
++              }
++      }
++
++      /*
++       * We might have lost a backend service request, since the same
++       * evtchn is used for the pci_conf backend response. So re-schedule
++       * the aer pcifront service.
++       */
++      if (test_bit(_XEN_PCIB_active, 
++                      (unsigned long*)&pdev->sh_info->flags)) {
++              dev_err(&pdev->xdev->dev, 
++                      "schedule aer pcifront service\n");
++              schedule_pcifront_aer_op(pdev);
++      }
++
++      /* Hand the completed operation back to the caller. */
++      memcpy(op, active_op, sizeof(struct xen_pci_op));
++
++      err = op->err;
++      out:
++      spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
++      return err;
++}
++
++/*
++ * Config-space read callback: issue XEN_PCI_OP_conf_read through
++ * do_pci_op() and store the returned value in *val. Returns a PCIBIOS_*
++ * code via errno_to_pcibios_err().
++ *
++ * Access to this function is spinlocked in drivers/pci/access.c
++ */
++static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
++                           int where, int size, u32 * val)
++{
++      struct xen_pci_op op = {
++              .cmd    = XEN_PCI_OP_conf_read,
++              .domain = pci_domain_nr(bus),
++              .bus    = bus->number,
++              .devfn  = devfn,
++              .offset = where,
++              .size   = size,
++      };
++      struct pcifront_sd *sd = bus->sysdata;
++      struct pcifront_device *pdev = pcifront_get_pdev(sd);
++      int rc;
++
++      if (verbose_request)
++              dev_info(&pdev->xdev->dev,
++                       "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
++                       pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
++                       PCI_FUNC(devfn), where, size);
++
++      rc = do_pci_op(pdev, &op);
++      if (unlikely(rc)) {
++              if (rc == -ENODEV) {
++                      /* No device here, pretend that it just returned 0 */
++                      rc = 0;
++                      *val = 0;
++              }
++              return errno_to_pcibios_err(rc);
++      }
++
++      if (verbose_request)
++              dev_info(&pdev->xdev->dev, "read got back value %x\n",
++                       op.value);
++
++      *val = op.value;
++
++      return errno_to_pcibios_err(rc);
++}
++
++/*
++ * Config-space write callback: issue XEN_PCI_OP_conf_write through
++ * do_pci_op() and return the outcome as a PCIBIOS_* code.
++ *
++ * Access to this function is spinlocked in drivers/pci/access.c
++ */
++static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
++                            int where, int size, u32 val)
++{
++      struct xen_pci_op op = {
++              .cmd    = XEN_PCI_OP_conf_write,
++              .domain = pci_domain_nr(bus),
++              .bus    = bus->number,
++              .devfn  = devfn,
++              .offset = where,
++              .size   = size,
++              .value  = val,
++      };
++      struct pcifront_sd *sd = bus->sysdata;
++      struct pcifront_device *pdev = pcifront_get_pdev(sd);
++      int rc;
++
++      if (verbose_request)
++              dev_info(&pdev->xdev->dev,
++                       "write dev=%04x:%02x:%02x.%01x - "
++                       "offset %x size %d val %x\n",
++                       pci_domain_nr(bus), bus->number,
++                       PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
++
++      rc = do_pci_op(pdev, &op);
++
++      return errno_to_pcibios_err(rc);
++}
++
++/*
++ * Config-space accessors for frontend buses; passed to
++ * pci_scan_bus_parented() in pcifront_scan_root().
++ */
++struct pci_ops pcifront_bus_ops = {
++      .read = pcifront_bus_read,
++      .write = pcifront_bus_write,
++};
++
++#ifdef CONFIG_PCI_MSI
++/*
++ * Request MSI-X for @dev through do_pci_op().
++ *
++ * @entries: @nvec entries; .entry/.vector are sent with the request and
++ *           .vector is overwritten with the returned vectors on success
++ * @nvec:    number of entries, at most SH_INFO_MAX_VEC
++ *
++ * Returns 0 on success, -EINVAL when @nvec is too large, the do_pci_op()
++ * error, or the non-zero op.value reported with the completed request.
++ */
++int pci_frontend_enable_msix(struct pci_dev *dev,
++              struct msix_entry *entries,
++              int nvec)
++{
++      int rc;
++      int idx;
++      struct xen_pci_op op = {
++              .cmd    = XEN_PCI_OP_enable_msix,
++              .domain = pci_domain_nr(dev->bus),
++              .bus = dev->bus->number,
++              .devfn = dev->devfn,
++              .value = nvec,
++      };
++      struct pcifront_sd *sd = dev->bus->sysdata;
++      struct pcifront_device *pdev = pcifront_get_pdev(sd);
++
++      if (nvec > SH_INFO_MAX_VEC) {
++              pr_warning("too many vectors (%#x) for pci frontend\n", nvec);
++              return -EINVAL;
++      }
++
++      /* Copy the caller's table into the request. */
++      for (idx = 0; idx < nvec; idx++) {
++              op.msix_entries[idx].entry = entries[idx].entry;
++              op.msix_entries[idx].vector = entries[idx].vector;
++      }
++
++      rc = do_pci_op(pdev, &op);
++      if (rc) {
++              pr_err("enable msix err %#x\n", rc);
++              return rc;
++      }
++
++      if (op.value) {
++              pr_err("enable msix get value %#x\n", op.value);
++              return op.value;
++      }
++
++      /* we get the result */
++      for (idx = 0; idx < nvec; idx++)
++              entries[idx].vector = op.msix_entries[idx].vector;
++
++      return 0;
++}
++
++/*
++ * Request that MSI-X be disabled for @dev. A failure is only logged; the
++ * function has no way to report it to the caller.
++ */
++void pci_frontend_disable_msix(struct pci_dev* dev)
++{
++      struct xen_pci_op op = {
++              .cmd    = XEN_PCI_OP_disable_msix,
++              .domain = pci_domain_nr(dev->bus),
++              .bus = dev->bus->number,
++              .devfn = dev->devfn,
++      };
++      struct pcifront_sd *sd = dev->bus->sysdata;
++      struct pcifront_device *pdev = pcifront_get_pdev(sd);
++      int rc = do_pci_op(pdev, &op);
++
++      /* What should do for error ? */
++      if (rc != 0)
++              pr_err("disable msix err %#x\n", rc);
++}
++
++/*
++ * Request MSI for @dev. On success dev->irq is set to the value returned
++ * with the completed request and 0 is returned; any failure is logged and
++ * reported as -EINVAL.
++ */
++int pci_frontend_enable_msi(struct pci_dev *dev)
++{
++      struct xen_pci_op op = {
++              .cmd    = XEN_PCI_OP_enable_msi,
++              .domain = pci_domain_nr(dev->bus),
++              .bus = dev->bus->number,
++              .devfn = dev->devfn,
++      };
++      struct pcifront_sd *sd = dev->bus->sysdata;
++      struct pcifront_device *pdev = pcifront_get_pdev(sd);
++      int rc;
++
++      rc = do_pci_op(pdev, &op);
++      if (unlikely(rc)) {
++              pr_err("pci frontend enable msi failed for dev %x:%x\n",
++                     op.bus, op.devfn);
++              return -EINVAL;
++      }
++
++      dev->irq = op.value;
++
++      return 0;
++}
++
++/*
++ * Request that MSI be disabled for @dev. On success dev->irq is set to
++ * the value returned with the completed request; failures are only
++ * logged and leave dev->irq unchanged.
++ */
++void pci_frontend_disable_msi(struct pci_dev* dev)
++{
++      struct xen_pci_op op = {
++              .cmd    = XEN_PCI_OP_disable_msi,
++              .domain = pci_domain_nr(dev->bus),
++              .bus = dev->bus->number,
++              .devfn = dev->devfn,
++      };
++      struct pcifront_sd *sd = dev->bus->sysdata;
++      struct pcifront_device *pdev = pcifront_get_pdev(sd);
++      int rc = do_pci_op(pdev, &op);
++
++      if (rc == XEN_PCI_ERR_dev_not_found) {
++              /* XXX No response from backend, what shall we do? */
++              pr_err("no response from backend for disable MSI\n");
++              return;
++      }
++
++      if (unlikely(rc)) {
++              /* how can pciback notify us fail? */
++              pr_err("got bogus response from backend\n");
++              return;
++      }
++
++      dev->irq = op.value;
++}
++#endif /* CONFIG_PCI_MSI */
++
++/*
++ * pci_walk_bus() callback: claim resources for the PCI frontend as-is,
++ * backend won't allow changes. Only resources that have no parent yet and
++ * a non-zero start and flags are claimed. @data is the pcifront_device,
++ * used for logging. Always returns 0.
++ */
++static int __devinit pcifront_claim_resource(struct pci_dev *dev, void *data)
++{
++      struct pcifront_device *pdev = data;
++      int idx;
++
++      for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) {
++              struct resource *res = &dev->resource[idx];
++
++              /* Skip resources that are already claimed or unused. */
++              if (res->parent || !res->start || !res->flags)
++                      continue;
++
++              dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
++                      pci_name(dev), idx);
++              pci_claim_resource(dev, idx);
++      }
++
++      return 0;
++}
++
++/*
++ * Create and scan the frontend root bus @domain:@bus, record it on
++ * pdev->root_buses, claim its devices' resources and add the devices.
++ *
++ * Returns 0 on success, -EINVAL for a non-zero domain when the kernel is
++ * built without CONFIG_PCI_DOMAINS, or -ENOMEM when an allocation or the
++ * bus scan fails. On failure the bus entry and sysdata are freed.
++ */
++int __devinit pcifront_scan_root(struct pcifront_device *pdev,
++                               unsigned int domain, unsigned int bus)
++{
++      struct pci_bus *b;
++      struct pcifront_sd *sd = NULL;
++      struct pci_bus_entry *bus_entry = NULL;
++      int err = 0;
++
++#ifndef CONFIG_PCI_DOMAINS
++      if (domain != 0) {
++              dev_err(&pdev->xdev->dev,
++                      "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
++              dev_err(&pdev->xdev->dev,
++                      "Please compile with CONFIG_PCI_DOMAINS\n");
++              err = -EINVAL;
++              goto err_out;
++      }
++#endif
++
++      dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
++               domain, bus);
++
++      /* Both allocations are checked together; kfree(NULL) is harmless. */
++      bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
++      sd = kmalloc(sizeof(*sd), GFP_KERNEL);
++      if (!bus_entry || !sd) {
++              err = -ENOMEM;
++              goto err_out;
++      }
++      pcifront_init_sd(sd, domain, bus, pdev);
++
++      /* sd becomes the sysdata of the new bus. */
++      b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
++                                &pcifront_bus_ops, sd);
++      if (!b) {
++              dev_err(&pdev->xdev->dev,
++                      "Error creating PCI Frontend Bus!\n");
++              err = -ENOMEM;
++              goto err_out;
++      }
++
++      pcifront_setup_root_resources(b, sd);
++      bus_entry->bus = b;
++
++      /* Remember the bus so pcifront_free_roots() can tear it down. */
++      list_add(&bus_entry->list, &pdev->root_buses);
++
++      /* Claim resources before going "live" with our devices */
++      pci_walk_bus(b, pcifront_claim_resource, pdev);
++
++      pci_bus_add_devices(b);
++
++      return 0;
++
++      err_out:
++      kfree(bus_entry);
++      kfree(sd);
++
++      return err;
++}
++
++/*
++ * Rescan root bus @domain:@bus for functions that are not known yet and
++ * add them. An unknown bus is created via pcifront_scan_root() instead.
++ *
++ * Returns 0 on success, -EINVAL for a non-zero domain when the kernel is
++ * built without CONFIG_PCI_DOMAINS, or the pcifront_scan_root() result
++ * when the bus had to be created.
++ */
++int __devinit pcifront_rescan_root(struct pcifront_device *pdev,
++                                 unsigned int domain, unsigned int bus)
++{
++      struct pci_bus *b;
++      struct pci_dev *d;
++      unsigned int devfn;
++
++#ifndef CONFIG_PCI_DOMAINS
++      if (domain != 0) {
++              dev_err(&pdev->xdev->dev,
++                      "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
++              dev_err(&pdev->xdev->dev,
++                      "Please compile with CONFIG_PCI_DOMAINS\n");
++              return -EINVAL;
++      }
++#endif
++
++      dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
++               domain, bus);
++
++      b = pci_find_bus(domain, bus);
++      if(!b)
++              /* If the bus is unknown, create it. */
++              return pcifront_scan_root(pdev, domain, bus);
++
++      /* Rescan the bus for newly attached functions and add.
++       * We omit handling of PCI bridge attachment because pciback prevents
++       * bridges from being exported.
++       */
++      for (devfn = 0; devfn < 0x100; devfn++) {
++              d = pci_get_slot(b, devfn);
++              if(d) {
++                      /* Device is already known; drop the lookup reference. */
++                      pci_dev_put(d);
++                      continue;
++              }
++
++              d = pci_scan_single_device(b, devfn);
++              if (d)
++                      dev_info(&pdev->xdev->dev, "New device on "
++                               "%04x:%02x:%02x.%02x found.\n", domain, bus,
++                               PCI_SLOT(devfn), PCI_FUNC(devfn));
++      }
++
++      /* Claim resources before going "live" with our devices */
++      pci_walk_bus(b, pcifront_claim_resource, pdev);
++
++      /* Create SysFS and notify udev of the devices. Aka: "going live" */
++      pci_bus_add_devices(b);
++
++      return 0;
++}
++
++/*
++ * Remove every device on @bus, always taking the current head of the
++ * bus's device list until that list is empty.
++ */
++static void free_root_bus_devs(struct pci_bus *bus)
++{
++      struct list_head *devices = &bus->devices;
++
++      while (!list_empty(devices)) {
++              struct pci_dev *victim = container_of(devices->next,
++                                                    struct pci_dev,
++                                                    bus_list);
++
++              dev_dbg(&victim->dev, "removing device\n");
++              pci_remove_bus_device(victim);
++      }
++}
++
++/*
++ * Tear down every root bus recorded on pdev->root_buses: remove its
++ * devices, free its sysdata, unregister its bridge device, remove the bus
++ * and free the list entry.
++ */
++void pcifront_free_roots(struct pcifront_device *pdev)
++{
++      struct pci_bus_entry *bus_entry, *t;
++
++      dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
++
++      /* The _safe iterator is needed because entries are freed in the loop. */
++      list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
++              list_del(&bus_entry->list);
++
++              free_root_bus_devs(bus_entry->bus);
++
++              /* sysdata is the pcifront_sd allocated in pcifront_scan_root() */
++              kfree(bus_entry->bus->sysdata);
++
++              device_unregister(bus_entry->bus->bridge);
++              pci_remove_bus(bus_entry->bus);
++
++              kfree(bus_entry);
++      }
++}
++
++/*
++ * Dispatch one AER request from the shared page to the error handlers of
++ * the driver bound to the affected device.
++ *
++ * @cmd:   XEN_PCI_OP_aer_* operation to perform
++ * @pdev:  frontend device whose shared page names the bus/devfn
++ * @state: channel state passed on to the driver's error_detected() hook
++ *
++ * Returns the handler's verdict, or PCI_ERS_RESULT_NONE when the device,
++ * its driver, its error handlers or the requested hook is missing, or
++ * when @cmd is not a known AER operation. The device reference taken for
++ * the lookup is dropped before returning.
++ */
++static pci_ers_result_t pcifront_common_process(int cmd,
++      struct pcifront_device *pdev, pci_channel_state_t state)
++{
++      pci_ers_result_t result = PCI_ERS_RESULT_NONE;
++      struct pci_driver *pdrv;
++      int bus = pdev->sh_info->aer_op.bus;
++      int devfn = pdev->sh_info->aer_op.devfn;
++      struct pci_dev *pcidev;
++
++      dev_dbg(&pdev->xdev->dev,
++              "pcifront AER process: cmd %x (bus:%x, devfn%x)",
++              cmd, bus, devfn);
++
++      pcidev = pci_get_bus_and_slot(bus, devfn);
++      if (!pcidev || !pcidev->driver) {
++              pci_dev_put(pcidev);
++              dev_err(&pdev->xdev->dev, "AER device or driver is NULL\n");
++              return result;
++      }
++      pdrv = pcidev->driver;
++
++      if (get_driver(&pdrv->driver)) {
++              const struct pci_error_handlers *eh = pdrv->err_handler;
++
++              /*
++               * As before, drivers without an error_detected() hook are
++               * not called at all. The other hooks are optional, so each
++               * one is checked before it is invoked.
++               */
++              if (eh && eh->error_detected) {
++                      dev_dbg(&pcidev->dev,
++                              "trying to call AER service\n");
++                      switch (cmd) {
++                      case XEN_PCI_OP_aer_detected:
++                              result = eh->error_detected(pcidev, state);
++                              break;
++                      case XEN_PCI_OP_aer_mmio:
++                              if (eh->mmio_enabled)
++                                      result = eh->mmio_enabled(pcidev);
++                              break;
++                      case XEN_PCI_OP_aer_slotreset:
++                              if (eh->slot_reset)
++                                      result = eh->slot_reset(pcidev);
++                              break;
++                      case XEN_PCI_OP_aer_resume:
++                              if (eh->resume)
++                                      eh->resume(pcidev);
++                              break;
++                      default:
++                              dev_err(&pdev->xdev->dev,
++                                      "bad request in aer recovery operation!\n");
++                              break;
++                      }
++              }
++              put_driver(&pdrv->driver);
++      }
++
++      /* Drop the reference taken by pci_get_bus_and_slot(). */
++      pci_dev_put(pcidev);
++
++      return result;
++}
++
++
++/*
++ * Work handler for AER requests: run the request found in the shared
++ * page, store the result in aer_op.err, clear _XEN_PCIB_active, notify the
++ * other end, and finally release _PDEVB_op_active so that a further
++ * request can be scheduled.
++ */
++void pcifront_do_aer(struct work_struct *data)
++{
++      struct pcifront_device *pdev = container_of(data, struct pcifront_device, op_work);
++      int cmd = pdev->sh_info->aer_op.cmd;
++      pci_channel_state_t state = 
++              (pci_channel_state_t)pdev->sh_info->aer_op.err;
++
++      /*
++       * If a pci_conf op is in progress, we have to wait until it is done
++       * before servicing the aer op.
++       * NOTE(review): no explicit wait is visible in this function - confirm
++       * where that ordering is enforced.
++       */
++      dev_dbg(&pdev->xdev->dev, 
++              "pcifront service aer bus %x devfn %x\n", pdev->sh_info->aer_op.bus,
++              pdev->sh_info->aer_op.devfn);
++
++      pdev->sh_info->aer_op.err = pcifront_common_process(cmd, pdev, state);
++
++      /* The result must be written before the active flag is cleared. */
++      wmb();
++      clear_bit(_XEN_PCIB_active, (unsigned long*)&pdev->sh_info->flags);
++      notify_remote_via_evtchn(pdev->evtchn);
++
++      /* In case we lost an aer request in the time window above. */
++      smp_mb__before_clear_bit();
++      clear_bit( _PDEVB_op_active, &pdev->flags);
++      smp_mb__after_clear_bit();
++
++      /* Re-check: a new request may have arrived while we were busy. */
++      schedule_pcifront_aer_op(pdev);
++
++}
++
++/*
++ * Interrupt handler: @dev is the pcifront_device. Schedules the AER work
++ * if a request is pending and always reports the interrupt as handled.
++ */
++irqreturn_t pcifront_handler_aer(int irq, void *dev)
++{
++      schedule_pcifront_aer_op((struct pcifront_device *)dev);
++
++      return IRQ_HANDLED;
++}
diff --cc drivers/xen/pcifront/pcifront.h

index 0000000,0000000..024090f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pcifront/pcifront.h
@@@ -1,0 -1,0 +1,57 @@@
++/*
++ * PCI Frontend - Common data structures & function declarations
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#ifndef __XEN_PCIFRONT_H__
++#define __XEN_PCIFRONT_H__
++
++#include <linux/slab.h>
++#include <linux/spinlock.h>
++#include <linux/pci.h>
++#include <xen/xenbus.h>
++#include <xen/interface/io/pciif.h>
++#include <linux/interrupt.h>
++#include <xen/pcifront.h>
++#include <asm/atomic.h>
++#include <linux/workqueue.h>
++
++/* One root bus owned by a frontend; linked on pcifront_device.root_buses. */
++struct pci_bus_entry {
++      struct list_head list;
++      struct pci_bus *bus;
++};
++
++/* Bit number and mask in pcifront_device.flags: an AER job is active. */
++#define _PDEVB_op_active              (0)
++#define PDEVB_op_active               (1 << (_PDEVB_op_active))
++
++/* Per-frontend state, stored as driver data on the xenbus device. */
++struct pcifront_device {
++      struct xenbus_device *xdev;
++      /* List of struct pci_bus_entry, one per root bus */
++      struct list_head root_buses;
++      spinlock_t dev_lock;
++
++      /* Event channel port used to notify and poll for the other end */
++      int evtchn;
++      int gnt_ref;
++      int irq;
++
++      /* Lock this when doing any operations in sh_info */
++      spinlock_t sh_info_lock;
++      /* Page holding the request/response area and flags */
++      struct xen_pci_sharedinfo *sh_info;
++      /* AER work item, run by pcifront_do_aer() */
++      struct work_struct op_work;
++      /* Holds the _PDEVB_* bits */
++      unsigned long flags;
++
++};
++
++/* pci.c: install/uninstall the single active frontend */
++int pcifront_connect(struct pcifront_device *pdev);
++void pcifront_disconnect(struct pcifront_device *pdev);
++
++/* pci_op.c: root bus creation, rescan and teardown */
++int pcifront_scan_root(struct pcifront_device *pdev,
++                     unsigned int domain, unsigned int bus);
++int pcifront_rescan_root(struct pcifront_device *pdev,
++                       unsigned int domain, unsigned int bus);
++void pcifront_free_roots(struct pcifront_device *pdev);
++
++/* pci_op.c: AER work handler */
++void pcifront_do_aer(struct work_struct *data);
++
++/* pci_op.c: interrupt handler that schedules the AER work */
++irqreturn_t pcifront_handler_aer(int irq, void *dev);
++
++#endif        /* __XEN_PCIFRONT_H__ */
diff --cc drivers/xen/pcifront/xenbus.c

index 0000000,0000000..413deed

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/pcifront/xenbus.c
@@@ -1,0 -1,0 +1,478 @@@
++/*
++ * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/mm.h>
++#include <xen/xenbus.h>
++#include <xen/evtchn.h>
++#include <xen/gnttab.h>
++#include "pcifront.h"
++
++#define INVALID_GRANT_REF (0)
++#define INVALID_EVTCHN    (-1)
++
++/*
++ * Allocate and initialise the frontend state for @xdev.
++ *
++ * Two allocations are made: the pcifront_device itself and one page that
++ * holds the xen_pci_sharedinfo area.  The page is requested zero-filled
++ * because pcifront_publish_info() grants it to the backend; a page taken
++ * with __get_free_page() would hand whatever stale kernel data it contains
++ * to the other domain.
++ *
++ * On success the new structure is also installed as driver data of @xdev.
++ *
++ * Returns the new device, or NULL when either allocation fails.
++ */
++static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
++{
++      struct pcifront_device *pdev;
++
++      pdev = kzalloc(sizeof(*pdev), GFP_KERNEL);
++      if (pdev == NULL)
++              goto out;
++
++      /* Zeroed page, so sh_info->flags starts out as 0 as well. */
++      pdev->sh_info =
++          (struct xen_pci_sharedinfo *)get_zeroed_page(GFP_KERNEL);
++      if (pdev->sh_info == NULL) {
++              kfree(pdev);
++              pdev = NULL;
++              goto out;
++      }
++
++      /*Flag for registering PV AER handler*/
++      set_bit(_XEN_PCIB_AERHANDLER, (void*)&pdev->sh_info->flags);
++
++      dev_set_drvdata(&xdev->dev, pdev);
++      pdev->xdev = xdev;
++
++      INIT_LIST_HEAD(&pdev->root_buses);
++
++      spin_lock_init(&pdev->dev_lock);
++      spin_lock_init(&pdev->sh_info_lock);
++
++      /* Nothing is granted or bound yet; free_pdev() relies on these. */
++      pdev->evtchn = INVALID_EVTCHN;
++      pdev->gnt_ref = INVALID_GRANT_REF;
++      pdev->irq = -1;
++
++      INIT_WORK(&pdev->op_work, pcifront_do_aer);
++
++      dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
++              pdev, pdev->sh_info);
++      out:
++      return pdev;
++}
++
++/*
++ * Tear down a device created by alloc_pdev(): release its PCI roots, wait
++ * for pending AER work, drop the IRQ binding and event channel if they were
++ * set up, give back the shared-info page and finally free the structure.
++ */
++static void free_pdev(struct pcifront_device *pdev)
++{
++      struct xenbus_device *xdev = pdev->xdev;
++
++      dev_dbg(&xdev->dev, "freeing pdev @ 0x%p\n", pdev);
++
++      pcifront_free_roots(pdev);
++
++      /* Wait for a queued PCIe AER job before releasing its resources. */
++      flush_work_sync(&pdev->op_work);
++
++      if (pdev->irq > 0)
++              unbind_from_irqhandler(pdev->irq, pdev);
++
++      if (pdev->evtchn != INVALID_EVTCHN)
++              xenbus_free_evtchn(xdev, pdev->evtchn);
++
++      /*
++       * A page that was never granted is freed directly; otherwise the
++       * page address is handed to gnttab_end_foreign_access() together
++       * with the grant reference.
++       */
++      if (pdev->gnt_ref == INVALID_GRANT_REF)
++              free_page((unsigned long)pdev->sh_info);
++      else
++              gnttab_end_foreign_access(pdev->gnt_ref,
++                                        (unsigned long)pdev->sh_info);
++
++      dev_set_drvdata(&xdev->dev, NULL);
++      kfree(pdev);
++}
++
++/*
++ * Make the shared-info page and event channel available to the backend.
++ *
++ * Grants the page, allocates an event channel, binds it to
++ * pcifront_handler_aer() and then writes "pci-op-ref", "event-channel" and
++ * "magic" under our xenstore node in one transaction, which is restarted
++ * when it ends with -EAGAIN.  On success the frontend switches to the
++ * Initialised state.
++ *
++ * Returns 0 on success or a negative error code; anything acquired before a
++ * failure stays recorded in @pdev.
++ */
++static int pcifront_publish_info(struct pcifront_device *pdev)
++{
++      struct xenbus_device *xdev = pdev->xdev;
++      struct xenbus_transaction trans;
++      int err;
++
++      err = xenbus_grant_ring(xdev, virt_to_mfn(pdev->sh_info));
++      if (err < 0)
++              goto out;
++      pdev->gnt_ref = err;
++
++      err = xenbus_alloc_evtchn(xdev, &pdev->evtchn);
++      if (err)
++              goto out;
++
++      err = bind_caller_port_to_irqhandler(pdev->evtchn,
++                                           pcifront_handler_aer,
++                                           IRQF_SAMPLE_RANDOM,
++                                           "pcifront", pdev);
++      if (err < 0) {
++              xenbus_dev_fatal(xdev, err,
++                               "Failed to bind event channel");
++              goto out;
++      }
++      pdev->irq = err;
++
++      do_publish:
++      err = xenbus_transaction_start(&trans);
++      if (err) {
++              xenbus_dev_fatal(xdev, err,
++                               "Error writing configuration for backend "
++                               "(start transaction)");
++              goto out;
++      }
++
++      /* Stop writing keys at the first failure. */
++      err = xenbus_printf(trans, xdev->nodename,
++                          "pci-op-ref", "%u", pdev->gnt_ref);
++      if (err == 0) {
++              err = xenbus_printf(trans, xdev->nodename,
++                                  "event-channel", "%u", pdev->evtchn);
++              if (err == 0)
++                      err = xenbus_printf(trans, xdev->nodename,
++                                          "magic", XEN_PCI_MAGIC);
++      }
++
++      if (err) {
++              /* Abort the transaction. */
++              xenbus_transaction_end(trans, 1);
++              xenbus_dev_fatal(xdev, err,
++                               "Error writing configuration for backend");
++              goto out;
++      }
++
++      err = xenbus_transaction_end(trans, 0);
++      if (err == -EAGAIN)
++              goto do_publish;
++      if (err) {
++              xenbus_dev_fatal(xdev, err,
++                               "Error completing transaction "
++                               "for backend");
++              goto out;
++      }
++
++      xenbus_switch_state(xdev, XenbusStateInitialised);
++
++      dev_dbg(&xdev->dev, "publishing successful!\n");
++
++      out:
++      return err;
++}
++
++/*
++ * Connect the frontend once the backend reports Connected.
++ *
++ * Acts only while our own xenbus state is Initialised, so the connection is
++ * made once.  After pcifront_connect() the PCI roots published by the
++ * backend are scanned: "root_num" gives the count and each "root-<n>" key
++ * holds "<domain>:<bus>" in hex.  If the backend has no "root_num" key,
++ * root 0000:00 is scanned instead.  When every scan succeeds the frontend
++ * switches to the Connected state.
++ *
++ * Returns 0 on success and a negative error code otherwise (-EFAULT when the
++ * frontend is not in the Initialised state).
++ *
++ * NOTE(review): dev_lock is a spinlock held across the xenbus calls below;
++ * whether those may sleep cannot be told from this file - confirm.
++ */
++static int __devinit pcifront_try_connect(struct pcifront_device *pdev)
++{
++      int err = -EFAULT;
++      int i, num_roots, len;
++      char str[64];
++      unsigned int domain, bus;
++
++      spin_lock(&pdev->dev_lock);
++
++      /* Only connect once */
++      if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++          XenbusStateInitialised)
++              goto out;
++
++      err = pcifront_connect(pdev);
++      if (err) {
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error connecting PCI Frontend");
++              goto out;
++      }
++
++      err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
++                         "root_num", "%d", &num_roots);
++      if (err == -ENOENT) {
++              xenbus_dev_error(pdev->xdev, err,
++                               "No PCI Roots found, trying 0000:00");
++              err = pcifront_scan_root(pdev, 0, 0);
++              if (err) {
++                      /*
++                       * Without this check the failure is overwritten by
++                       * the state switch below and the device is reported
++                       * as connected although no root was scanned.
++                       */
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "Error scanning PCI root 0000:00");
++                      goto out;
++              }
++              num_roots = 0;
++      } else if (err != 1) {
++              /* A key that exists but does not parse counts as invalid. */
++              if (err == 0)
++                      err = -EINVAL;
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error reading number of PCI roots");
++              goto out;
++      }
++
++      for (i = 0; i < num_roots; i++) {
++              len = snprintf(str, sizeof(str), "root-%d", i);
++              if (unlikely(len >= (sizeof(str) - 1))) {
++                      err = -ENOMEM;
++                      goto out;
++              }
++
++              err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
++                                 "%x:%x", &domain, &bus);
++              if (err != 2) {
++                      if (err >= 0)
++                              err = -EINVAL;
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "Error reading PCI root %d", i);
++                      goto out;
++              }
++
++              err = pcifront_scan_root(pdev, domain, bus);
++              if (err) {
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "Error scanning PCI root %04x:%02x",
++                                       domain, bus);
++                      goto out;
++              }
++      }
++
++      err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
++
++      out:
++      spin_unlock(&pdev->dev_lock);
++      return err;
++}
++
++/*
++ * React to the backend going away: unless the frontend is already in the
++ * Closing state or beyond, release the PCI roots and the connection (only
++ * when currently Connected) and switch to Closed.
++ *
++ * Returns the result of the state switch, or 0 when nothing was done.
++ */
++static int pcifront_try_disconnect(struct pcifront_device *pdev)
++{
++      enum xenbus_state cur_state;
++      int err = 0;
++
++      spin_lock(&pdev->dev_lock);
++
++      cur_state = xenbus_read_driver_state(pdev->xdev->nodename);
++
++      if (cur_state < XenbusStateClosing) {
++              if (cur_state == XenbusStateConnected) {
++                      pcifront_free_roots(pdev);
++                      pcifront_disconnect(pdev);
++              }
++
++              err = xenbus_switch_state(pdev->xdev, XenbusStateClosed);
++      }
++
++      spin_unlock(&pdev->dev_lock);
++
++      return err;
++}
++
++/*
++ * Second half of a reconfiguration: rescan the PCI roots after the backend
++ * reached Reconfigured.
++ *
++ * Acts only while our own xenbus state is Reconfiguring.  The roots are read
++ * from the backend exactly as in pcifront_try_connect() ("root_num" plus
++ * "root-<n>" keys, falling back to 0000:00 when "root_num" is missing) and
++ * each one is passed to pcifront_rescan_root().  Only when every rescan
++ * succeeds does the frontend switch back to Connected.
++ *
++ * Returns 0 on success and a negative error code otherwise (-EFAULT when the
++ * frontend is not in the Reconfiguring state).
++ */
++static int __devinit pcifront_attach_devices(struct pcifront_device *pdev)
++{
++      int err = -EFAULT;
++      int i, num_roots, len;
++      unsigned int domain, bus;
++      char str[64];
++
++      spin_lock(&pdev->dev_lock);
++
++      if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++          XenbusStateReconfiguring)
++              goto out;
++
++      err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
++                         "root_num", "%d", &num_roots);
++      if (err == -ENOENT) {
++              xenbus_dev_error(pdev->xdev, err,
++                               "No PCI Roots found, trying 0000:00");
++              err = pcifront_rescan_root(pdev, 0, 0);
++              if (err) {
++                      /*
++                       * Do not fall through to the Connected switch when
++                       * the fallback rescan failed.
++                       */
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "Error scanning PCI root 0000:00");
++                      goto out;
++              }
++              num_roots = 0;
++      } else if (err != 1) {
++              if (err == 0)
++                      err = -EINVAL;
++              xenbus_dev_fatal(pdev->xdev, err,
++                               "Error reading number of PCI roots");
++              goto out;
++      }
++
++      for (i = 0; i < num_roots; i++) {
++              len = snprintf(str, sizeof(str), "root-%d", i);
++              if (unlikely(len >= (sizeof(str) - 1))) {
++                      err = -ENOMEM;
++                      goto out;
++              }
++
++              err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
++                                 "%x:%x", &domain, &bus);
++              if (err != 2) {
++                      if (err >= 0)
++                              err = -EINVAL;
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "Error reading PCI root %d", i);
++                      goto out;
++              }
++
++              err = pcifront_rescan_root(pdev, domain, bus);
++              if (err) {
++                      xenbus_dev_fatal(pdev->xdev, err,
++                                       "Error scanning PCI root %04x:%02x",
++                                       domain, bus);
++                      goto out;
++              }
++      }
++
++      /*
++       * Report the outcome of the state switch.  This also keeps the
++       * positive xenbus_scanf() count from leaking out as the return value
++       * when the backend advertises zero roots.
++       */
++      err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
++
++      out:
++      spin_unlock(&pdev->dev_lock);
++      return err;
++}
++
++/*
++ * First half of a reconfiguration: remove the devices the backend is
++ * taking away.
++ *
++ * Acts only while our own xenbus state is Connected.  For each of the
++ * "num_devs" backend entries the "state-<n>" key is read; entries in the
++ * Closing state are looked up through their "vdev-<n>" key
++ * ("<domain>:<bus>:<slot>.<func>" in hex) and removed from the PCI core.
++ * Entries whose bus or device cannot be found are skipped.  Finally the
++ * frontend switches to Reconfiguring.
++ *
++ * Returns 0 when nothing had to be done, the result of the state switch on
++ * completion, or a negative error code.
++ */
++static int pcifront_detach_devices(struct pcifront_device *pdev)
++{
++      struct xenbus_device *xdev = pdev->xdev;
++      struct pci_bus *pbus;
++      struct pci_dev *victim;
++      unsigned int domain, bus, slot, func;
++      char key[64];
++      int idx, num_devs;
++      int err = 0;
++
++      spin_lock(&pdev->dev_lock);
++
++      if (xenbus_read_driver_state(xdev->nodename) !=
++          XenbusStateConnected)
++              goto out;
++
++      err = xenbus_scanf(XBT_NIL, xdev->otherend, "num_devs", "%d",
++                         &num_devs);
++      if (err != 1) {
++              if (err >= 0)
++                      err = -EINVAL;
++              xenbus_dev_fatal(xdev, err,
++                               "Error reading number of PCI devices");
++              goto out;
++      }
++
++      /* Walk all backend entries and drop the ones marked as closing. */
++      for (idx = 0; idx < num_devs; idx++) {
++              int len, dev_state;
++
++              len = snprintf(key, sizeof(key), "state-%d", idx);
++              if (unlikely(len >= (sizeof(key) - 1))) {
++                      err = -ENOMEM;
++                      goto out;
++              }
++              err = xenbus_scanf(XBT_NIL, xdev->otherend, key, "%d",
++                                 &dev_state);
++              /* An unreadable state is treated as "unknown", not fatal. */
++              if (err != 1)
++                      dev_state = XenbusStateUnknown;
++
++              if (dev_state != XenbusStateClosing)
++                      continue;
++
++              /* This entry is going away: find out which device it is. */
++              len = snprintf(key, sizeof(key), "vdev-%d", idx);
++              if (unlikely(len >= (sizeof(key) - 1))) {
++                      err = -ENOMEM;
++                      goto out;
++              }
++              err = xenbus_scanf(XBT_NIL, xdev->otherend, key,
++                                 "%x:%x:%x.%x", &domain, &bus, &slot, &func);
++              if (err != 4) {
++                      if (err >= 0)
++                              err = -EINVAL;
++                      xenbus_dev_fatal(xdev, err,
++                                       "Error reading PCI device %d", idx);
++                      goto out;
++              }
++
++              pbus = pci_find_bus(domain, bus);
++              if (pbus == NULL) {
++                      dev_dbg(&xdev->dev, "Cannot get bus %04x:%02x\n",
++                              domain, bus);
++                      continue;
++              }
++
++              victim = pci_get_slot(pbus, PCI_DEVFN(slot, func));
++              if (victim == NULL) {
++                      dev_dbg(&xdev->dev,
++                              "Cannot get PCI device %04x:%02x:%02x.%02x\n",
++                              domain, bus, slot, func);
++                      continue;
++              }
++
++              pci_remove_bus_device(victim);
++              /* Drop the reference taken by pci_get_slot(). */
++              pci_dev_put(victim);
++
++              dev_dbg(&xdev->dev,
++                      "PCI device %04x:%02x:%02x.%02x removed.\n",
++                      domain, bus, slot, func);
++      }
++
++      err = xenbus_switch_state(xdev, XenbusStateReconfiguring);
++
++      out:
++      spin_unlock(&pdev->dev_lock);
++      return err;
++}
++
++/*
++ * xenbus callback invoked when the backend changes state; dispatches to the
++ * matching frontend transition.  Return values of the helpers are ignored.
++ */
++static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev,
++                                                enum xenbus_state be_state)
++{
++      struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
++
++      switch (be_state) {
++      case XenbusStateConnected:
++              pcifront_try_connect(pdev);
++              break;
++
++      case XenbusStateReconfiguring:
++              pcifront_detach_devices(pdev);
++              break;
++
++      case XenbusStateReconfigured:
++              pcifront_attach_devices(pdev);
++              break;
++
++      case XenbusStateClosing:
++              dev_warn(&xdev->dev, "backend going away!\n");
++              pcifront_try_disconnect(pdev);
++              break;
++
++      /* Backend states that need no action from the frontend. */
++      case XenbusStateUnknown:
++      case XenbusStateInitialising:
++      case XenbusStateInitWait:
++      case XenbusStateInitialised:
++      case XenbusStateClosed:
++              break;
++      }
++}
++
++/*
++ * xenbus probe: allocate the frontend state for @xdev and publish it to the
++ * backend.  The state is freed again if publishing fails.
++ *
++ * Returns 0 on success, -ENOMEM when the allocation fails, or the error
++ * from pcifront_publish_info().
++ */
++static int pcifront_xenbus_probe(struct xenbus_device *xdev,
++                               const struct xenbus_device_id *id)
++{
++      struct pcifront_device *pdev;
++      int err;
++
++      pdev = alloc_pdev(xdev);
++      if (pdev == NULL) {
++              xenbus_dev_fatal(xdev, -ENOMEM,
++                               "Error allocating pcifront_device struct");
++              return -ENOMEM;
++      }
++
++      err = pcifront_publish_info(pdev);
++      if (err)
++              free_pdev(pdev);
++
++      return err;
++}
++
++/*
++ * xenbus remove: free the frontend state attached to @xdev, if any.
++ * Always returns 0.
++ */
++static int pcifront_xenbus_remove(struct xenbus_device *xdev)
++{
++      struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
++
++      if (pdev)
++              free_pdev(pdev);
++
++      return 0;
++}
++
++/*
++ * Device types handled by this driver: "pci", followed by an all-zero
++ * terminating entry.  The module alias below names the same type.
++ */
++static const struct xenbus_device_id xenpci_ids[] = {
++      {"pci"},
++      {{0}},
++};
++MODULE_ALIAS("xen:pci");
++
++/*
++ * xenbus driver description: ties the "pci" id table to the probe/remove
++ * handlers and to the backend state-change callback defined in this file.
++ */
++static struct xenbus_driver xenbus_pcifront_driver = {
++      .name                   = "pcifront",
++      .ids                    = xenpci_ids,
++      .probe                  = pcifront_xenbus_probe,
++      .remove                 = pcifront_xenbus_remove,
++      .otherend_changed       = pcifront_backend_changed,
++};
++
++/*
++ * Register the frontend driver with xenbus.  Returns -ENODEV when not
++ * running on Xen, otherwise the result of the registration.
++ */
++static int __init pcifront_init(void)
++{
++      int rc = -ENODEV;
++
++      if (is_running_on_xen())
++              rc = xenbus_register_frontend(&xenbus_pcifront_driver);
++
++      return rc;
++}
++
++/* Initialize after the Xen PCI Frontend Stub is initialized */
++subsys_initcall(pcifront_init);
diff --cc drivers/xen/privcmd/Makefile

index 0000000,0000000..507245a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/privcmd/Makefile
@@@ -1,0 -1,0 +1,3 @@@
++# privcmd.o is always built.  compat_privcmd.o is collected in priv-y only
++# when CONFIG_COMPAT=y and is then appended to
++# obj-$(CONFIG_XEN_PRIVILEGED_GUEST).
++priv-$(CONFIG_COMPAT) := compat_privcmd.o
++obj-y := privcmd.o
++obj-$(CONFIG_XEN_PRIVILEGED_GUEST) += $(priv-y)
diff --cc drivers/xen/privcmd/compat_privcmd.c

index 0000000,0000000..3e5c077

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/privcmd/compat_privcmd.c
@@@ -1,0 -1,0 +1,140 @@@
++/*
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
++ *
++ * Copyright (C) IBM Corp. 2006
++ *
++ * Authors: Jimi Xenidis <jimix@watson.ibm.com>
++ */
++
++#include <linux/compat.h>
++#include <linux/ioctl.h>
++#include <linux/syscalls.h>
++#include <asm/hypervisor.h>
++#include <asm/uaccess.h>
++#include <xen/public/privcmd.h>
++#include <xen/compat_ioctl.h>
++
++/*
++ * Translate the 32-bit layouts of the privcmd mmap ioctls.
++ *
++ * For each supported command the 32-bit argument structure is read from
++ * @arg, a native structure is built in user space obtained from
++ * compat_alloc_user_space(), and the native ioctl is issued on @fd through
++ * sys_ioctl().
++ *
++ * Returns the result of the native ioctl, -EFAULT when a user-space access
++ * fails, -EINVAL for an unknown @cmd and, only when xen_pfn32_t is defined,
++ * -ERANGE when a frame number written back does not fit in xen_pfn32_t.
++ */
++int privcmd_ioctl_32(int fd, unsigned int cmd, void __user *arg)
++{
++      int ret;
++
++      switch (cmd) {
++      /* num and dom are copied; the entry pointer goes through compat_ptr(). */
++      case IOCTL_PRIVCMD_MMAP_32: {
++              struct privcmd_mmap __user *p;
++              struct privcmd_mmap_32 __user *p32 = arg;
++              struct privcmd_mmap_32 n32;
++
++              p = compat_alloc_user_space(sizeof(*p));
++              if (copy_from_user(&n32, p32, sizeof(n32)) ||
++                  put_user(n32.num, &p->num) ||
++                  put_user(n32.dom, &p->dom) ||
++                  put_user(compat_ptr(n32.entry), &p->entry))
++                      return -EFAULT;
++              
++              ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAP, (unsigned long)p);
++      }
++              break;
++      /*
++       * With xen_pfn32_t defined the frame array is converted element by
++       * element into a native array before the call and, if the call
++       * returned 0, converted back afterwards.  Otherwise the caller's
++       * array pointer is passed on through compat_ptr().
++       */
++      case IOCTL_PRIVCMD_MMAPBATCH_32: {
++              struct privcmd_mmapbatch __user *p;
++              struct privcmd_mmapbatch_32 __user *p32 = arg;
++              struct privcmd_mmapbatch_32 n32;
++#ifdef xen_pfn32_t
++              xen_pfn_t *__user arr;
++              xen_pfn32_t *__user arr32;
++              unsigned int i;
++#endif
++
++              p = compat_alloc_user_space(sizeof(*p));
++              if (copy_from_user(&n32, p32, sizeof(n32)) ||
++                  put_user(n32.num, &p->num) ||
++                  put_user(n32.dom, &p->dom) ||
++                  put_user(n32.addr, &p->addr))
++                      return -EFAULT;
++#ifdef xen_pfn32_t
++              /*
++               * The request covers the array plus sizeof(*p).
++               * NOTE(review): presumably so the array ends up below the
++               * structure allocated above - confirm against
++               * compat_alloc_user_space().
++               */
++              arr = compat_alloc_user_space(n32.num * sizeof(*arr)
++                                            + sizeof(*p));
++              arr32 = compat_ptr(n32.arr);
++              for (i = 0; i < n32.num; ++i) {
++                      xen_pfn32_t mfn;
++
++                      if (get_user(mfn, arr32 + i) || put_user(mfn, arr + i))
++                              return -EFAULT;
++              }
++
++              if (put_user(arr, &p->arr))
++                      return -EFAULT;
++#else
++              if (put_user(compat_ptr(n32.arr), &p->arr))
++                      return -EFAULT;
++#endif
++              
++              ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH, (unsigned long)p);
++
++#ifdef xen_pfn32_t
++              /* Copy the entries back; stops at the first error. */
++              for (i = 0; !ret && i < n32.num; ++i) {
++                      xen_pfn_t mfn;
++
++                      if (get_user(mfn, arr + i) || put_user(mfn, arr32 + i))
++                              ret = -EFAULT;
++                      else if (mfn != (xen_pfn32_t)mfn)
++                              ret = -ERANGE;
++              }
++#endif
++      }
++              break;
++      /*
++       * Same conversion as above for the V2 layout, which additionally
++       * carries an err pointer; the frame array is not copied back.
++       */
++      case IOCTL_PRIVCMD_MMAPBATCH_V2_32: {
++              struct privcmd_mmapbatch_v2 __user *p;
++              struct privcmd_mmapbatch_v2_32 __user *p32 = arg;
++              struct privcmd_mmapbatch_v2_32 n32;
++#ifdef xen_pfn32_t
++              xen_pfn_t *__user arr;
++              const xen_pfn32_t *__user arr32;
++              unsigned int i;
++#endif
++
++              p = compat_alloc_user_space(sizeof(*p));
++              if (copy_from_user(&n32, p32, sizeof(n32)) ||
++                  put_user(n32.num, &p->num) ||
++                  put_user(n32.dom, &p->dom) ||
++                  put_user(n32.addr, &p->addr) ||
++                  put_user(compat_ptr(n32.err), &p->err))
++                      return -EFAULT;
++#ifdef xen_pfn32_t
++              arr = compat_alloc_user_space(n32.num * sizeof(*arr)
++                                            + sizeof(*p));
++              arr32 = compat_ptr(n32.arr);
++              for (i = 0; i < n32.num; ++i) {
++                      xen_pfn32_t mfn;
++
++                      if (get_user(mfn, arr32 + i) || put_user(mfn, arr + i))
++                              return -EFAULT;
++              }
++
++              if (put_user(arr, &p->arr))
++                      return -EFAULT;
++#else
++              if (put_user(compat_ptr(n32.arr), &p->arr))
++                      return -EFAULT;
++#endif
++
++              ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH_V2, (unsigned long)p);
++      }
++              break;
++      default:
++              ret = -EINVAL;
++              break;
++      }
++      return ret;
++}
diff --cc drivers/xen/privcmd/privcmd.c

index 0000000,0000000..b4fd083

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/privcmd/privcmd.c
@@@ -1,0 -1,0 +1,475 @@@
++/******************************************************************************
++ * privcmd.c
++ * 
++ * Interface to privileged domain-0 commands.
++ * 
++ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
++ */
++
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/string.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/swap.h>
++#include <linux/highmem.h>
++#include <linux/pagemap.h>
++#include <linux/seq_file.h>
++#include <asm/hypervisor.h>
++
++#include <asm/pgalloc.h>
++#include <asm/pgtable.h>
++#include <asm/uaccess.h>
++#include <asm/tlb.h>
++#include <asm/hypervisor.h>
++#include <xen/public/privcmd.h>
++#include <xen/interface/xen.h>
++#include <xen/xen_proc.h>
++#include <xen/features.h>
++
++static struct proc_dir_entry *privcmd_intf;
++static struct proc_dir_entry *capabilities_intf;
++
++#ifndef CONFIG_XEN_PRIVILEGED_GUEST
++#define HAVE_ARCH_PRIVCMD_MMAP
++#endif
++#ifndef HAVE_ARCH_PRIVCMD_MMAP
++/*
++ * Per-PTE callback handed to apply_to_page_range(): reports -EBUSY for a
++ * PTE that is already populated and 0 for an empty one.
++ */
++static int enforce_singleshot_mapping_fn(pte_t *pte, struct page *pmd_page,
++                                       unsigned long addr, void *data)
++{
++      if (!pte_none(*pte))
++              return -EBUSY;
++
++      return 0;
++}
++
++/*
++ * Check that none of the @npages pages starting at @addr in @vma's address
++ * space is mapped yet.  Returns non-zero when the whole range is still
++ * empty and 0 otherwise.
++ */
++static inline int enforce_singleshot_mapping(struct vm_area_struct *vma,
++                                           unsigned long addr,
++                                           unsigned long npages)
++{
++      int rc;
++
++      rc = apply_to_page_range(vma->vm_mm, addr, npages << PAGE_SHIFT,
++                               enforce_singleshot_mapping_fn, NULL);
++
++      return rc == 0;
++}
++#else
++#define enforce_singleshot_mapping(vma, addr, npages) \
++      privcmd_enforce_singleshot_mapping(vma)
++#endif
++
++static long privcmd_ioctl(struct file *file,
++                        unsigned int cmd, unsigned long data)
++{
++      long ret;
++      void __user *udata = (void __user *) data;
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++      unsigned long i, addr, nr, nr_pages;
++      int paged_out;
++      struct mm_struct *mm = current->mm;
++      struct vm_area_struct *vma;
++      LIST_HEAD(pagelist);
++      struct list_head *l, *l2;
++#endif
++
++      switch (cmd) {
++      case IOCTL_PRIVCMD_HYPERCALL: {
++              privcmd_hypercall_t hypercall;
++  
++              if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
++                      return -EFAULT;
++
++#ifdef CONFIG_X86
++              ret = -ENOSYS;
++              if (hypercall.op >= (PAGE_SIZE >> 5))
++                      break;
++              ret = _hypercall(long, (unsigned int)hypercall.op,
++                               (unsigned long)hypercall.arg[0],
++                               (unsigned long)hypercall.arg[1],
++                               (unsigned long)hypercall.arg[2],
++                               (unsigned long)hypercall.arg[3],
++                               (unsigned long)hypercall.arg[4]);
++#else
++              ret = privcmd_hypercall(&hypercall);
++#endif
++      }
++      break;
++
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++
++      case IOCTL_PRIVCMD_MMAP: {
++#define MMAP_NR_PER_PAGE \
++      (unsigned long)((PAGE_SIZE - sizeof(*l)) / sizeof(*msg))
++              privcmd_mmap_t mmapcmd;
++              privcmd_mmap_entry_t *msg;
++              privcmd_mmap_entry_t __user *p;
++
++              if (!is_initial_xendomain())
++                      return -EPERM;
++
++              if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
++                      return -EFAULT;
++
++              if (mmapcmd.num <= 0)
++                      return -EINVAL;
++
++              p = mmapcmd.entry;
++              for (i = 0; i < mmapcmd.num;) {
++                      nr = min(mmapcmd.num - i, MMAP_NR_PER_PAGE);
++
++                      ret = -ENOMEM;
++                      l = (struct list_head *) __get_free_page(GFP_KERNEL);
++                      if (l == NULL)
++                              goto mmap_out;
++
++                      INIT_LIST_HEAD(l);
++                      list_add_tail(l, &pagelist);
++                      msg = (privcmd_mmap_entry_t*)(l + 1);
++
++                      ret = -EFAULT;
++                      if (copy_from_user(msg, p, nr*sizeof(*msg)))
++                              goto mmap_out;
++                      i += nr;
++                      p += nr;
++              }
++
++              l = pagelist.next;
++              msg = (privcmd_mmap_entry_t*)(l + 1);
++
++              down_write(&mm->mmap_sem);
++
++              vma = find_vma(mm, msg->va);
++              ret = -EINVAL;
++              if (!vma || (msg->va != vma->vm_start))
++                      goto mmap_out;
++
++              addr = vma->vm_start;
++
++              i = 0;
++              list_for_each(l, &pagelist) {
++                      nr = i + min(mmapcmd.num - i, MMAP_NR_PER_PAGE);
++
++                      msg = (privcmd_mmap_entry_t*)(l + 1);
++                      while (i<nr) {
++
++                              /* Do not allow range to wrap the address space. */
++                              if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
++                                  (((unsigned long)msg->npages << PAGE_SHIFT) >= -addr))
++                                      goto mmap_out;
++
++                              /* Range chunks must be contiguous in va space. */
++                              if ((msg->va != addr) ||
++                                  ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
++                                      goto mmap_out;
++
++                              addr += msg->npages << PAGE_SHIFT;
++                              msg++;
++                              i++;
++                      }
++              }
++
++              if (!enforce_singleshot_mapping(vma, vma->vm_start,
++                                              (addr - vma->vm_start) >> PAGE_SHIFT))
++                      goto mmap_out;
++
++              addr = vma->vm_start;
++              i = 0;
++              list_for_each(l, &pagelist) {
++                      nr = i + min(mmapcmd.num - i, MMAP_NR_PER_PAGE);
++
++                      msg = (privcmd_mmap_entry_t*)(l + 1);
++                      while (i < nr) {
++                              if ((ret = direct_remap_pfn_range(
++                                           vma,
++                                           msg->va & PAGE_MASK,
++                                           msg->mfn,
++                                           msg->npages << PAGE_SHIFT,
++                                           vma->vm_page_prot,
++                                           mmapcmd.dom)) < 0)
++                                      goto mmap_out;
++
++                              addr += msg->npages << PAGE_SHIFT;
++                              msg++;
++                              i++;
++                      }
++              }
++
++              ret = 0;
++
++      mmap_out:
++              up_write(&mm->mmap_sem);
++              list_for_each_safe(l,l2,&pagelist)
++                      free_page((unsigned long)l);
++      }
++#undef MMAP_NR_PER_PAGE
++      break;
++
++      case IOCTL_PRIVCMD_MMAPBATCH: {
++#define MMAPBATCH_NR_PER_PAGE \
++      (unsigned long)((PAGE_SIZE - sizeof(*l)) / sizeof(*mfn))
++              privcmd_mmapbatch_t m;
++              xen_pfn_t __user *p;
++              xen_pfn_t *mfn;
++
++              if (!is_initial_xendomain())
++                      return -EPERM;
++
++              if (copy_from_user(&m, udata, sizeof(m)))
++                      return -EFAULT;
++
++              nr_pages = m.num;
++              addr = m.addr;
++              if (m.num <= 0 || nr_pages > (LONG_MAX >> PAGE_SHIFT) ||
++                  addr != m.addr || nr_pages > (-addr >> PAGE_SHIFT))
++                      return -EINVAL;
++
++              p = m.arr;
++              for (i=0; i<nr_pages; ) {
++                      nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
++
++                      ret = -ENOMEM;
++                      l = (struct list_head *)__get_free_page(GFP_KERNEL);
++                      if (l == NULL)
++                              goto mmapbatch_out;
++
++                      INIT_LIST_HEAD(l);
++                      list_add_tail(l, &pagelist);
++
++                      mfn = (unsigned long*)(l + 1);
++                      ret = -EFAULT;
++                      if (copy_from_user(mfn, p, nr*sizeof(*mfn)))
++                              goto mmapbatch_out;
++
++                      i += nr; p+= nr;
++              }
++
++              down_write(&mm->mmap_sem);
++
++              vma = find_vma(mm, addr);
++              ret = -EINVAL;
++              if (!vma ||
++                  addr < vma->vm_start ||
++                  addr + (nr_pages << PAGE_SHIFT) > vma->vm_end ||
++                  !enforce_singleshot_mapping(vma, addr, nr_pages)) {
++                      up_write(&mm->mmap_sem);
++                      goto mmapbatch_out;
++              }
++
++              i = 0;
++              ret = 0;
++              paged_out = 0;
++              list_for_each(l, &pagelist) {
++                      nr = i + min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
++                      mfn = (unsigned long *)(l + 1);
++
++                      while (i<nr) {
++                              int rc;
++
++                              rc = direct_remap_pfn_range(vma, addr & PAGE_MASK,
++                                                          *mfn, PAGE_SIZE,
++                                                          vma->vm_page_prot, m.dom);
++                              if(rc < 0) {
++                                      if (rc == -ENOENT)
++                                      {
++                                              *mfn |= 0x80000000U;
++                                              paged_out = 1;
++                                      }
++                                      else
++                                              *mfn |= 0xf0000000U;
++                                      ret++;
++                              }
++                              mfn++; i++; addr += PAGE_SIZE;
++                      }
++              }
++
++              up_write(&mm->mmap_sem);
++              if (ret > 0) {
++                      p = m.arr;
++                      i = 0;
++                      if (paged_out)
++                              ret = -ENOENT;
++                      else
++                              ret = 0;
++                      list_for_each(l, &pagelist) {
++                              nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
++                              mfn = (unsigned long *)(l + 1);
++                              if (copy_to_user(p, mfn, nr*sizeof(*mfn)))
++                                      ret = -EFAULT;
++                              i += nr; p += nr;
++                      }
++              }
++      mmapbatch_out:
++              list_for_each_safe(l,l2,&pagelist)
++                      free_page((unsigned long)l);
++      }
++      break;
++
++      case IOCTL_PRIVCMD_MMAPBATCH_V2: {
++              privcmd_mmapbatch_v2_t m;
++              const xen_pfn_t __user *p;
++              xen_pfn_t *mfn;
++              int *err;
++
++              if (!is_initial_xendomain())
++                      return -EPERM;
++
++              if (copy_from_user(&m, udata, sizeof(m)))
++                      return -EFAULT;
++
++              nr_pages = m.num;
++              addr = m.addr;
++              if (m.num <= 0 || nr_pages > (ULONG_MAX >> PAGE_SHIFT) ||
++                  addr != m.addr || nr_pages > (-addr >> PAGE_SHIFT))
++                      return -EINVAL;
++
++              p = m.arr;
++              for (i = 0; i < nr_pages; i += nr, p += nr) {
++                      nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
++
++                      ret = -ENOMEM;
++                      l = (struct list_head *)__get_free_page(GFP_KERNEL);
++                      if (l == NULL)
++                              goto mmapbatch_v2_out;
++
++                      INIT_LIST_HEAD(l);
++                      list_add_tail(l, &pagelist);
++
++                      mfn = (void *)(l + 1);
++                      ret = -EFAULT;
++                      if (copy_from_user(mfn, p, nr * sizeof(*mfn)))
++                              goto mmapbatch_v2_out;
++              }
++
++              down_write(&mm->mmap_sem);
++
++              vma = find_vma(mm, addr);
++              ret = -EINVAL;
++              if (!vma ||
++                  addr < vma->vm_start ||
++                  addr + (nr_pages << PAGE_SHIFT) > vma->vm_end ||
++                  !enforce_singleshot_mapping(vma, addr, nr_pages)) {
++                      up_write(&mm->mmap_sem);
++                      goto mmapbatch_v2_out;
++              }
++
++              i = 0;
++              ret = 0;
++              paged_out = 0;
++              list_for_each(l, &pagelist) {
++                      nr = i + min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
++                      mfn = (void *)(l + 1);
++                      err = (void *)(l + 1);
++                      BUILD_BUG_ON(sizeof(*err) > sizeof(*mfn));
++
++                      while (i < nr) {
++                              int rc;
++
++                              rc = direct_remap_pfn_range(vma, addr & PAGE_MASK,
++                                                          *mfn, PAGE_SIZE,
++                                                          vma->vm_page_prot, m.dom);
++                              if (rc < 0) {
++                                      if (rc == -ENOENT)
++                                              paged_out = 1;
++                                      ret++;
++                              } else
++                                      BUG_ON(rc > 0);
++                              *err++ = rc;
++                              mfn++; i++; addr += PAGE_SIZE;
++                      }
++              }
++
++              up_write(&mm->mmap_sem);
++
++              if (ret > 0) {
++                      int __user *p = m.err;
++
++                      ret = paged_out ? -ENOENT : 0;
++                      i = 0;
++                      list_for_each(l, &pagelist) {
++                              nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
++                              err = (void *)(l + 1);
++                              if (copy_to_user(p, err, nr * sizeof(*err)))
++                                      ret = -EFAULT;
++                              i += nr; p += nr;
++                      }
++              } else if (clear_user(m.err, nr_pages * sizeof(*m.err)))
++                      ret = -EFAULT;
++
++      mmapbatch_v2_out:
++              list_for_each_safe(l, l2, &pagelist)
++                      free_page((unsigned long)l);
++#undef MMAPBATCH_NR_PER_PAGE
++      }
++      break;
++
++#endif /* CONFIG_XEN_PRIVILEGED_GUEST */
++
++      default:
++              ret = -EINVAL;
++              break;
++      }
++
++      return ret;
++}
++
++#ifndef HAVE_ARCH_PRIVCMD_MMAP
++/*
++ * .fault handler for privcmd mappings: it never supplies a page and answers
++ * every fault with VM_FAULT_SIGBUS.
++ */
++static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++      return VM_FAULT_SIGBUS;
++}
++
++/* VM operations for privcmd VMAs; only a fault handler is provided. */
++static struct vm_operations_struct privcmd_vm_ops = {
++      .fault = privcmd_fault
++};
++
++/*
++ * mmap handler for the privcmd file.
++ *
++ * Only prepares the VMA: sets VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTCOPY,
++ * attaches privcmd_vm_ops and clears vm_private_data.  No pages are mapped
++ * here.
++ *
++ * Returns 0 on success, or -ENOSYS when the guest has the
++ * XENFEAT_auto_translated_physmap feature.
++ */
++static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
++{
++      /* Unsupported for auto-translate guests. */
++      if (xen_feature(XENFEAT_auto_translated_physmap))
++              return -ENOSYS;
++
++      /* DONTCOPY is essential for Xen because copy_page_range doesn't know
++       * how to recreate these mappings */
++      vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTCOPY;
++      vma->vm_ops = &privcmd_vm_ops;
++      vma->vm_private_data = NULL;
++
++      return 0;
++}
++#endif
++
++/*
++ * File operations of the privcmd interface: non-seekable, driven through
++ * privcmd_ioctl().  The .mmap handler is only compiled in when
++ * CONFIG_XEN_PRIVILEGED_GUEST is set.
++ */
++static const struct file_operations privcmd_file_ops = {
++      .open = nonseekable_open,
++      .llseek = no_llseek,
++      .unlocked_ioctl = privcmd_ioctl,
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++      .mmap = privcmd_mmap,
++#endif
++};
++
++/*
++ * read_proc callback that reports the domain's capabilities.
++ *
++ * @page is first made an empty string; when is_initial_xendomain() is true
++ * the text "control_d\n" is written into it.  @start, @off, @count and
++ * @data are not used.  End-of-file is always signalled through @eof.
++ *
++ * Returns the number of characters placed in @page (0 when it stays empty).
++ */
++static int capabilities_read(char *page, char **start, off_t off,
++                           int count, int *eof, void *data)
++{
++      int written;
++
++      page[0] = 0;
++      written = is_initial_xendomain() ? sprintf(page, "control_d\n") : 0;
++      *eof = 1;
++
++      return written;
++}
++
++/*
++ * Initialisation: create the "privcmd" and "capabilities" Xen proc entries
++ * (both mode 0400) and attach their handlers.
++ *
++ * Returns -ENODEV when not running on Xen, 0 otherwise.  A proc entry that
++ * could not be created is not treated as an error: its handler assignment
++ * is skipped and 0 is still returned.
++ */
++static int __init privcmd_init(void)
++{
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      privcmd_intf = create_xen_proc_entry("privcmd", 0400);
++      if (privcmd_intf != NULL)
++              privcmd_intf->proc_fops = &privcmd_file_ops;
++
++      capabilities_intf = create_xen_proc_entry("capabilities", 0400 );
++      if (capabilities_intf != NULL)
++              capabilities_intf->read_proc = capabilities_read;
++
++      return 0;
++}
++
++__initcall(privcmd_init);
diff --cc drivers/xen/scsiback/Makefile

index 0000000,0000000..56271df

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsiback/Makefile
@@@ -1,0 -1,0 +1,4 @@@
++# Build xen-scsibk according to CONFIG_XEN_SCSI_BACKEND.
++obj-$(CONFIG_XEN_SCSI_BACKEND) := xen-scsibk.o
++
++# Objects that make up xen-scsibk.
++xen-scsibk-y  := interface.o scsiback.o xenbus.o translate.o emulate.o
++
diff --cc drivers/xen/scsiback/common.h

index 0000000,0000000..1519f89

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsiback/common.h
@@@ -1,0 -1,0 +1,175 @@@
++/*
++ * Copyright (c) 2008, FUJITSU Limited
++ *
++ * Based on the blkback driver code.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __SCSIIF__BACKEND__COMMON_H__
++#define __SCSIIF__BACKEND__COMMON_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
++#include <linux/kthread.h>
++#include <linux/blkdev.h>
++#include <linux/list.h>
++#include <linux/kthread.h>
++#include <scsi/scsi.h>
++#include <scsi/scsi_cmnd.h>
++#include <scsi/scsi_host.h>
++#include <scsi/scsi_device.h>
++#include <scsi/scsi_dbg.h>
++#include <scsi/scsi_eh.h>
++#include <asm/hypervisor.h>
++#include <xen/xenbus.h>
++#include <xen/interface/io/ring.h>
++#include <xen/interface/io/vscsiif.h>
++
++
++/* Debug output through pr_debug(), prefixed with source file and line. */
++#define DPRINTK(_f, _a...)                    \
++      pr_debug("(file=%s, line=%d) " _f,      \
++               __FILE__ , __LINE__ , ## _a )
++
++/* A SCSI address expressed as host, channel, target and LUN numbers. */
++struct ids_tuple {
++      unsigned int hst;               /* host    */
++      unsigned int chn;               /* channel */
++      unsigned int tgt;               /* target  */
++      unsigned int lun;               /* LUN     */
++};
++
++/*
++ * One entry of the translation table: maps the address tuple @v to the
++ * SCSI device @sdev.  Entries are chained through @l on
++ * vscsibk_info.v2p_entry_lists.
++ */
++struct v2p_entry {
++      struct ids_tuple v;             /* translate from */
++      struct scsi_device *sdev;       /* translate to   */
++      struct list_head l;
++};
++
++/*
++ * State of one scsiback instance.
++ * NOTE(review): presumably one instance per connected frontend domain -
++ * confirm against the xenbus code.
++ */
++struct vscsibk_info {
++      struct xenbus_device *dev;
++
++      domid_t domid;
++      unsigned int evtchn;
++      unsigned int irq;
++
++      int feature;
++
++      struct vscsiif_back_ring  ring;
++      struct vm_struct *ring_area;
++
++      spinlock_t ring_lock;
++      atomic_t nr_unreplied_reqs;     /* see scsiback_get()/scsiback_put() */
++
++      spinlock_t v2p_lock;            /* held while walking v2p_entry_lists */
++      struct list_head v2p_entry_lists;       /* list of struct v2p_entry */
++
++      struct task_struct *kthread;
++      wait_queue_head_t waiting_to_free;      /* woken by scsiback_put() */
++      wait_queue_head_t wq;
++      unsigned int waiting_reqs;
++      struct page **mmap_pages;
++
++};
++
++/*
++ * One request being processed by the backend.
++ *
++ * cmnd[0] is the SCSI opcode used to select the emulation hooks; rslt,
++ * resid and sense_buffer carry the completion status that is handed to
++ * scsiback_do_resp_with_sense().
++ */
++typedef struct {
++      unsigned char act;
++      struct vscsibk_info *info;
++      struct scsi_device *sdev;
++
++      uint16_t rqid;
++      
++      uint16_t v_chn, v_tgt;  /* channel/target looked up in the v2p table */
++
++      uint8_t nr_segments;    /* number of entries in sgl */
++      uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; /* CDB, opcode in cmnd[0] */
++      uint8_t cmd_len;
++
++      uint8_t sc_data_direction;
++      uint16_t timeout_per_command;
++      
++      uint32_t request_bufflen;
++      struct scatterlist *sgl;        /* data buffer of the request */
++      grant_ref_t gref[VSCSIIF_SG_TABLESIZE];
++
++      int32_t rslt;
++      uint32_t resid;
++      uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
++
++      struct list_head free_list;
++} pending_req_t;
++
++
++
++/*
++ * Counting of outstanding requests on a vscsibk_info:
++ * scsiback_get() increments nr_unreplied_reqs; scsiback_put() decrements it
++ * and wakes up waiting_to_free once the count has dropped to zero.
++ */
++#define scsiback_get(_b) (atomic_inc(&(_b)->nr_unreplied_reqs))
++#define scsiback_put(_b)                              \
++      do {                                            \
++              if (atomic_dec_and_test(&(_b)->nr_unreplied_reqs))      \
++                      wake_up(&(_b)->waiting_to_free);\
++      } while (0)
++
++#define VSCSIIF_TIMEOUT               (900*HZ)
++
++#define VSCSI_TYPE_HOST               1
++
++irqreturn_t scsiback_intr(int, void *);
++int scsiback_init_sring(struct vscsibk_info *, grant_ref_t, evtchn_port_t);
++int scsiback_schedule(void *data);
++
++
++struct vscsibk_info *vscsibk_info_alloc(domid_t domid);
++void scsiback_free(struct vscsibk_info *info);
++void scsiback_disconnect(struct vscsibk_info *);
++int __init scsiback_interface_init(void);
++void scsiback_interface_exit(void);
++int scsiback_xenbus_init(void);
++void scsiback_xenbus_unregister(void);
++
++void scsiback_init_translation_table(struct vscsibk_info *info);
++
++int scsiback_add_translation_entry(struct vscsibk_info *info,
++                      struct scsi_device *sdev, struct ids_tuple *v);
++
++int scsiback_del_translation_entry(struct vscsibk_info *info,
++                              struct ids_tuple *v);
++struct scsi_device *scsiback_do_translation(struct vscsibk_info *info,
++                      struct ids_tuple *v);
++void scsiback_release_translation_entry(struct vscsibk_info *info);
++
++
++void scsiback_cmd_exec(pending_req_t *pending_req);
++void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result,
++                      uint32_t resid, pending_req_t *pending_req);
++void scsiback_fast_flush_area(pending_req_t *req);
++
++void scsiback_rsp_emulation(pending_req_t *pending_req);
++void scsiback_req_emulation_or_cmdexec(pending_req_t *pending_req);
++void scsiback_emulation_init(void);
++
++
++#endif /* __SCSIIF__BACKEND__COMMON_H__ */
diff --cc drivers/xen/scsiback/emulate.c

index 0000000,0000000..fb4f7b1

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsiback/emulate.c
@@@ -1,0 -1,0 +1,479 @@@
++/*
++ * Xen SCSI backend driver
++ *
++ * Copyright (c) 2008, FUJITSU Limited
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++/*
++* Patched to support >2TB drives + allow tape & autoloader operations
++* 2010, Samuel Kvasnica, IMS Nanofabrication AG
++*/
++
++#include <scsi/scsi.h>
++#include <scsi/scsi_cmnd.h>
++#include <scsi/scsi_device.h>
++#include "common.h"
++
++/* Following SCSI commands are not defined in scsi/scsi.h */
++#define EXTENDED_COPY         0x83    /* EXTENDED COPY command        */
++#define REPORT_ALIASES                0xa3    /* REPORT ALIASES command       */
++#define CHANGE_ALIASES                0xa4    /* CHANGE ALIASES command       */
++#define SET_PRIORITY          0xa4    /* SET PRIORITY command         */
++
++
++/*
++  The bitmap in order to control emulation.
++  (Bit 3 to 7 are reserved for future use.)
++*/
++#define VSCSIIF_NEED_CMD_EXEC         0x01    /* If this bit is set, cmd exec */
++                                              /* is required.                 */
++#define VSCSIIF_NEED_EMULATE_REQBUF   0x02    /* If this bit is set, the      */
++                                              /* request buffer is emulated   */
++                                              /* before cmd exec.             */
++#define VSCSIIF_NEED_EMULATE_RSPBUF   0x04    /* If this bit is set, the      */
++                                              /* response buffer is emulated  */
++                                              /* after cmd exec.              */
++
++/* Additional Sense Code (ASC) used */
++#define NO_ADDITIONAL_SENSE           0x0
++#define LOGICAL_UNIT_NOT_READY                0x4
++#define UNRECOVERED_READ_ERR          0x11
++#define PARAMETER_LIST_LENGTH_ERR     0x1a
++#define INVALID_OPCODE                        0x20
++#define ADDR_OUT_OF_RANGE             0x21
++#define INVALID_FIELD_IN_CDB          0x24
++#define INVALID_FIELD_IN_PARAM_LIST   0x26
++#define POWERON_RESET                 0x29
++#define SAVING_PARAMS_UNSUP           0x39
++#define THRESHOLD_EXCEEDED            0x5d
++#define LOW_POWER_COND_ON             0x5e
++
++
++
++/* Number of SCSI op_codes (the op_code is a single byte) */
++#define VSCSI_MAX_SCSI_OP_CODE                256
++/* Per-op_code emulation flags (VSCSIIF_NEED_*), indexed by cmnd[0]. */
++static unsigned char bitmap[VSCSI_MAX_SCSI_OP_CODE];
++
++/*
++ * Mark op_code @cmd as passed straight through: only
++ * VSCSIIF_NEED_CMD_EXEC is set and both emulation hooks are cleared.
++ *
++ * The body is wrapped in do { } while (0) so that the three assignments
++ * always behave as one statement, e.g. as the body of an unbraced if/else.
++ */
++#define NO_EMULATE(cmd) \
++      do { \
++              bitmap[cmd] = VSCSIIF_NEED_CMD_EXEC; \
++              pre_function[cmd] = NULL; \
++              post_function[cmd] = NULL; \
++      } while (0)
++
++
++
++/*
++  Emulation routines for each SCSI op_code.
++  pre_function[] is called from __pre_do_emulation() when
++  VSCSIIF_NEED_EMULATE_REQBUF is set for the op_code; post_function[] is
++  called from scsiback_rsp_emulation() when VSCSIIF_NEED_EMULATE_RSPBUF
++  is set.  A NULL entry means no routine.
++*/
++static void (*pre_function[VSCSI_MAX_SCSI_OP_CODE])(pending_req_t *, void *);
++static void (*post_function[VSCSI_MAX_SCSI_OP_CODE])(pending_req_t *, void *);
++
++
++/*
++ * Result value stored in pending_req->rslt when an emulated command
++ * fails: DRIVER_SENSE shifted into bits 24 and up, combined with
++ * SAM_STAT_CHECK_CONDITION.
++ */
++static const int check_condition_result =
++              (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION;
++
++/*
++ * Fill in a fixed-format sense buffer.
++ *
++ * @data: sense buffer; bytes 0, 2, 7, 12 and 13 are written (so at least
++ *        14 bytes are required), all other bytes are left untouched
++ * @key:  sense key, stored in byte 2
++ * @asc:  additional sense code, stored in byte 12
++ * @asq:  additional sense code qualifier, stored in byte 13
++ */
++static void scsiback_mk_sense_buffer(uint8_t *data, uint8_t key,
++                      uint8_t asc, uint8_t asq)
++{
++      data[0] = 0x70;  /* fixed, current */
++      data[2] = key;
++      data[7] = 0xa;    /* implies 18 byte sense buffer */
++      data[12] = asc;
++      data[13] = asq;
++}
++
++/*
++ * Pre-emulation hook for op_codes that are not supported: completes the
++ * request with check_condition_result, sense key ILLEGAL_REQUEST and
++ * additional sense code INVALID_OPCODE, and a residual of 0.
++ * @data is unused.
++ */
++static void resp_not_supported_cmd(pending_req_t *pending_req, void *data)
++{
++      scsiback_mk_sense_buffer(pending_req->sense_buffer, ILLEGAL_REQUEST,
++              INVALID_OPCODE, 0);
++      pending_req->resid = 0;
++      pending_req->rslt  = check_condition_result;
++}
++
++
++/*
++ * Copy @buflen bytes from the linear buffer @buf into the first @nr_sg
++ * entries of the scatterlist @sgl.
++ *
++ * Each entry's page is addressed through pfn_to_kaddr(); data is written
++ * at the entry's offset, at most sg->length bytes per entry.
++ *
++ * Returns 0 once everything has been copied.  Returns -ENOMEM (after a
++ * warning) when an entry has no page, or when the scatterlist is used up
++ * before @buflen bytes were stored; whatever was copied up to that point
++ * stays in the scatterlist.
++ */
++static int __copy_to_sg(struct scatterlist *sgl, unsigned int nr_sg,
++             void *buf, unsigned int buflen)
++{
++      struct scatterlist *seg;
++      void *src = buf;
++      unsigned int remaining = buflen;
++      unsigned int idx;
++
++      for_each_sg (sgl, seg, nr_sg, idx) {
++              unsigned int chunk;
++              unsigned long frame;
++              void *dst;
++
++              if (sg_page(seg) == NULL) {
++                      pr_warning("%s: inconsistent length field in "
++                                 "scatterlist\n", __FUNCTION__);
++                      return -ENOMEM;
++              }
++
++              /* never write more than this entry can hold */
++              chunk = min_t(unsigned int, seg->length, remaining);
++
++              frame = page_to_pfn(sg_page(seg));
++              dst = pfn_to_kaddr(frame) + (seg->offset);
++              memcpy(dst, src, chunk);
++
++              remaining -= chunk;
++              if (remaining == 0)
++                      return 0;
++
++              src += chunk;
++      }
++
++      pr_warning("%s: no space in scatterlist\n", __FUNCTION__);
++      return -ENOMEM;
++}
++
++/*
++ * Gather the contents of the first @nr_sg entries of the scatterlist @sgl
++ * into the linear buffer @buf, which can hold @buflen bytes.
++ *
++ * Entries are copied whole (sg->length bytes each, read at the entry's
++ * offset via pfn_to_kaddr()).
++ *
++ * Returns 0 when every entry has been copied.  Returns -ENOMEM (after a
++ * warning) when an entry has no page, or when an entry does not fit into
++ * the room left in @buf; entries copied before that stay in @buf.
++ *
++ * NOTE(review): no caller of this helper is visible in emulate.c.
++ */
++static int __copy_from_sg(struct scatterlist *sgl, unsigned int nr_sg,
++               void *buf, unsigned int buflen)
++{
++      struct scatterlist *seg;
++      void *dst = buf;
++      unsigned int room = buflen;
++      unsigned int idx;
++
++      for_each_sg (sgl, seg, nr_sg, idx) {
++              unsigned int len;
++              unsigned long frame;
++              void *src;
++
++              if (sg_page(seg) == NULL) {
++                      pr_warning("%s: inconsistent length field in "
++                                 "scatterlist\n", __FUNCTION__);
++                      return -ENOMEM;
++              }
++
++              len = seg->length;
++              /* an empty entry (len == 0) can never exceed the room left */
++              if (room < len) {
++                      pr_warning("%s: no space in destination buffer\n",
++                                 __FUNCTION__);
++                      return -ENOMEM;
++              }
++
++              frame = page_to_pfn(sg_page(seg));
++              src = pfn_to_kaddr(frame) + (seg->offset);
++              memcpy(dst, src, len);
++
++              room -= len;
++              dst += len;
++      }
++
++      return 0;
++}
++
++/*
++ * Count the entries on info->v2p_entry_lists.
++ *
++ * Every entry is counted, whatever its channel or target.  The list is
++ * walked with v2p_lock held and interrupts disabled.
++ *
++ * Returns the number of entries.
++ */
++static int __nr_luns_under_host(struct vscsibk_info *info)
++{
++      struct v2p_entry *pos;
++      unsigned long irq_flags;
++      int total = 0;
++
++      spin_lock_irqsave(&info->v2p_lock, irq_flags);
++      list_for_each_entry(pos, &(info->v2p_entry_lists), l)
++              total++;
++      spin_unlock_irqrestore(&info->v2p_lock, irq_flags);
++
++      return total;
++}
++
++
++/* REPORT LUNS Define*/
++#define VSCSI_REPORT_LUNS_HEADER      8
++#define VSCSI_REPORT_LUNS_RETRY               3
++
++/* quoted scsi_debug.c/resp_report_luns() */
++/*
++ * Emulate REPORT LUNS from the translation table instead of passing the
++ * command on.
++ *
++ * The response consists of VSCSI_REPORT_LUNS_HEADER bytes of header
++ * followed by one struct scsi_lun for every table entry whose channel and
++ * target match the request.  As much of it as fits is copied into the
++ * request's scatterlist.
++ *
++ * On success rslt is 0, the sense buffer is cleared and resid is the part
++ * of the scatterlist that was not filled.  On any failure (allocation
++ * length below 4, non-zero SELECT REPORT, allocation failure, table still
++ * growing after VSCSI_REPORT_LUNS_RETRY retries, copy error) the request
++ * completes with check_condition_result and ILLEGAL_REQUEST /
++ * INVALID_FIELD_IN_CDB sense data.
++ *
++ * @data is unused.
++ */
++static void __report_luns(pending_req_t *pending_req, void *data)
++{
++      struct vscsibk_info *info   = pending_req->info;
++      unsigned int        channel = pending_req->v_chn;
++      unsigned int        target  = pending_req->v_tgt;
++      unsigned int        nr_seg  = pending_req->nr_segments;
++      unsigned char *cmd = (unsigned char *)pending_req->cmnd;
++
++      unsigned char *buff = NULL;
++      /*
++       * Must be able to hold header + all LUN entries: a char-sized length
++       * wraps above 255 bytes and yields a buffer smaller than what the
++       * fill loop below writes.
++       */
++      unsigned int alloc_len;
++      unsigned int alloc_luns = 0;
++      unsigned int req_bufflen = 0;
++      unsigned int actual_len = 0;
++      unsigned int retry_cnt = 0;
++      int select_report = (int)cmd[2];
++      int i, lun_cnt = 0, lun, upper, err = 0;
++
++      struct v2p_entry *entry;
++      struct list_head *head = &(info->v2p_entry_lists);
++      unsigned long flags;
++
++      struct scsi_lun *one_lun;
++
++      /* allocation length, big-endian in CDB bytes 6..9 */
++      req_bufflen = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) +
++                      ((unsigned int)cmd[6] << 24);
++      if ((req_bufflen < 4) || (select_report != 0))
++              goto fail;
++
++retry:
++      /*
++       * Size the buffer from the current table and start filling from
++       * scratch.  Re-counting on every attempt is what allows a retry to
++       * succeed after the table has grown between count and walk.
++       */
++      alloc_luns = __nr_luns_under_host(info);
++      alloc_len  = sizeof(struct scsi_lun) * alloc_luns
++                              + VSCSI_REPORT_LUNS_HEADER;
++      lun_cnt = 0;
++
++      if ((buff = kzalloc(alloc_len, GFP_KERNEL)) == NULL) {
++              pr_err("scsiback:%s kmalloc err\n", __FUNCTION__);
++              goto fail;
++      }
++
++      one_lun = (struct scsi_lun *) &buff[VSCSI_REPORT_LUNS_HEADER];
++      spin_lock_irqsave(&info->v2p_lock, flags);
++      list_for_each_entry(entry, head, l) {
++              if ((entry->v.chn == channel) &&
++                  (entry->v.tgt == target)) {
++
++                      /* check overflow: more entries than were counted */
++                      if (lun_cnt >= alloc_luns) {
++                              spin_unlock_irqrestore(&info->v2p_lock,
++                                                      flags);
++
++                              if (retry_cnt < VSCSI_REPORT_LUNS_RETRY) {
++                                      retry_cnt++;
++                                      kfree(buff);
++                                      buff = NULL;
++                                      goto retry;
++                              }
++
++                              goto fail;
++                      }
++
++                      lun = entry->v.lun;
++                      upper = (lun >> 8) & 0x3f;
++                      if (upper)
++                              one_lun[lun_cnt].scsi_lun[0] = upper;
++                      one_lun[lun_cnt].scsi_lun[1] = lun & 0xff;
++                      lun_cnt++;
++              }
++      }
++
++      spin_unlock_irqrestore(&info->v2p_lock, flags);
++
++      /* LUN list length in bytes (low 16 bits; buffer is pre-zeroed) */
++      buff[2] = ((sizeof(struct scsi_lun) * lun_cnt) >> 8) & 0xff;
++      buff[3] = (sizeof(struct scsi_lun) * lun_cnt) & 0xff;
++
++      actual_len = lun_cnt * sizeof(struct scsi_lun)
++                              + VSCSI_REPORT_LUNS_HEADER;
++      /* from here on req_bufflen is the capacity of the scatterlist */
++      req_bufflen = 0;
++      for (i = 0; i < nr_seg; i++)
++              req_bufflen += pending_req->sgl[i].length;
++
++      err = __copy_to_sg(pending_req->sgl, nr_seg, buff,
++                              min(req_bufflen, actual_len));
++      if (err)
++              goto fail;
++
++      memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE);
++      pending_req->rslt = 0x00;
++      pending_req->resid = req_bufflen - min(req_bufflen, actual_len);
++
++      kfree(buff);
++      return;
++
++fail:
++      scsiback_mk_sense_buffer(pending_req->sense_buffer, ILLEGAL_REQUEST,
++              INVALID_FIELD_IN_CDB, 0);
++      pending_req->rslt  = check_condition_result;
++      pending_req->resid = 0;
++      kfree(buff);    /* kfree(NULL) is a no-op */
++}
++
++
++
++/*
++ * Run the pre-emulation routine for the request's op_code, if one applies.
++ *
++ * The routine registered in pre_function[] is called with @pending_req and
++ * @data when the op_code has VSCSIIF_NEED_EMULATE_REQBUF set and the
++ * routine is not NULL.
++ *
++ * Returns
++ *   0: no need for native driver call, so should return immediately.
++ *   1: non emulation or should call native driver
++ *      after modifying the request buffer.
++ */
++int __pre_do_emulation(pending_req_t *pending_req, void *data)
++{
++      const uint8_t opcode = pending_req->cmnd[0];
++      void (*hook)(pending_req_t *, void *) = pre_function[opcode];
++
++      if (hook != NULL && (bitmap[opcode] & VSCSIIF_NEED_EMULATE_REQBUF))
++              hook(pending_req, data);
++
++      /* the flags are read again after the routine has run */
++      return (bitmap[opcode] & VSCSIIF_NEED_CMD_EXEC) ? 1 : 0;
++}
++
++/*
++ * Run the post-emulation routine for the request's op_code.
++ *
++ * The routine registered in post_function[] is called (with NULL as its
++ * data argument) when the op_code has VSCSIIF_NEED_EMULATE_RSPBUF set and
++ * the routine is not NULL; otherwise nothing happens.
++ */
++void scsiback_rsp_emulation(pending_req_t *pending_req)
++{
++      const uint8_t opcode = pending_req->cmnd[0];
++      void (*hook)(pending_req_t *, void *) = post_function[opcode];
++
++      if (hook == NULL || !(bitmap[opcode] & VSCSIIF_NEED_EMULATE_RSPBUF))
++              return;
++
++      hook(pending_req, NULL);
++}
++
++
++/*
++ * Entry point for a request: run pre-emulation, then either execute the
++ * command or answer it directly.
++ *
++ * When __pre_do_emulation() asks for command execution the request goes to
++ * scsiback_cmd_exec().  Otherwise the request is flushed with
++ * scsiback_fast_flush_area() and answered right away with the rslt, resid
++ * and sense data stored in @pending_req.
++ */
++void scsiback_req_emulation_or_cmdexec(pending_req_t *pending_req)
++{
++      if (__pre_do_emulation(pending_req, NULL)) {
++              scsiback_cmd_exec(pending_req);
++              return;
++      }
++
++      /* fully emulated: respond without executing the command */
++      scsiback_fast_flush_area(pending_req);
++      scsiback_do_resp_with_sense(pending_req->sense_buffer,
++        pending_req->rslt, pending_req->resid, pending_req);
++}
++
++
++/*
++  Following are not customizable functions.
++*/
++/*
++ * Build the per-op_code emulation tables (bitmap[], pre_function[],
++ * post_function[]).
++ *
++ * Every op_code starts out rejected by resp_not_supported_cmd().  The
++ * op_codes listed with NO_EMULATE() are then switched to plain command
++ * execution, and REPORT_LUNS is answered by __report_luns().
++ */
++void scsiback_emulation_init(void)
++{
++      int i;
++
++      /* Initialize to default state */
++      for (i = 0; i < VSCSI_MAX_SCSI_OP_CODE; i++) {
++              bitmap[i]        = (VSCSIIF_NEED_EMULATE_REQBUF | 
++                                      VSCSIIF_NEED_EMULATE_RSPBUF);
++              pre_function[i]  = resp_not_supported_cmd;
++              post_function[i] = NULL;
++              /* means,
++                 - pre-emulation rejects the command
++                   (ILLEGAL_REQUEST / INVALID_OPCODE)
++                 - no post-emulation routine
++                 - native driver is not called
++              */
++      }
++
++      /*
++        Register appropriate functions below as you need.
++        (See scsi/scsi.h for definition of SCSI op_code.)
++      */
++
++      /*
++        Following commands do not require emulation.
++      */
++      NO_EMULATE(TEST_UNIT_READY);       /*0x00*/ /* sd,st */
++      NO_EMULATE(REZERO_UNIT);           /*0x01*/ /* st */
++      NO_EMULATE(REQUEST_SENSE);         /*0x03*/
++      NO_EMULATE(FORMAT_UNIT);           /*0x04*/
++      NO_EMULATE(READ_BLOCK_LIMITS);     /*0x05*/ /* st */
++      /*NO_EMULATE(REASSIGN_BLOCKS);       *//*0x07*/
++      NO_EMULATE(INITIALIZE_ELEMENT_STATUS); /*0x07*/ /* ch */
++      NO_EMULATE(READ_6);                /*0x08*/ /* sd,st */
++      NO_EMULATE(WRITE_6);               /*0x0a*/ /* sd,st */
++      NO_EMULATE(SEEK_6);                /*0x0b*/
++      /*NO_EMULATE(READ_REVERSE);          *//*0x0f*/
++      NO_EMULATE(WRITE_FILEMARKS);       /*0x10*/ /* st */
++      NO_EMULATE(SPACE);                 /*0x11*/ /* st */
++      NO_EMULATE(INQUIRY);               /*0x12*/
++      /*NO_EMULATE(RECOVER_BUFFERED_DATA); *//*0x14*/
++      NO_EMULATE(MODE_SELECT);           /*0x15*/ /* st */
++      /*NO_EMULATE(RESERVE);               *//*0x16*/
++      /*NO_EMULATE(RELEASE);               *//*0x17*/
++      /*NO_EMULATE(COPY);                  *//*0x18*/
++      NO_EMULATE(ERASE);                 /*0x19*/ /* st */
++      NO_EMULATE(MODE_SENSE);            /*0x1a*/ /* st */
++      NO_EMULATE(START_STOP);            /*0x1b*/ /* sd,st */
++      NO_EMULATE(RECEIVE_DIAGNOSTIC);    /*0x1c*/
++      NO_EMULATE(SEND_DIAGNOSTIC);       /*0x1d*/
++      NO_EMULATE(ALLOW_MEDIUM_REMOVAL);  /*0x1e*/
++
++      /*NO_EMULATE(SET_WINDOW);            *//*0x24*/
++      NO_EMULATE(READ_CAPACITY);         /*0x25*/ /* sd */
++      NO_EMULATE(READ_10);               /*0x28*/ /* sd */
++      NO_EMULATE(WRITE_10);              /*0x2a*/ /* sd */
++      NO_EMULATE(SEEK_10);               /*0x2b*/ /* st */
++      NO_EMULATE(POSITION_TO_ELEMENT);   /*0x2b*/ /* ch */
++      /*NO_EMULATE(WRITE_VERIFY);          *//*0x2e*/
++      /*NO_EMULATE(VERIFY);                *//*0x2f*/
++      /*NO_EMULATE(SEARCH_HIGH);           *//*0x30*/
++      /*NO_EMULATE(SEARCH_EQUAL);          *//*0x31*/
++      /*NO_EMULATE(SEARCH_LOW);            *//*0x32*/
++      NO_EMULATE(SET_LIMITS);            /*0x33*/
++      NO_EMULATE(PRE_FETCH);             /*0x34*/ /* st! */
++      NO_EMULATE(READ_POSITION);          /*0x34*/ /* st */
++      NO_EMULATE(SYNCHRONIZE_CACHE);      /*0x35*/ /* sd */
++      NO_EMULATE(LOCK_UNLOCK_CACHE);     /*0x36*/
++      NO_EMULATE(READ_DEFECT_DATA);      /*0x37*/
++      NO_EMULATE(MEDIUM_SCAN);           /*0x38*/
++      /*NO_EMULATE(COMPARE);               *//*0x39*/
++      /*NO_EMULATE(COPY_VERIFY);           *//*0x3a*/
++      NO_EMULATE(WRITE_BUFFER);          /*0x3b*/
++      NO_EMULATE(READ_BUFFER);           /*0x3c*/ /* osst */
++      /*NO_EMULATE(UPDATE_BLOCK);          *//*0x3d*/
++      /*NO_EMULATE(READ_LONG);             *//*0x3e*/
++      /*NO_EMULATE(WRITE_LONG);            *//*0x3f*/
++      /*NO_EMULATE(CHANGE_DEFINITION);     *//*0x40*/
++      /*NO_EMULATE(WRITE_SAME);            *//*0x41*/
++      NO_EMULATE(READ_TOC);              /*0x43*/ /* sr */
++      NO_EMULATE(LOG_SELECT);            /*0x4c*/
++      NO_EMULATE(LOG_SENSE);             /*0x4d*/ /* st! */
++      /*NO_EMULATE(MODE_SELECT_10);        *//*0x55*/
++      /*NO_EMULATE(RESERVE_10);            *//*0x56*/
++      /*NO_EMULATE(RELEASE_10);            *//*0x57*/
++      NO_EMULATE(MODE_SENSE_10);         /*0x5a*/ /* scsi_lib */
++      /*NO_EMULATE(PERSISTENT_RESERVE_IN); *//*0x5e*/
++      /*NO_EMULATE(PERSISTENT_RESERVE_OUT); *//*0x5f*/
++      /*           REPORT_LUNS             *//*0xa0*//*Full emulation*/
++#ifdef MAINTENANCE_IN
++      NO_EMULATE(MAINTENANCE_IN);           /*0xa3*/ /* IFT alua */
++      NO_EMULATE(MAINTENANCE_OUT);       /*0xa4*/ /* IFT alua */
++#endif
++      NO_EMULATE(MOVE_MEDIUM);           /*0xa5*/ /* ch */
++      NO_EMULATE(EXCHANGE_MEDIUM);       /*0xa6*/ /* ch */
++      /*NO_EMULATE(READ_12);               *//*0xa8*/
++      /*NO_EMULATE(WRITE_12);              *//*0xaa*/
++      /*NO_EMULATE(WRITE_VERIFY_12);       *//*0xae*/
++      /*NO_EMULATE(SEARCH_HIGH_12);        *//*0xb0*/
++      /*NO_EMULATE(SEARCH_EQUAL_12);       *//*0xb1*/
++      /*NO_EMULATE(SEARCH_LOW_12);         *//*0xb2*/
++      NO_EMULATE(READ_ELEMENT_STATUS);   /*0xb8*/ /* ch */
++      NO_EMULATE(SEND_VOLUME_TAG);       /*0xb6*/ /* ch */
++      /*NO_EMULATE(WRITE_LONG_2);          *//*0xea*/
++      NO_EMULATE(READ_16);               /*0x88*/ /* sd >2TB */
++      NO_EMULATE(WRITE_16);              /*0x8a*/ /* sd >2TB */
++      NO_EMULATE(VERIFY_16);             /*0x8f*/
++      NO_EMULATE(SERVICE_ACTION_IN);     /*0x9e*/ /* sd >2TB */
++
++/* st: QFA_REQUEST_BLOCK, QFA_SEEK_BLOCK might be needed ? */
++      /*
++        Following commands require emulation.
++      */
++      pre_function[REPORT_LUNS] = __report_luns;
++      bitmap[REPORT_LUNS] = (VSCSIIF_NEED_EMULATE_REQBUF | 
++                                      VSCSIIF_NEED_EMULATE_RSPBUF);
++
++      return;
++}
diff --cc drivers/xen/scsiback/interface.c

index 0000000,0000000..9098a3c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsiback/interface.c
@@@ -1,0 -1,0 +1,141 @@@
++/*
++ * interface management.
++ *
++ * Copyright (c) 2008, FUJITSU Limited
++ *
++ * Based on the blkback driver code.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <scsi/scsi.h>
++#include <scsi/scsi_host.h>
++#include <scsi/scsi_device.h>
++#include "common.h"
++
++#include <xen/evtchn.h>
++#include <linux/kthread.h>
++#include <linux/delay.h>
++#include <linux/vmalloc.h>
++
++
++static struct kmem_cache *scsiback_cachep;
++
++/*
++ * Allocate a zeroed vscsibk_info from the scsiback slab cache and set up
++ * its ring lock, unreplied-request counter and wait queues for @domid.
++ *
++ * Returns the new object, or ERR_PTR(-ENOMEM) if the allocation fails.
++ */
++struct vscsibk_info *vscsibk_info_alloc(domid_t domid)
++{
++      struct vscsibk_info *vinfo =
++              kmem_cache_zalloc(scsiback_cachep, GFP_KERNEL);
++
++      if (vinfo == NULL)
++              return ERR_PTR(-ENOMEM);
++
++      init_waitqueue_head(&vinfo->waiting_to_free);
++      init_waitqueue_head(&vinfo->wq);
++      atomic_set(&vinfo->nr_unreplied_reqs, 0);
++      spin_lock_init(&vinfo->ring_lock);
++      vinfo->domid = domid;
++
++      return vinfo;
++}
++
++/*
++ * Map the frontend's shared ring and bind its event channel to
++ * scsiback_intr().
++ *
++ * @info:     backend instance; must not already have an IRQ bound
++ * @ring_ref: grant reference of the shared ring
++ * @evtchn:   remote event channel port to bind
++ *
++ * Returns 0 on success, -1 if an IRQ is already bound, or the negative
++ * error reported by the ring mapping or the event channel binding.
++ */
++int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref,
++                      evtchn_port_t evtchn)
++{
++      struct vm_struct *area;
++      struct vscsiif_sring *sring;
++      int err;
++
++      if (info->irq) {
++              pr_err("scsiback: Already connected through?\n");
++              return -1;
++      }
++
++      area = xenbus_map_ring_valloc(info->dev, ring_ref);
++      if (IS_ERR(area))
++              return PTR_ERR(area);
++      info->ring_area = area;
++
++      sring = (struct vscsiif_sring *)area->addr;
++      BACK_RING_INIT(&info->ring, sring, PAGE_SIZE);
++
++      err = bind_interdomain_evtchn_to_irqhandler(
++                      info->domid, evtchn,
++                      scsiback_intr, 0, "vscsiif-backend", info);
++      if (err < 0)
++              goto unmap_page;
++
++      /* On success the return value is the bound IRQ number. */
++      info->irq = err;
++
++      return 0;
++
++unmap_page:
++      xenbus_unmap_ring_vfree(info->dev, area);
++      /*
++       * Forget the ring again: scsiback_disconnect() unmaps whenever
++       * info->ring.sring is non-NULL, so leaving it set here would make
++       * a later disconnect unmap the same area a second time.
++       */
++      info->ring.sring = NULL;
++      info->ring_area = NULL;
++
++      return err;
++}
++
++/*
++ * Tear down the connection state of @info.
++ *
++ * The steps run in a fixed order: stop the worker thread (if any), wait
++ * until nr_unreplied_reqs has dropped to zero, release the IRQ (if bound)
++ * and finally unmap the shared ring (if mapped). Each resource is cleared
++ * after release, so the function may be called when only part of the
++ * state was ever set up.
++ */
++void scsiback_disconnect(struct vscsibk_info *info)
++{
++      if (info->kthread) {
++              kthread_stop(info->kthread);
++              info->kthread = NULL;
++      }
++
++      /* Sleep until every outstanding request has been accounted for. */
++      wait_event(info->waiting_to_free, 
++              atomic_read(&info->nr_unreplied_reqs) == 0);
++
++      if (info->irq) {
++              unbind_from_irqhandler(info->irq, info);
++              info->irq = 0;
++      }
++
++      /* A non-NULL sring is what marks the ring as currently mapped. */
++      if (info->ring.sring) {
++              xenbus_unmap_ring_vfree(info->dev, info->ring_area);
++              info->ring.sring = NULL;
++      }
++}
++
++/* Return @info to the slab cache it was allocated from. */
++void scsiback_free(struct vscsibk_info *info)
++{
++      kmem_cache_free(scsiback_cachep, info);
++}
++
++/*
++ * Create the slab cache that backs struct vscsibk_info allocations.
++ *
++ * Returns 0 on success or -ENOMEM if the cache cannot be created.
++ */
++int __init scsiback_interface_init(void)
++{
++      scsiback_cachep = kmem_cache_create("vscsiif_cache",
++              sizeof(struct vscsibk_info), 0, 0, NULL);
++      if (scsiback_cachep)
++              return 0;
++
++      pr_err("scsiback: can't init scsi cache\n");
++      return -ENOMEM;
++}
++
++/* Destroy the slab cache created by scsiback_interface_init(). */
++void scsiback_interface_exit(void)
++{
++      kmem_cache_destroy(scsiback_cachep);
++}
diff --cc drivers/xen/scsiback/scsiback.c

index 0000000,0000000..13ec29c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsiback/scsiback.c
@@@ -1,0 -1,0 +1,731 @@@
++/*
++ * Xen SCSI backend driver
++ *
++ * Copyright (c) 2008, FUJITSU Limited
++ *
++ * Based on the blkback driver code.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/kthread.h>
++#include <linux/list.h>
++#include <linux/delay.h>
++#include <xen/balloon.h>
++#include <xen/evtchn.h>
++#include <xen/gnttab.h>
++#include <asm/hypervisor.h>
++#include <scsi/scsi.h>
++#include <scsi/scsi_cmnd.h>
++#include <scsi/scsi_host.h>
++#include <scsi/scsi_device.h>
++#include <scsi/scsi_dbg.h>
++#include <scsi/scsi_eh.h>
++
++#include "common.h"
++
++
++/*
++ * Pool of unused pending_req_t entries. The list head is initialised in
++ * scsiback_init(); alloc_req()/free_req() access it under
++ * pending_free_lock, and free_req() wakes pending_free_wq when the pool
++ * goes from empty to non-empty.
++ */
++struct list_head pending_free;
++DEFINE_SPINLOCK(pending_free_lock);
++DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
++
++/* Number of pending_req_t entries allocated at init time. */
++int vscsiif_reqs = VSCSIIF_BACK_MAX_PENDING_REQS;
++module_param_named(reqs, vscsiif_reqs, int, 0);
++MODULE_PARM_DESC(reqs, "Number of scsiback requests to allocate");
++
++/*
++ * When non-zero, scsiback_cmd_done() logs the status of failed requests.
++ * The variable is unsigned, so it is registered with the matching "uint"
++ * parameter type.
++ */
++static unsigned int log_print_stat = 0;
++module_param(log_print_stat, uint, 0644);
++MODULE_PARM_DESC(log_print_stat, "Log the status of failed requests");
++
++/* Marks a pending_grant_handles[] slot that holds no mapping. */
++#define SCSIBACK_INVALID_HANDLE (~0)
++
++/*
++ * Per-request bookkeeping, all allocated in scsiback_init(). The page and
++ * handle arrays hold VSCSIIF_SG_TABLESIZE slots per pending request.
++ */
++static pending_req_t *pending_reqs;
++static struct page **pending_pages;
++static grant_handle_t *pending_grant_handles;
++
++/*
++ * Index into pending_pages[] / pending_grant_handles[] for segment @seg
++ * of @req: each pending request owns VSCSIIF_SG_TABLESIZE consecutive
++ * slots, ordered by the request's position in pending_reqs[].
++ */
++static int vaddr_pagenr(pending_req_t *req, int seg)
++{
++      const ptrdiff_t req_index = req - pending_reqs;
++
++      return req_index * VSCSIIF_SG_TABLESIZE + seg;
++}
++
++/* Kernel virtual address of the page reserved for segment @seg of @req. */
++static unsigned long vaddr(pending_req_t *req, int seg)
++{
++      struct page *seg_page = pending_pages[vaddr_pagenr(req, seg)];
++
++      return (unsigned long)pfn_to_kaddr(page_to_pfn(seg_page));
++}
++
++/*
++ * Lvalue for the grant handle slot of segment _seg of request _req; it is
++ * both read and assigned through this macro.
++ */
++#define pending_handle(_req, _seg) \
++      (pending_grant_handles[vaddr_pagenr(_req, _seg)])
++
++
++/*
++ * Unmap every grant still mapped for @req and free its scatterlist.
++ *
++ * Segments whose handle is SCSIBACK_INVALID_HANDLE are skipped. The unmap
++ * operations are therefore packed densely into unmap[] using invcount as
++ * the write index, because the hypercall processes exactly the first
++ * invcount entries of the array.
++ *
++ * Does nothing when the request has no segments.
++ */
++void scsiback_fast_flush_area(pending_req_t *req)
++{
++      struct gnttab_unmap_grant_ref unmap[VSCSIIF_SG_TABLESIZE];
++      unsigned int i, invcount = 0;
++      grant_handle_t handle;
++      int err;
++
++      if (!req->nr_segments)
++              return;
++
++      for (i = 0; i < req->nr_segments; i++) {
++              handle = pending_handle(req, i);
++              if (handle == SCSIBACK_INVALID_HANDLE)
++                      continue;
++              gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
++                                      GNTMAP_host_map, handle);
++              /* Mark the slot unused so it is never unmapped twice. */
++              pending_handle(req, i) = SCSIBACK_INVALID_HANDLE;
++              invcount++;
++      }
++
++      err = HYPERVISOR_grant_table_op(
++              GNTTABOP_unmap_grant_ref, unmap, invcount);
++      BUG_ON(err);
++
++      /* Allocated in scsiback_gnttab_data_map(). */
++      kfree(req->sgl);
++}
++
++
++/*
++ * Take one entry off the pending_free pool.
++ *
++ * Returns the entry, or NULL when the pool is empty. @info is not used.
++ */
++static pending_req_t * alloc_req(struct vscsibk_info *info)
++{
++      unsigned long irqflags;
++      pending_req_t *entry;
++
++      spin_lock_irqsave(&pending_free_lock, irqflags);
++      if (list_empty(&pending_free)) {
++              entry = NULL;
++      } else {
++              entry = list_entry(pending_free.next, pending_req_t, free_list);
++              list_del(&entry->free_list);
++      }
++      spin_unlock_irqrestore(&pending_free_lock, irqflags);
++
++      return entry;
++}
++
++
++/*
++ * Put @req back on the pending_free pool. If the pool was empty before,
++ * wake the waiters on pending_free_wq.
++ */
++static void free_req(pending_req_t *req)
++{
++      unsigned long irqflags;
++      int pool_was_empty;
++
++      spin_lock_irqsave(&pending_free_lock, irqflags);
++      pool_was_empty = list_empty(&pending_free);
++      list_add(&req->free_list, &pending_free);
++      spin_unlock_irqrestore(&pending_free_lock, irqflags);
++
++      if (!pool_was_empty)
++              return;
++
++      wake_up(&pending_free_wq);
++}
++
++
++/*
++ * Flag that @info has requests to process and wake anything sleeping on
++ * info->wq. The flag is set before the wake-up so that the woken waiter
++ * sees it.
++ */
++static void scsiback_notify_work(struct vscsibk_info *info)
++{
++      info->waiting_reqs = 1;
++      wake_up(&info->wq);
++}
++
++/*
++ * Queue a response for @pending_req on the ring and release the request.
++ *
++ * @sense_buffer: sense data to return, or NULL for none. When non-NULL it
++ *                must be at least VSCSIIF_SENSE_BUFFERSIZE bytes long
++ *                (scsiback_cmd_exec() clears that many bytes of it).
++ * @result:       value stored in the response's rslt field
++ * @resid:        value stored in the response's residual_len field
++ * @pending_req:  request being answered; it is returned to the free pool
++ *                before this function returns and must not be touched by
++ *                the caller afterwards.
++ */
++void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result,
++                      uint32_t resid, pending_req_t *pending_req)
++{
++      vscsiif_response_t *ring_res;
++      struct vscsibk_info *info = pending_req->info;
++      int notify;
++      /* NOTE(review): starts at 1, so the worker is re-notified unless the
++       * final check below clears it -- confirm this is intended. */
++      int more_to_do = 1;
++      struct scsi_sense_hdr sshdr;
++      unsigned long flags;
++
++      DPRINTK("%s\n",__FUNCTION__);
++
++      spin_lock_irqsave(&info->ring_lock, flags);
++
++      ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt);
++      info->ring.rsp_prod_pvt++;
++
++      ring_res->rslt   = result;
++      ring_res->rqid   = pending_req->rqid;
++
++      /*
++       * Pass the real buffer size to scsi_normalize_sense(); sizeof() on
++       * the pointer parameter would only yield the size of a pointer.
++       */
++      if (sense_buffer != NULL &&
++          scsi_normalize_sense(sense_buffer, VSCSIIF_SENSE_BUFFERSIZE,
++                               &sshdr)) {
++              /*
++               * Byte 7 is read as unsigned: with a signed char a value
++               * above 0x7f would make len negative, slip past the clamp
++               * and turn into a huge memcpy() length.
++               */
++              int len = 8 + (unsigned char)sense_buffer[7];
++
++              if (len > VSCSIIF_SENSE_BUFFERSIZE)
++                      len = VSCSIIF_SENSE_BUFFERSIZE;
++
++              memcpy(ring_res->sense_buffer, sense_buffer, len);
++              ring_res->sense_len = len;
++      } else {
++              /* No sense data, or data that failed to normalize. */
++              ring_res->sense_len = 0;
++      }
++
++      ring_res->residual_len = resid;
++
++      RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info->ring, notify);
++      if (info->ring.rsp_prod_pvt == info->ring.req_cons) {
++              RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do);
++      } else if (RING_HAS_UNCONSUMED_REQUESTS(&info->ring)) {
++              more_to_do = 1;
++      }
++
++      spin_unlock_irqrestore(&info->ring_lock, flags);
++
++      if (more_to_do)
++              scsiback_notify_work(info);
++
++      if (notify)
++              notify_remote_via_irq(info->irq);
++
++      free_req(pending_req);
++}
++
++/*
++ * Log the device address, the decoded result bytes and the first CDB byte
++ * of a failed request, plus the sense data when the status byte has the
++ * CHECK_CONDITION bit set.
++ *
++ * @sense_buffer: sense data of the request
++ * @errors:       result word of the request
++ * @pending_req:  request that failed
++ */
++static void scsiback_print_status(char *sense_buffer, int errors,
++                                      pending_req_t *pending_req)
++{
++      struct scsi_device *sdev = pending_req->sdev;
++
++      /*
++       * Emit the address and the status as one message: two separate
++       * pr_err() calls would each carry their own log-level prefix, and
++       * the first one had no terminating newline.
++       */
++      pr_err("scsiback: %d:%d:%d:%d status = 0x%02x, message = 0x%02x,"
++             " host = 0x%02x, driver = 0x%02x\n",
++             sdev->host->host_no, sdev->channel, sdev->id, sdev->lun,
++             status_byte(errors), msg_byte(errors),
++             host_byte(errors), driver_byte(errors));
++
++      pr_err("scsiback: cmnd[0]=0x%02X\n", pending_req->cmnd[0]);
++
++      if (CHECK_CONDITION & status_byte(errors))
++              __scsi_print_sense("scsiback", sense_buffer, SCSI_SENSE_BUFFERSIZE);
++}
++
++
++/*
++ * Completion callback passed to blk_execute_rq_nowait() by
++ * scsiback_cmd_exec().
++ *
++ * Optionally logs a failure, runs response emulation for non-host-mode
++ * interfaces, unmaps the data pages, sends the response and drops the
++ * interface reference taken in scsiback_cmd_exec().
++ *
++ * @req:      completed block request; end_io_data holds the pending_req_t
++ * @uptodate: unused
++ */
++static void scsiback_cmd_done(struct request *req, int uptodate)
++{
++      pending_req_t *pending_req = req->end_io_data;
++      /*
++       * Remember the interface now: scsiback_do_resp_with_sense() puts
++       * pending_req back on the free pool, after which its fields may be
++       * overwritten by a new request at any time.
++       */
++      struct vscsibk_info *info = pending_req->info;
++      unsigned char *sense_buffer;
++      unsigned int resid;
++      int errors;
++
++      sense_buffer = req->sense;
++      resid        = blk_rq_bytes(req);
++      errors       = req->errors;
++
++      if (errors != 0 && log_print_stat)
++              scsiback_print_status(sense_buffer, errors, pending_req);
++
++      /* Host mode passes responses through without emulation. */
++      if (info->feature != VSCSI_TYPE_HOST)
++              scsiback_rsp_emulation(pending_req);
++
++      scsiback_fast_flush_area(pending_req);
++      scsiback_do_resp_with_sense(sense_buffer, errors, resid, pending_req);
++      /* pending_req must not be dereferenced past this point. */
++      scsiback_put(info);
++
++      __blk_put_request(req->q, req);
++}
++
++
++/*
++ * Map the data pages granted for one request and build the scatterlist
++ * that describes them.
++ *
++ * @ring_req:    request as read from the ring (supplies gref, offset and
++ *               length of every segment)
++ * @pending_req: backend request; nr_segments and sc_data_direction must
++ *               already be filled in. On success sgl and request_bufflen
++ *               are set.
++ *
++ * Returns 0 on success (including the no-segment case) and -ENOMEM when
++ * the scatterlist cannot be allocated or any segment fails to map or
++ * validate. In the latter case everything mapped so far is unmapped again
++ * before returning.
++ */
++static int scsiback_gnttab_data_map(vscsiif_request_t *ring_req,
++                                      pending_req_t *pending_req)
++{
++      u32 flags;
++      int write;
++      int i, err = 0;
++      unsigned int data_len = 0;
++      struct gnttab_map_grant_ref map[VSCSIIF_SG_TABLESIZE];
++      struct vscsibk_info *info   = pending_req->info;
++
++      int data_dir = (int)pending_req->sc_data_direction;
++      unsigned int nr_segments = (unsigned int)pending_req->nr_segments;
++
++      write = (data_dir == DMA_TO_DEVICE);
++
++      if (nr_segments) {
++              struct scatterlist *sg;
++
++              /* The scatterlist is freed in scsiback_fast_flush_area(). */
++              pending_req->sgl = kmalloc(sizeof(struct scatterlist) * nr_segments,
++                                              GFP_KERNEL);
++              if (!pending_req->sgl) {
++                      pr_err("scsiback: %s: kmalloc() error\n", __FUNCTION__);
++                      return -ENOMEM;
++              }
++
++              sg_init_table(pending_req->sgl, nr_segments);
++
++              /* Pages of a DMA_TO_DEVICE request are mapped read-only. */
++              flags = GNTMAP_host_map;
++              if (write)
++                      flags |= GNTMAP_readonly;
++
++              /* Build one map operation per segment, then map them all. */
++              for (i = 0; i < nr_segments; i++)
++                      gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
++                                              ring_req->seg[i].gref,
++                                              info->domid);
++
++              err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nr_segments);
++              BUG_ON(err);
++
++              for_each_sg (pending_req->sgl, sg, nr_segments, i) {
++                      struct page *pg;
++
++                      /* Retry maps with GNTST_eagain */
++                      if (unlikely(map[i].status == GNTST_eagain))
++                              gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
++                      if (unlikely(map[i].status != GNTST_okay)) {
++                              pr_err("scsiback: invalid buffer -- could not remap it\n");
++                              map[i].handle = SCSIBACK_INVALID_HANDLE;
++                              err |= 1;
++                      }
++
++                      /*
++                       * Always record the handle (or the invalid marker) so
++                       * that the flush on the failure path knows exactly
++                       * which segments are mapped.
++                       */
++                      pending_handle(pending_req, i) = map[i].handle;
++
++                      /*
++                       * After the first failure only the handles of the
++                       * remaining segments are recorded; no further
++                       * scatterlist setup is done.
++                       */
++                      if (err)
++                              continue;
++
++                      pg = pending_pages[vaddr_pagenr(pending_req, i)];
++
++                      set_phys_to_machine(page_to_pfn(pg),
++                              FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
++
++                      sg_set_page(sg, pg, ring_req->seg[i].length,
++                                  ring_req->seg[i].offset);
++                      data_len += sg->length;
++
++                      /*
++                       * Validate the values copied into sg rather than
++                       * re-reading ring_req (presumably because ring_req
++                       * is shared with the frontend -- TODO confirm).
++                       * A segment must lie within a single page.
++                       */
++                      barrier();
++                      if (sg->offset >= PAGE_SIZE ||
++                          sg->length > PAGE_SIZE ||
++                          sg->offset + sg->length > PAGE_SIZE)
++                              err |= 1;
++
++              }
++
++              if (err)
++                      goto fail_flush;
++      }
++      
++      pending_req->request_bufflen = data_len;
++      
++      return 0;
++      
++fail_flush:
++      /* Unmaps what was mapped and frees pending_req->sgl. */
++      scsiback_fast_flush_area(pending_req);
++      return -ENOMEM;
++}
++
++/*
++ * bi_end_io callback installed on the bios built by request_map_sg():
++ * drops a reference on the bio. @error is ignored.
++ * (Per the original note, taken from scsi_bi_endio in scsi_lib.c.)
++ */
++static void scsiback_bi_endio(struct bio *bio, int error)
++{
++      bio_put(bio);
++}
++
++
++
++/*
++ * Build a chain of bios covering the scatterlist of @pending_req.
++ * (Per the original note, derived from scsi_req_map_sg in scsi_lib.c.)
++ *
++ * Every bio is linked into the chain (bio_first .. bio_last, via bi_next)
++ * as soon as it is allocated, so on failure the chain walk at free_bios
++ * is the single place where bios are released.
++ *
++ * Returns the first bio of the chain (NULL if the scatterlist yields no
++ * data), ERR_PTR(-ENOMEM) if a bio cannot be allocated, or
++ * ERR_PTR(-EINVAL) if a page cannot be added to a bio.
++ */
++static struct bio *request_map_sg(pending_req_t *pending_req)
++{
++      struct request_queue *q = pending_req->sdev->request_queue;
++      unsigned int nsegs = (unsigned int)pending_req->nr_segments;
++      unsigned int i, len, bytes, off, nr_pages, nr_vecs = 0;
++      struct scatterlist *sg;
++      struct page *page;
++      struct bio *bio = NULL, *bio_first = NULL, *bio_last = NULL;
++      int err;
++
++      for_each_sg (pending_req->sgl, sg, nsegs, i) {
++              page = sg_page(sg);
++              off = sg->offset;
++              len = sg->length;
++
++              nr_pages = (len + off + PAGE_SIZE - 1) >> PAGE_SHIFT;
++              while (len > 0) {
++                      bytes = min_t(unsigned int, len, PAGE_SIZE - off);
++
++                      if (!bio) {
++                              nr_vecs = min_t(unsigned int, BIO_MAX_PAGES,
++                                              nr_pages);
++                              nr_pages -= nr_vecs;
++                              bio = bio_alloc(GFP_KERNEL, nr_vecs);
++                              if (!bio) {
++                                      err = -ENOMEM;
++                                      goto free_bios;
++                              }
++                              bio->bi_end_io = scsiback_bi_endio;
++                              if (bio_last)
++                                      bio_last->bi_next = bio;
++                              else
++                                      bio_first = bio;
++                              bio_last = bio;
++                      }
++
++                      if (bio_add_pc_page(q, bio, page, bytes, off) !=
++                                              bytes) {
++                              /*
++                               * Do not bio_put() here: this bio is already
++                               * on the chain and is released by the walk
++                               * below. Dropping it here as well would free
++                               * it twice.
++                               */
++                              err = -EINVAL;
++                              goto free_bios;
++                      }
++
++                      /* The bio is full: finalize it and start a new one. */
++                      if (bio->bi_vcnt >= nr_vecs) {
++                              bio->bi_flags &= ~(1 << BIO_SEG_VALID);
++                              /* sc_data_direction holds a DMA direction. */
++                              if (pending_req->sc_data_direction ==
++                                              DMA_TO_DEVICE)
++                                      bio->bi_rw |= REQ_WRITE;
++                              bio = NULL;
++                      }
++
++                      page++;
++                      len -= bytes;
++                      off = 0;
++              }
++      }
++
++      return bio_first;
++
++free_bios:
++      while ((bio = bio_first) != NULL) {
++              bio_first = bio->bi_next;
++              bio_put(bio);
++      }
++
++      return ERR_PTR(err);
++}
++
++
++/*
++ * Submit the SCSI command of @pending_req to the physical device.
++ *
++ * Builds a block-layer request (with a bio chain when the request has
++ * data segments), takes a reference on the interface and queues the
++ * request asynchronously; scsiback_cmd_done() completes it.
++ *
++ * If the request cannot be built, the data pages are unmapped and an
++ * error response (DRIVER_ERROR) is sent immediately, which also returns
++ * @pending_req to the free pool. In every case the caller must not touch
++ * @pending_req after this function returns.
++ */
++void scsiback_cmd_exec(pending_req_t *pending_req)
++{
++      int cmd_len  = (int)pending_req->cmd_len;
++      int data_dir = (int)pending_req->sc_data_direction;
++      unsigned int timeout;
++      struct request *rq;
++      int write;
++
++      DPRINTK("%s\n",__FUNCTION__);
++
++      /*
++       * Use the per-command timeout from the request when one is given,
++       * otherwise the interface default. (The original note says this is
++       * so the backend does not time out earlier than the frontend.)
++       */
++      if (pending_req->timeout_per_command)
++              timeout = pending_req->timeout_per_command * HZ;
++      else
++              timeout = VSCSIIF_TIMEOUT;
++
++      write = (data_dir == DMA_TO_DEVICE);
++      if (pending_req->nr_segments) {
++              struct bio *bio = request_map_sg(pending_req);
++
++              if (IS_ERR(bio)) {
++                      pr_err("scsiback: SG Request Map Error\n");
++                      goto fail;
++              }
++
++              rq = blk_make_request(pending_req->sdev->request_queue, bio,
++                                    GFP_KERNEL);
++              if (IS_ERR(rq)) {
++                      /*
++                       * NOTE(review): the bio chain is not released on
++                       * this path -- check whether blk_make_request()
++                       * leaves ownership with the caller on failure.
++                       */
++                      pr_err("scsiback: Make Request Error\n");
++                      goto fail;
++              }
++
++              rq->buffer = NULL;
++      } else {
++              rq = blk_get_request(pending_req->sdev->request_queue, write,
++                                   GFP_KERNEL);
++              if (unlikely(!rq)) {
++                      pr_err("scsiback: Get Request Error\n");
++                      goto fail;
++              }
++      }
++
++      rq->cmd_type = REQ_TYPE_BLOCK_PC;
++      rq->cmd_len = cmd_len;
++      memcpy(rq->cmd, pending_req->cmnd, cmd_len);
++
++      memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE);
++      rq->sense       = pending_req->sense_buffer;
++      rq->sense_len = 0;
++
++      /* Retries are not allowed in the backend. */
++      rq->retries   = 0;
++      rq->timeout   = timeout;
++      rq->end_io_data = pending_req;
++
++      /* Reference dropped again in scsiback_cmd_done(). */
++      scsiback_get(pending_req->info);
++      blk_execute_rq_nowait(rq->q, NULL, rq, 1, scsiback_cmd_done);
++
++      return;
++
++fail:
++      /*
++       * Nothing was queued, so no completion will ever run for this
++       * request: unmap its data pages and answer the frontend here, the
++       * same way scsiback_do_cmd_fn() reports a rejected request.
++       */
++      scsiback_fast_flush_area(pending_req);
++      scsiback_do_resp_with_sense(NULL, (DRIVER_ERROR << 24), 0,
++                                  pending_req);
++}
++
++
++/*
++ * Handle a reset request: reset the device of @pending_req through
++ * scsi_reset_provider() and report that call's return value to the
++ * frontend as the result, without sense data. The interface is pinned
++ * for the duration of the call.
++ */
++static void scsiback_device_reset_exec(pending_req_t *pending_req)
++{
++      struct scsi_device *target = pending_req->sdev;
++      struct vscsibk_info *owner = pending_req->info;
++      int rc;
++
++      scsiback_get(owner);
++
++      rc = scsi_reset_provider(target, SCSI_TRY_RESET_DEVICE);
++      scsiback_do_resp_with_sense(NULL, rc, 0, pending_req);
++
++      scsiback_put(owner);
++}
++
++
++/*
++ * Interrupt handler for the interface's event channel. @dev_id is the
++ * struct vscsibk_info registered with the handler; the handler only flags
++ * pending work and wakes the waiter, then reports the IRQ as handled.
++ */
++irqreturn_t scsiback_intr(int irq, void *dev_id)
++{
++      scsiback_notify_work((struct vscsibk_info *)dev_id);
++      return IRQ_HANDLED;
++}
++
++/*
++ * Fill @pending_req from the ring request @ring_req and validate it.
++ *
++ * Copies the request id and action, translates the virtual
++ * channel/target/lun to a scsi_device, range-checks data direction,
++ * segment count and command length, copies the CDB and timeout, and maps
++ * the data pages.
++ *
++ * Each frontend-supplied value is first copied into pending_req and the
++ * copy is what gets checked, with a barrier() between the copy and the
++ * check.
++ *
++ * Returns 0 on success, -ENODEV if the translation finds no device, or
++ * -EINVAL for any invalid parameter or mapping failure.
++ */
++static int prepare_pending_reqs(struct vscsibk_info *info,
++              vscsiif_request_t *ring_req, pending_req_t *pending_req)
++{
++      struct scsi_device *sdev;
++      struct ids_tuple vir;
++      int err = -EINVAL;
++
++      DPRINTK("%s\n",__FUNCTION__);
++
++      pending_req->rqid       = ring_req->rqid;
++      pending_req->act        = ring_req->act;
++
++      pending_req->info       = info;
++
++      pending_req->v_chn = vir.chn = ring_req->channel;
++      pending_req->v_tgt = vir.tgt = ring_req->id;
++      vir.lun = ring_req->lun;
++
++      rmb();
++      /* Look up the device behind the virtual address. */
++      sdev = scsiback_do_translation(info, &vir);
++      if (!sdev) {
++              pending_req->sdev = NULL;
++              DPRINTK("scsiback: doesn't exist.\n");
++              err = -ENODEV;
++              goto invalid_value;
++      }
++      pending_req->sdev = sdev;
++
++      /* request range check from frontend */
++      pending_req->sc_data_direction = ring_req->sc_data_direction;
++      barrier();
++      if ((pending_req->sc_data_direction != DMA_BIDIRECTIONAL) &&
++              (pending_req->sc_data_direction != DMA_TO_DEVICE) &&
++              (pending_req->sc_data_direction != DMA_FROM_DEVICE) &&
++              (pending_req->sc_data_direction != DMA_NONE)) {
++              DPRINTK("scsiback: invalid parameter data_dir = %d\n",
++                      pending_req->sc_data_direction);
++              err = -EINVAL;
++              goto invalid_value;
++      }
++
++      /* Bounds the per-request arrays sized VSCSIIF_SG_TABLESIZE. */
++      pending_req->nr_segments = ring_req->nr_segments;
++      barrier();
++      if (pending_req->nr_segments > VSCSIIF_SG_TABLESIZE) {
++              DPRINTK("scsiback: invalid parameter nr_seg = %d\n",
++                      pending_req->nr_segments);
++              err = -EINVAL;
++              goto invalid_value;
++      }
++
++      /* Bounds the CDB copy below. */
++      pending_req->cmd_len = ring_req->cmd_len;
++      barrier();
++      if (pending_req->cmd_len > VSCSIIF_MAX_COMMAND_SIZE) {
++              DPRINTK("scsiback: invalid parameter cmd_len = %d\n",
++                      pending_req->cmd_len);
++              err = -EINVAL;
++              goto invalid_value;
++      }
++      memcpy(pending_req->cmnd, ring_req->cmnd, pending_req->cmd_len);
++      
++      pending_req->timeout_per_command = ring_req->timeout_per_command;
++
++      /* Any mapping failure is reported to the caller as -EINVAL. */
++      if(scsiback_gnttab_data_map(ring_req, pending_req)) {
++              DPRINTK("scsiback: invalid buffer\n");
++              err = -EINVAL;
++              goto invalid_value;
++      }
++
++      return 0;
++
++invalid_value:
++      return err;
++}
++
++
++/*
++ * Consume requests from the ring of @info and dispatch them.
++ *
++ * Processing stops when the consumer index reaches the producer index
++ * sampled at entry, when the ring reports a consumer overflow, or when no
++ * free pending_req_t is available.
++ *
++ * Returns non-zero if there is more work to do (the pool ran dry or the
++ * ring still holds unconsumed requests), 0 otherwise.
++ */
++static int scsiback_do_cmd_fn(struct vscsibk_info *info)
++{
++      struct vscsiif_back_ring *ring = &info->ring;
++      vscsiif_request_t  *ring_req;
++
++      pending_req_t *pending_req;
++      RING_IDX rc, rp;
++      int err, more_to_do = 0;
++
++      DPRINTK("%s\n",__FUNCTION__);
++
++      /* Snapshot the indices; requests are read after the barrier. */
++      rc = ring->req_cons;
++      rp = ring->sring->req_prod;
++      rmb();
++
++      while ((rc != rp)) {
++              if (RING_REQUEST_CONS_OVERFLOW(ring, rc))
++                      break;
++              pending_req = alloc_req(info);
++              if (NULL == pending_req) {
++                      /* Pool exhausted: ask to be called again later. */
++                      more_to_do = 1;
++                      break;
++              }
++
++              ring_req = RING_GET_REQUEST(ring, rc);
++              ring->req_cons = ++rc;
++
++              err = prepare_pending_reqs(info, ring_req,
++                                              pending_req);
++              /* A rejected request is answered right away. */
++              if (err == -EINVAL) {
++                      scsiback_do_resp_with_sense(NULL, (DRIVER_ERROR << 24),
++                              0, pending_req);
++                      continue;
++              } else if (err == -ENODEV) {
++                      scsiback_do_resp_with_sense(NULL, (DID_NO_CONNECT << 16),
++                              0, pending_req);
++                      continue;
++              }
++
++              if (pending_req->act == VSCSIIF_ACT_SCSI_CDB) {
++
++                      /* Host mode executes commands without emulation. */
++                      if (info->feature == VSCSI_TYPE_HOST)
++                              scsiback_cmd_exec(pending_req);
++                      else
++                              scsiback_req_emulation_or_cmdexec(pending_req);
++
++              } else if (pending_req->act == VSCSIIF_ACT_SCSI_RESET) {
++                      scsiback_device_reset_exec(pending_req);
++              } else {
++                      pr_err("scsiback: invalid parameter for request\n");
++                      scsiback_do_resp_with_sense(NULL, (DRIVER_ERROR << 24),
++                              0, pending_req);
++                      continue;
++              }
++      }
++
++      if (RING_HAS_UNCONSUMED_REQUESTS(ring))
++              more_to_do = 1;
++
++      /*
++       * Yield point. Note that it sits after the loop, so it is reached
++       * once per call rather than once per request.
++       */
++      cond_resched();
++
++      return more_to_do;
++}
++
++
++/*
++ * Worker loop for one interface; @data is its struct vscsibk_info.
++ *
++ * Until asked to stop, each iteration sleeps until work has been flagged
++ * for the interface and then until the pending_free pool is non-empty,
++ * before processing the ring.
++ *
++ * Always returns 0.
++ */
++int scsiback_schedule(void *data)
++{
++      struct vscsibk_info *info = (struct vscsibk_info *)data;
++
++      DPRINTK("%s\n",__FUNCTION__);
++
++      while (!kthread_should_stop()) {
++              wait_event_interruptible(
++                      info->wq,
++                      info->waiting_reqs || kthread_should_stop());
++              wait_event_interruptible(
++                      pending_free_wq,
++                      !list_empty(&pending_free) || kthread_should_stop());
++
++              /*
++               * Clear the flag before looking at the ring, so work flagged
++               * while the ring is being processed is not lost.
++               */
++              info->waiting_reqs = 0;
++              smp_mb();
++
++              if (scsiback_do_cmd_fn(info))
++                      info->waiting_reqs = 1;
++      }
++
++      return 0;
++}
++
++
++/*
++ * Module initialisation.
++ *
++ * Allocates the pending request array plus one page and one grant handle
++ * slot per possible segment, creates the interface cache, fills the free
++ * pool, registers with xenbus and initialises the emulation layer.
++ *
++ * Returns 0 on success, -ENODEV when not running on Xen, and -ENOMEM for
++ * every other failure.
++ *
++ * NOTE(review): failures of scsiback_interface_init() and
++ * scsiback_xenbus_init() are also reported as -ENOMEM with an "out of
++ * memory" message, and vscsiif_reqs is used without a range check --
++ * confirm whether that is intended.
++ */
++static int __init scsiback_init(void)
++{
++      int i, mmap_pages;
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      /* One page and one handle slot per segment of every request. */
++      mmap_pages = vscsiif_reqs * VSCSIIF_SG_TABLESIZE;
++
++      pending_reqs          = kzalloc(sizeof(pending_reqs[0]) *
++                                      vscsiif_reqs, GFP_KERNEL);
++      pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
++                                      mmap_pages, GFP_KERNEL);
++      pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);
++
++      /* All three are checked together; the cleanup frees whatever exists. */
++      if (!pending_reqs || !pending_grant_handles || !pending_pages)
++              goto out_of_memory;
++
++      for (i = 0; i < mmap_pages; i++)
++              pending_grant_handles[i] = SCSIBACK_INVALID_HANDLE;
++
++      if (scsiback_interface_init() < 0)
++              goto out_of_kmem;
++
++      INIT_LIST_HEAD(&pending_free);
++
++      /* Every request starts out on the free pool. */
++      for (i = 0; i < vscsiif_reqs; i++)
++              list_add_tail(&pending_reqs[i].free_list, &pending_free);
++
++      if (scsiback_xenbus_init())
++              goto out_of_xenbus;
++
++      scsiback_emulation_init();
++
++      return 0;
++
++out_of_xenbus:
++      scsiback_xenbus_unregister();
++out_of_kmem:
++      scsiback_interface_exit();
++out_of_memory:
++      kfree(pending_reqs);
++      kfree(pending_grant_handles);
++      free_empty_pages_and_pagevec(pending_pages, mmap_pages);
++      pr_err("scsiback: %s: out of memory\n", __FUNCTION__);
++      return -ENOMEM;
++}
++
++/*
++ * Module unload is compiled out: both the exit function and its
++ * module_exit() registration below sit inside "#if 0", so no exit
++ * handler is registered.
++ */
++#if 0
++static void __exit scsiback_exit(void)
++{
++      scsiback_xenbus_unregister();
++      scsiback_interface_exit();
++      kfree(pending_reqs);
++      kfree(pending_grant_handles);
++      free_empty_pages_and_pagevec(pending_pages, (vscsiif_reqs * VSCSIIF_SG_TABLESIZE));
++
++}
++#endif
++
++module_init(scsiback_init);
++
++#if 0
++module_exit(scsiback_exit);
++#endif
++
++MODULE_DESCRIPTION("Xen SCSI backend driver");
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/scsiback/translate.c

index 0000000,0000000..c82e5b8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsiback/translate.c
@@@ -1,0 -1,0 +1,168 @@@
++/*
++ * Xen SCSI backend driver
++ *
++ * Copyright (c) 2008, FUJITSU Limited
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/list.h>
++#include <linux/gfp.h>
++
++#include "common.h"
++
++/*
++ * Prepare the virtual-to-physical translation state of @info: the
++ * lock that guards the entry list is initialised and the list itself
++ * starts out empty.
++ */
++void scsiback_init_translation_table(struct vscsibk_info *info)
++{
++      struct list_head *entries = &info->v2p_entry_lists;
++
++      spin_lock_init(&info->v2p_lock);
++      INIT_LIST_HEAD(entries);
++}
++
++
++/*
++ * Register a new virtual ID -> physical device mapping.
++ *
++ * @info: backend instance whose translation list is extended
++ * @sdev: physical SCSI device stored in the new entry
++ * @v:    virtual ID; chn/tgt/lun are compared to detect duplicates
++ *
++ * Returns 0 on success, -EEXIST when an entry with the same virtual
++ * chn/tgt/lun already exists, or -ENOMEM when the entry cannot be
++ * allocated.  The list is searched and modified under info->v2p_lock.
++ */
++int scsiback_add_translation_entry(struct vscsibk_info *info,
++                      struct scsi_device *sdev, struct ids_tuple *v)
++{
++      struct list_head *list = &info->v2p_entry_lists;
++      struct v2p_entry *cur, *fresh;
++      unsigned long irqflags;
++      int ret = 0;
++
++      spin_lock_irqsave(&info->v2p_lock, irqflags);
++
++      /* Refuse a second assignment to an identical virtual ID. */
++      list_for_each_entry(cur, list, l) {
++              if (cur->v.chn != v->chn || cur->v.tgt != v->tgt ||
++                  cur->v.lun != v->lun)
++                      continue;
++
++              pr_warning("scsiback: Virtual ID is already used. "
++                         "Assignment was not performed.\n");
++              ret = -EEXIST;
++              goto unlock;
++      }
++
++      /* GFP_ATOMIC: allocated while the spinlock is held. */
++      fresh = kmalloc(sizeof(struct v2p_entry), GFP_ATOMIC);
++      if (fresh == NULL) {
++              pr_err("scsiback: %s: kmalloc() error\n", __FUNCTION__);
++              ret = -ENOMEM;
++              goto unlock;
++      }
++
++      fresh->v = *v;
++      fresh->sdev = sdev;
++      list_add_tail(&fresh->l, list);
++
++unlock:
++      spin_unlock_irqrestore(&info->v2p_lock, irqflags);
++      return ret;
++}
++
++
++/*
++ * Delete the translation entry whose virtual chn/tgt/lun match @v.
++ *
++ * The matching entry's device reference is dropped with
++ * scsi_device_put(), the entry is unlinked and freed, all under
++ * info->v2p_lock.
++ *
++ * Returns 0 when an entry was removed, 1 when no entry matched.
++ */
++int scsiback_del_translation_entry(struct vscsibk_info *info,
++                              struct ids_tuple *v)
++{
++      struct list_head *list = &info->v2p_entry_lists;
++      struct v2p_entry *cur;
++      unsigned long irqflags;
++      int ret = 1;
++
++      spin_lock_irqsave(&info->v2p_lock, irqflags);
++
++      list_for_each_entry(cur, list, l) {
++              if (cur->v.chn != v->chn || cur->v.tgt != v->tgt ||
++                  cur->v.lun != v->lun)
++                      continue;
++
++              /* Found it: release, unlink, free, and stop walking. */
++              scsi_device_put(cur->sdev);
++              list_del(&cur->l);
++              kfree(cur);
++              ret = 0;
++              break;
++      }
++
++      spin_unlock_irqrestore(&info->v2p_lock, irqflags);
++      return ret;
++}
++
++
++/*
++ * Translate a virtual ID to its physical device.
++ *
++ * Searches the translation list of @info, under info->v2p_lock, for an
++ * entry whose virtual chn/tgt/lun equal those of @v.
++ *
++ * Returns the scsi_device stored in the first matching entry, or NULL
++ * when there is none.  No additional reference is taken here.
++ */
++struct scsi_device *scsiback_do_translation(struct vscsibk_info *info,
++                      struct ids_tuple *v)
++{
++      struct list_head *list = &info->v2p_entry_lists;
++      struct scsi_device *found = NULL;
++      struct v2p_entry *cur;
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&info->v2p_lock, irqflags);
++      list_for_each_entry(cur, list, l) {
++              if (cur->v.chn != v->chn || cur->v.tgt != v->tgt ||
++                  cur->v.lun != v->lun)
++                      continue;
++
++              found = cur->sdev;
++              break;
++      }
++      spin_unlock_irqrestore(&info->v2p_lock, irqflags);
++
++      return found;
++}
++
++
++/*
++ * Release every translation entry of @info.
++ *
++ * Under info->v2p_lock the list is emptied from the front; each
++ * entry's device reference is dropped with scsi_device_put() and the
++ * entry is unlinked and freed.
++ */
++void scsiback_release_translation_entry(struct vscsibk_info *info)
++{
++      struct list_head *list = &info->v2p_entry_lists;
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&info->v2p_lock, irqflags);
++
++      while (!list_empty(list)) {
++              struct v2p_entry *victim =
++                      list_first_entry(list, struct v2p_entry, l);
++
++              scsi_device_put(victim->sdev);
++              list_del(&victim->l);
++              kfree(victim);
++      }
++
++      spin_unlock_irqrestore(&info->v2p_lock, irqflags);
++}
diff --cc drivers/xen/scsiback/xenbus.c

index 0000000,0000000..fc78dff

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsiback/xenbus.c
@@@ -1,0 -1,0 +1,379 @@@
++/*
++ * Xen SCSI backend driver
++ *
++ * Copyright (c) 2008, FUJITSU Limited
++ *
++ * Based on the blkback driver code.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include <scsi/scsi.h>
++#include <scsi/scsi_host.h>
++#include <scsi/scsi_device.h>
++
++#include "common.h"
++
++/*
++ * Per-device backend state, stored as the xenbus device's driver data
++ * by scsiback_probe() and freed by scsiback_remove().
++ */
++struct backend_info
++{
++      struct xenbus_device *dev;      /* owning xenbus device */
++      struct vscsibk_info *info;      /* interface state; NULL once released */
++};
++
++
++/*
++ * Build the name used for the backend kernel thread.
++ *
++ * @be:  backend whose xenbus nodename ("backend/vscsi/<domid>/<id>")
++ *       supplies the device id
++ * @buf: output buffer of at least TASK_COMM_LEN bytes; receives
++ *       "vscsi.<be->info->domid>.<id>"
++ *
++ * Returns 0 on success, or -EINVAL when the nodename does not contain
++ * both numeric components.  Without this check a malformed nodename
++ * would format an uninitialised id.
++ */
++static int __vscsiif_name(struct backend_info *be, char *buf)
++{
++      struct xenbus_device *dev = be->dev;
++      unsigned int domid, id;
++
++      /* The domid field is parsed only to reach the id that follows it. */
++      if (sscanf(dev->nodename, "backend/vscsi/%u/%u", &domid, &id) != 2)
++              return -EINVAL;
++
++      snprintf(buf, TASK_COMM_LEN, "vscsi.%u.%u", be->info->domid, id);
++
++      return 0;
++}
++
++/*
++ * Connect the backend to the frontend's shared ring and start the
++ * worker thread.
++ *
++ * Reads "ring-ref" and "event-channel" from the frontend's xenstore
++ * directory, hands them to scsiback_init_sring(), then starts a
++ * kthread running scsiback_schedule() under the name produced by
++ * __vscsiif_name().
++ *
++ * Returns 0 on success or a negative error code.  Failures are
++ * reported through xenbus_dev_fatal()/xenbus_dev_error(), except a
++ * scsiback_init_sring() failure, which is only returned.
++ *
++ * NOTE(review): a failure after scsiback_init_sring() succeeded does
++ * not undo the ring setup here; presumably scsiback_disconnect() or
++ * scsiback_free() take care of it later - confirm.
++ */
++static int scsiback_map(struct backend_info *be)
++{
++      struct xenbus_device *dev = be->dev;
++      unsigned long ring_ref;
++      unsigned int evtchn;
++      int err;
++      char name[TASK_COMM_LEN];
++
++      err = xenbus_gather(XBT_NIL, dev->otherend,
++                      "ring-ref", "%lu", &ring_ref,
++                      "event-channel", "%u", &evtchn, NULL);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "reading %s ring", dev->otherend);
++              return err;
++      }
++
++      err = scsiback_init_sring(be->info, ring_ref, evtchn);
++      if (err)
++              return err;
++
++      err = __vscsiif_name(be, name);
++      if (err) {
++              xenbus_dev_error(dev, err, "get scsiback dev name");
++              return err;
++      }
++
++      /*
++       * The third argument of kthread_run() is a printf-style format;
++       * pass the prepared name as data rather than as the format.
++       */
++      be->info->kthread = kthread_run(scsiback_schedule, be->info,
++                                      "%s", name);
++      if (IS_ERR(be->info->kthread)) {
++              err = PTR_ERR(be->info->kthread);
++              /* never leave an ERR_PTR behind in the kthread field */
++              be->info->kthread = NULL;
++              xenbus_dev_error(be->dev, err, "start vscsiif");
++              return err;
++      }
++
++      return 0;
++}
++
++
++/*
++ * Look up the physical SCSI device described by @phy
++ * (host:channel:target:lun).
++ *
++ * Returns the device found by scsi_device_lookup(), which the callers
++ * in this file release with scsi_device_put(), or NULL when either the
++ * host or the device does not exist.  The host reference is held only
++ * for the duration of the lookup.
++ */
++struct scsi_device *scsiback_get_scsi_device(struct ids_tuple *phy)
++{
++      struct Scsi_Host *shost;
++      struct scsi_device *sdev;
++
++      shost = scsi_host_lookup(phy->hst);
++      /*
++       * Reject NULL as well as ERR_PTR values: a NULL host would
++       * otherwise be handed to scsi_device_lookup() below.
++       */
++      if (!shost || IS_ERR(shost)) {
++              pr_err("scsiback: host%d doesn't exist\n", phy->hst);
++              return NULL;
++      }
++
++      sdev = scsi_device_lookup(shost, phy->chn, phy->tgt, phy->lun);
++
++      /* The host reference is not needed past this point. */
++      scsi_host_put(shost);
++
++      if (!sdev)
++              pr_err("scsiback: %d:%d:%d:%d doesn't exist\n",
++                     phy->hst, phy->chn, phy->tgt, phy->lun);
++
++      return sdev;
++}
++
++#define VSCSIBACK_OP_ADD_OR_DEL_LUN   1
++#define VSCSIBACK_OP_UPDATEDEV_STATE  2
++
++
++/*
++ * Walk every child of "<nodename>/vscsi-devs" in xenstore and act on
++ * its per-LUN "state" node according to @op.
++ *
++ * VSCSIBACK_OP_ADD_OR_DEL_LUN:
++ *   - state Initialising: look up the physical device named by
++ *     "p-dev", add a translation entry for "v-dev" and write
++ *     Initialised; any failure writes Closed instead.
++ *   - state Closing: delete the translation entry and, if one was
++ *     deleted, write Closed.
++ * VSCSIBACK_OP_UPDATEDEV_STATE:
++ *   - state Initialised: write Connected; if that write fails, delete
++ *     the translation entry and write Closed.
++ *
++ * Nothing is returned; a failing xenbus_directory() ends the function
++ * silently.
++ *
++ * NOTE(review): the xenbus_scanf() results are only tested with
++ * XENBUS_EXIST_ERR(); presumably other errors or partially parsed
++ * tuples leave device_state/phy/vir unset - confirm.
++ */
++static void scsiback_do_lun_hotplug(struct backend_info *be, int op)
++{
++      int i, err = 0;
++      struct ids_tuple phy, vir;
++      int device_state;
++      char str[64], state_str[64];
++      char **dir;
++      unsigned int dir_n = 0;
++      struct xenbus_device *dev = be->dev;
++      struct scsi_device *sdev;
++
++      dir = xenbus_directory(XBT_NIL, dev->nodename, "vscsi-devs", &dir_n);
++      if (IS_ERR(dir))
++              return;
++
++      for (i = 0; i < dir_n; i++) {
++              
++              /* read status */
++              snprintf(state_str, sizeof(state_str), "vscsi-devs/%s/state", dir[i]);
++              err = xenbus_scanf(XBT_NIL, dev->nodename, state_str, "%u",
++                      &device_state);
++              /* no state node: nothing to do for this entry */
++              if (XENBUS_EXIST_ERR(err))
++                      continue;
++
++              /* physical SCSI device */
++              snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", dir[i]);
++              err = xenbus_scanf(XBT_NIL, dev->nodename, str,
++                      "%u:%u:%u:%u", &phy.hst, &phy.chn, &phy.tgt, &phy.lun);
++              if (XENBUS_EXIST_ERR(err)) {
++                      /* entry without p-dev is marked Closed */
++                      xenbus_printf(XBT_NIL, dev->nodename, state_str,
++                                      "%d", XenbusStateClosed);
++                      continue;
++              }
++
++              /* virtual SCSI device */
++              snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", dir[i]);
++              err = xenbus_scanf(XBT_NIL, dev->nodename, str,
++                      "%u:%u:%u:%u", &vir.hst, &vir.chn, &vir.tgt, &vir.lun);
++              if (XENBUS_EXIST_ERR(err)) {
++                      /* entry without v-dev is marked Closed */
++                      xenbus_printf(XBT_NIL, dev->nodename, state_str,
++                                      "%d", XenbusStateClosed);
++                      continue;
++              }
++
++              switch (op) {
++              case VSCSIBACK_OP_ADD_OR_DEL_LUN:
++                      if (device_state == XenbusStateInitialising) {
++                              sdev = scsiback_get_scsi_device(&phy);
++                              if (!sdev)
++                                      xenbus_printf(XBT_NIL, dev->nodename, state_str, 
++                                                          "%d", XenbusStateClosed);
++                              else {
++                                      err = scsiback_add_translation_entry(be->info, sdev, &vir);
++                                      if (!err) {
++                                              if (xenbus_printf(XBT_NIL, dev->nodename, state_str, 
++                                                                  "%d", XenbusStateInitialised)) {
++                                                      pr_err("scsiback: xenbus_printf error %s\n",
++                                                             state_str);
++                                                      /* undo the add; this also drops the sdev reference */
++                                                      scsiback_del_translation_entry(be->info, &vir);
++                                              }
++                                      } else {
++                                              /* entry was not added, so release sdev here */
++                                              scsi_device_put(sdev);
++                                              xenbus_printf(XBT_NIL, dev->nodename, state_str, 
++                                                                  "%d", XenbusStateClosed);
++                                      }
++                              }
++                      }
++
++                      if (device_state == XenbusStateClosing) {
++                              /* Closed is written only if an entry was actually deleted */
++                              if (!scsiback_del_translation_entry(be->info, &vir)) {
++                                      if (xenbus_printf(XBT_NIL, dev->nodename, state_str, 
++                                                          "%d", XenbusStateClosed))
++                                              pr_err("scsiback: xenbus_printf error %s\n",
++                                                     state_str);
++                              }
++                      }
++                      break;
++
++              case VSCSIBACK_OP_UPDATEDEV_STATE:
++                      if (device_state == XenbusStateInitialised) {
++                              /* modify vscsi-devs/dev-x/state */
++                              if (xenbus_printf(XBT_NIL, dev->nodename, state_str, 
++                                                  "%d", XenbusStateConnected)) {
++                                      pr_err("scsiback: xenbus_printf error %s\n",
++                                             state_str);
++                                      scsiback_del_translation_entry(be->info, &vir);
++                                      xenbus_printf(XBT_NIL, dev->nodename, state_str, 
++                                                          "%d", XenbusStateClosed);
++                              }
++                      }
++                      break;
++              /* Further operations can be added here when needed. */
++              default:
++                      break;
++              }
++      }
++
++      /* the array returned by xenbus_directory() is released with one kfree() */
++      kfree(dir);
++      return ;
++}
++
++
++/*
++ * xenbus "otherend_changed" callback: react to a state change of the
++ * frontend.
++ *
++ * @dev:            backend xenbus device
++ * @frontend_state: new state reported for the frontend
++ *
++ * Any state not listed in the switch is reported with
++ * xenbus_dev_fatal().
++ */
++static void scsiback_frontend_changed(struct xenbus_device *dev,
++                                      enum xenbus_state frontend_state)
++{
++      struct backend_info *be = dev_get_drvdata(&dev->dev);
++      int err;
++
++      switch (frontend_state) {
++      case XenbusStateInitialising:
++              break;
++      case XenbusStateInitialised:
++              /* map the ring and start the worker; on failure stay put */
++              err = scsiback_map(be);
++              if (err)
++                      break;
++
++              scsiback_do_lun_hotplug(be, VSCSIBACK_OP_ADD_OR_DEL_LUN);
++              xenbus_switch_state(dev, XenbusStateConnected);
++
++              break;
++      case XenbusStateConnected:
++
++              scsiback_do_lun_hotplug(be, VSCSIBACK_OP_UPDATEDEV_STATE);
++
++              /* already Connected: skip the state switch */
++              if (dev->state == XenbusStateConnected)
++                      break;
++
++              xenbus_switch_state(dev, XenbusStateConnected);
++
++              break;
++
++      case XenbusStateClosing:
++              scsiback_disconnect(be->info);
++              xenbus_switch_state(dev, XenbusStateClosing);
++              break;
++
++      case XenbusStateClosed:
++              xenbus_switch_state(dev, XenbusStateClosed);
++              if (xenbus_dev_is_online(dev))
++                      break;
++              /* fall through if not online */
++      case XenbusStateUnknown:
++              device_unregister(&dev->dev);
++              break;
++
++      case XenbusStateReconfiguring:
++              /* LUN hot-plug/unplug requested while connected */
++              scsiback_do_lun_hotplug(be, VSCSIBACK_OP_ADD_OR_DEL_LUN);
++
++              xenbus_switch_state(dev, XenbusStateReconfigured);
++
++              break;
++
++      default:
++              xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++                                      frontend_state);
++              break;
++      }
++}
++
++
++/*
++ * xenbus "remove" callback, also used by scsiback_probe() to unwind a
++ * failed probe.
++ *
++ * If interface state is attached it is disconnected, its translation
++ * entries are released and it is freed.  The backend_info itself is
++ * then freed and the device's driver data is cleared.  Always
++ * returns 0.
++ */
++static int scsiback_remove(struct xenbus_device *dev)
++{
++      struct backend_info *be = dev_get_drvdata(&dev->dev);
++      struct vscsibk_info *info = be->info;
++
++      if (info != NULL) {
++              scsiback_disconnect(info);
++              scsiback_release_translation_entry(info);
++              scsiback_free(info);
++              be->info = NULL;
++      }
++
++      kfree(be);
++      dev_set_drvdata(&dev->dev, NULL);
++
++      return 0;
++}
++
++
++/*
++ * xenbus "probe" callback: set up backend state for a new vscsi
++ * device.
++ *
++ * Allocates the backend_info, attaches it as driver data, allocates
++ * the interface state for the frontend domain, initialises the
++ * translation table, reads the optional "feature-host" key and
++ * switches the device to XenbusStateInitWait.
++ *
++ * Returns 0 on success.  On failure returns -ENOMEM (allocation of
++ * the backend_info) or the error from vscsibk_info_alloc() /
++ * xenbus_switch_state(), after undoing the setup via
++ * scsiback_remove().
++ */
++static int scsiback_probe(struct xenbus_device *dev,
++                         const struct xenbus_device_id *id)
++{
++      int err;
++      unsigned val = 0;
++
++      struct backend_info *be = kzalloc(sizeof(struct backend_info),
++                                        GFP_KERNEL);
++
++      DPRINTK("%p %d\n", dev, dev->otherend_id);
++
++      if (!be) {
++              xenbus_dev_fatal(dev, -ENOMEM,
++                               "allocating backend structure");
++              return -ENOMEM;
++      }
++      be->dev = dev;
++      dev_set_drvdata(&dev->dev, be);
++
++      be->info = vscsibk_info_alloc(dev->otherend_id);
++      if (IS_ERR(be->info)) {
++              err = PTR_ERR(be->info);
++              /* scsiback_remove() must not see the ERR_PTR */
++              be->info = NULL;
++              xenbus_dev_fatal(dev, err, "creating scsihost interface");
++              goto fail;
++      }
++
++      be->info->dev = dev;
++      be->info->irq = 0;
++      be->info->feature = 0;  /*default not HOSTMODE.*/
++
++      scsiback_init_translation_table(be->info);
++
++      /* val is unsigned, so scan it with the matching %u conversion */
++      err = xenbus_scanf(XBT_NIL, dev->nodename,
++                              "feature-host", "%u", &val);
++      if (XENBUS_EXIST_ERR(err))
++              val = 0;
++
++      if (val)
++              be->info->feature = VSCSI_TYPE_HOST;
++
++      err = xenbus_switch_state(dev, XenbusStateInitWait);
++      if (err)
++              goto fail;
++
++      return 0;
++
++
++fail:
++      pr_warning("scsiback: %s failed\n",__FUNCTION__);
++      scsiback_remove(dev);
++
++      return err;
++}
++
++
++/* Device types handled by this driver; the empty string ends the table. */
++static const struct xenbus_device_id scsiback_ids[] = {
++      { "vscsi" },
++      { "" }
++};
++
++/* xenbus driver description tying the callbacks above to "vscsi" devices. */
++static struct xenbus_driver scsiback = {
++      .name                   = "vscsi",
++      .ids                    = scsiback_ids,
++      .probe                  = scsiback_probe,
++      .remove                 = scsiback_remove,
++      .otherend_changed       = scsiback_frontend_changed
++};
++
++/* Register the vscsi backend driver; returns xenbus_register_backend()'s result. */
++int scsiback_xenbus_init(void)
++{
++      return xenbus_register_backend(&scsiback);
++}
++
++/* Unregister the vscsi backend driver from xenbus. */
++void scsiback_xenbus_unregister(void)
++{
++      xenbus_unregister_driver(&scsiback);
++}
diff --cc drivers/xen/scsifront/Makefile

index 0000000,0000000..58ee185

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsifront/Makefile
@@@ -1,0 -1,0 +1,3 @@@
++
++obj-$(CONFIG_XEN_SCSI_FRONTEND)       := xenscsi.o
++xenscsi-objs := scsifront.o xenbus.o
diff --cc drivers/xen/scsifront/common.h

index 0000000,0000000..1d42481

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsifront/common.h
@@@ -1,0 -1,0 +1,135 @@@
++/*
++ * Xen SCSI frontend driver
++ *
++ * Copyright (c) 2008, FUJITSU Limited
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_DRIVERS_SCSIFRONT_H__
++#define __XEN_DRIVERS_SCSIFRONT_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/device.h>
++#include <linux/kthread.h>
++#include <linux/wait.h>
++#include <linux/interrupt.h>
++#include <linux/spinlock.h>
++#include <linux/sched.h>
++#include <linux/blkdev.h>
++#include <scsi/scsi_cmnd.h>
++#include <scsi/scsi_device.h>
++#include <scsi/scsi.h>
++#include <scsi/scsi_host.h>
++#include <xen/xenbus.h>
++#include <xen/gnttab.h>
++#include <xen/evtchn.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/io/ring.h>
++#include <xen/interface/io/vscsiif.h>
++#include <xen/interface/grant_table.h>
++#include <xen/interface/io/protocols.h>
++#include <asm/delay.h>
++#include <asm/hypervisor.h>
++#include <asm/maddr.h>
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++#define GRANT_INVALID_REF     0
++#define VSCSI_IN_ABORT                1
++#define VSCSI_IN_RESET                2
++
++/* tuning point*/
++#define VSCSIIF_DEFAULT_CMD_PER_LUN 10
++#define VSCSIIF_MAX_TARGET          64
++#define VSCSIIF_MAX_LUN             255
++
++#define VSCSIIF_RING_SIZE     __CONST_RING_SIZE(vscsiif, PAGE_SIZE)
++#define VSCSIIF_MAX_REQS      VSCSIIF_RING_SIZE
++
++/*
++ * Frontend-private bookkeeping for one request slot, indexed by the
++ * request id (rqid) placed on the ring.
++ */
++struct vscsifrnt_shadow {
++      /* index of the next free slot while this slot is on the free list */
++      uint16_t next_free;
++      
++      /* command between backend and frontend:
++       * VSCSIIF_ACT_SCSI_CDB or VSCSIIF_ACT_SCSI_RESET */
++      unsigned char act;
++      
++      /* state used by the reset path */
++      wait_queue_head_t wq_reset;     /* woken when the response arrives */
++      int wait_reset;                 /* wake-up condition for wq_reset  */
++      int32_t rslt_reset;             /* reset response status           */
++                                      /* (SUCCESS or FAILED)             */
++
++      /* for DMA_TO_DEVICE(1), DMA_FROM_DEVICE(2), DMA_NONE(3) 
++         requests */
++      unsigned int sc_data_direction;
++      
++      /* Number of scatter-gather segments in use */
++      unsigned int nr_segments;
++
++      /* the struct scsi_cmnd pointer of the request, stored as an integer */
++      unsigned long req_scsi_cmnd;
++      /* grant references of the segments; the first nr_segments are valid */
++      int gref[VSCSIIF_SG_TABLESIZE];
++};
++
++/* Per-device state of the SCSI frontend. */
++struct vscsifrnt_info {
++      struct xenbus_device *dev;
++
++      struct Scsi_Host *host;
++
++      spinlock_t io_lock;             /* held while responses are consumed */
++      spinlock_t shadow_lock;         /* guards the shadow free list */
++      unsigned int evtchn;
++      unsigned int irq;               /* used to notify the backend */
++
++      grant_ref_t ring_ref;
++      struct vscsiif_front_ring ring;
++      struct vscsiif_response ring_res;
++
++      struct vscsifrnt_shadow shadow[VSCSIIF_MAX_REQS];
++      uint32_t shadow_free;           /* head index of the shadow free list */
++
++      struct task_struct *kthread;
++      wait_queue_head_t wq;           /* scsifront_schedule() sleeps here */
++      unsigned int waiting_resp;      /* set when response work is pending */
++
++};
++
++/* pr_debug() wrapper that prefixes the message with file name and line number. */
++#define DPRINTK(_f, _a...)                            \
++      pr_debug("(file=%s, line=%d) " _f,      \
++               __FILE__ , __LINE__ , ## _a )
++
++int scsifront_xenbus_init(void);
++void scsifront_xenbus_unregister(void);
++int scsifront_schedule(void *data);
++irqreturn_t scsifront_intr(int irq, void *dev_id);
++int scsifront_cmd_done(struct vscsifrnt_info *info);
++
++
++#endif /* __XEN_DRIVERS_SCSIFRONT_H__  */
diff --cc drivers/xen/scsifront/scsifront.c

index 0000000,0000000..a850c4e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsifront/scsifront.c
@@@ -1,0 -1,0 +1,477 @@@
++/*
++ * Xen SCSI frontend driver
++ *
++ * Copyright (c) 2008, FUJITSU Limited
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++ 
++
++#include <linux/version.h>
++#include "common.h"
++
++/*
++ * Take one request slot off the shadow free list.
++ *
++ * Under info->shadow_lock the head of the free list is removed, its
++ * link is overwritten with 0x0fff (presumably an out-of-range marker
++ * for "in use" - confirm) and its wait_reset flag is cleared.
++ *
++ * Returns the index of the slot.  The index must be a valid subscript
++ * of info->shadow[], which has VSCSIIF_MAX_REQS elements, so anything
++ * at or beyond VSCSIIF_MAX_REQS is a bug.
++ */
++static int get_id_from_freelist(struct vscsifrnt_info *info)
++{
++      unsigned long flags;
++      uint32_t free;
++
++      spin_lock_irqsave(&info->shadow_lock, flags);
++
++      free = info->shadow_free;
++      /* ">=": index VSCSIIF_MAX_REQS itself is already out of bounds */
++      BUG_ON(free >= VSCSIIF_MAX_REQS);
++      info->shadow_free = info->shadow[free].next_free;
++      info->shadow[free].next_free = 0x0fff;
++
++      info->shadow[free].wait_reset = 0;
++
++      spin_unlock_irqrestore(&info->shadow_lock, flags);
++
++      return free;
++}
++
++/*
++ * Return request slot @id to the shadow free list.
++ *
++ * Under info->shadow_lock the slot is linked in front of the current
++ * head, its stored scsi_cmnd value is cleared and it becomes the new
++ * head.
++ */
++static void add_id_to_freelist(struct vscsifrnt_info *info, uint32_t id)
++{
++      struct vscsifrnt_shadow *slot = &info->shadow[id];
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&info->shadow_lock, irqflags);
++
++      slot->next_free = info->shadow_free;
++      slot->req_scsi_cmnd = 0;
++      info->shadow_free = id;
++
++      spin_unlock_irqrestore(&info->shadow_lock, irqflags);
++}
++
++
++/*
++ * Reserve the next request on the front ring and give it a request id.
++ *
++ * The private producer index is advanced by one and a shadow slot is
++ * taken from the free list; its index is stored in the request's rqid
++ * so the response can be matched to it.
++ *
++ * Returns the reserved ring request.
++ */
++struct vscsiif_request * scsifront_pre_request(struct vscsifrnt_info *info)
++{
++      struct vscsiif_front_ring *front = &info->ring;
++      RING_IDX slot = front->req_prod_pvt;
++      vscsiif_request_t *req;
++
++      req = RING_GET_REQUEST(front, slot);
++      front->req_prod_pvt = slot + 1;
++
++      /* the id comes back in the response */
++      req->rqid = (uint16_t)get_id_from_freelist(info);
++
++      return req;
++}
++
++
++/*
++ * Flag that response work is pending and wake whoever sleeps on
++ * info->wq (scsifront_schedule() waits on it).
++ */
++static void scsifront_notify_work(struct vscsifrnt_info *info)
++{
++      info->waiting_resp = 1;
++      wake_up(&info->wq);
++}
++
++
++/*
++ * Publish the privately queued requests on the shared ring and notify
++ * the backend through info->irq if the ring macro says a notification
++ * is required.
++ */
++static void scsifront_do_request(struct vscsifrnt_info *info)
++{
++      struct vscsiif_front_ring *front = &info->ring;
++      int need_kick;
++
++      RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(front, need_kick);
++      if (!need_kick)
++              return;
++
++      notify_remote_via_irq(info->irq);
++}
++
++/*
++ * Interrupt handler: @dev_id is the struct vscsifrnt_info of the
++ * device.  All work is deferred via scsifront_notify_work(); always
++ * returns IRQ_HANDLED.
++ */
++irqreturn_t scsifront_intr(int irq, void *dev_id)
++{
++      struct vscsifrnt_info *info = dev_id;
++
++      scsifront_notify_work(info);
++
++      return IRQ_HANDLED;
++}
++
++
++/*
++ * End foreign access for every grant reference recorded in shadow
++ * slot @s.
++ *
++ * Slots whose data direction is DMA_NONE are left alone.  A grant the
++ * backend still has in use is treated as fatal (BUG).  @id is not
++ * used.
++ */
++static void scsifront_gnttab_done(struct vscsifrnt_shadow *s, uint32_t id)
++{
++      unsigned int seg;
++
++      if (s->sc_data_direction == DMA_NONE)
++              return;
++
++      for (seg = 0; seg < s->nr_segments; seg++) {
++              if (unlikely(gnttab_query_foreign_access(s->gref[seg]) != 0)) {
++                      pr_alert("scsifront: "
++                               "grant still in use by backend\n");
++                      BUG();
++              }
++              gnttab_end_foreign_access(s->gref[seg], 0UL);
++      }
++}
++
++
++/*
++ * Complete the SCSI command that response @ring_res belongs to.
++ *
++ * The command is found through the shadow slot named by the response's
++ * rqid (a slot without a command is a BUG).  The slot's grants are
++ * ended and the slot is returned to the free list.  Result, residual
++ * length and at most VSCSIIF_SENSE_BUFFERSIZE bytes of sense data are
++ * then copied into the command before its scsi_done() callback runs.
++ */
++static void scsifront_cdb_cmd_done(struct vscsifrnt_info *info,
++                     vscsiif_response_t *ring_res)
++{
++      uint32_t id = ring_res->rqid;
++      struct vscsifrnt_shadow *slot = &info->shadow[id];
++      struct scsi_cmnd *sc = (struct scsi_cmnd *)slot->req_scsi_cmnd;
++      uint8_t sense_len;
++
++      if (!sc)
++              BUG();
++
++      scsifront_gnttab_done(slot, id);
++      add_id_to_freelist(info, id);
++
++      sc->result = ring_res->rslt;
++      scsi_set_resid(sc, ring_res->residual_len);
++
++      /* never copy more sense data than the buffer size allows */
++      sense_len = (ring_res->sense_len > VSCSIIF_SENSE_BUFFERSIZE) ?
++                      VSCSIIF_SENSE_BUFFERSIZE : ring_res->sense_len;
++
++      if (sense_len != 0)
++              memcpy(sc->sense_buffer, ring_res->sense_buffer, sense_len);
++
++      sc->scsi_done(sc);
++}
++
++
++/*
++ * Handle the response to a request that is not VSCSIIF_ACT_SCSI_CDB.
++ *
++ * Under info->shadow_lock the slot named by the response's rqid gets
++ * its wake-up condition set and the result recorded; the waiter on the
++ * slot's wq_reset queue is then woken.
++ */
++static void scsifront_sync_cmd_done(struct vscsifrnt_info *info,
++                              vscsiif_response_t *ring_res)
++{
++      uint16_t id = ring_res->rqid;
++      struct vscsifrnt_shadow *slot = &info->shadow[id];
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&info->shadow_lock, irqflags);
++      slot->wait_reset = 1;
++      slot->rslt_reset = ring_res->rslt;
++      spin_unlock_irqrestore(&info->shadow_lock, irqflags);
++
++      wake_up(&slot->wq_reset);
++}
++
++
++/*
++ * Consume all responses currently published on the ring.
++ *
++ * Each response is dispatched by the "act" recorded in its shadow
++ * slot: VSCSIIF_ACT_SCSI_CDB responses complete a SCSI command, all
++ * others go to scsifront_sync_cmd_done().  The whole pass runs under
++ * info->io_lock.
++ *
++ * Returns the value produced by RING_FINAL_CHECK_FOR_RESPONSES() when
++ * requests are still outstanding, otherwise 0; scsifront_schedule()
++ * uses a non-zero result to run another pass.
++ */
++int scsifront_cmd_done(struct vscsifrnt_info *info)
++{
++      vscsiif_response_t *ring_res;
++
++      RING_IDX i, rp;
++      int more_to_do = 0;
++      unsigned long flags;
++
++      spin_lock_irqsave(&info->io_lock, flags);
++
++      /* snapshot the producer index, then order later reads after it */
++      rp = info->ring.sring->rsp_prod;
++      rmb();
++      for (i = info->ring.rsp_cons; i != rp; i++) {
++              
++              ring_res = RING_GET_RESPONSE(&info->ring, i);
++
++              if (info->shadow[ring_res->rqid].act == VSCSIIF_ACT_SCSI_CDB)
++                      scsifront_cdb_cmd_done(info, ring_res);
++              else
++                      scsifront_sync_cmd_done(info, ring_res);
++      }
++
++      info->ring.rsp_cons = i;
++
++      /* requests still in flight: re-check; otherwise set rsp_event to i + 1 */
++      if (i != info->ring.req_prod_pvt) {
++              RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
++      } else {
++              info->ring.sring->rsp_event = i + 1;
++      }
++
++      spin_unlock_irqrestore(&info->io_lock, flags);
++
++
++      /* Yield point for this unbounded loop. */
++      cond_resched();
++
++      return more_to_do;
++}
++
++
++
++
++/*
++ * Thread function of the frontend worker; @data is the
++ * struct vscsifrnt_info of the device.
++ *
++ * Sleeps on info->wq until waiting_resp is set or the thread is asked
++ * to stop, then processes responses with scsifront_cmd_done().
++ * Returns 0 once kthread_should_stop() is true.
++ */
++int scsifront_schedule(void *data)
++{
++      struct vscsifrnt_info *info = (struct vscsifrnt_info *)data;
++
++      while (!kthread_should_stop()) {
++              wait_event_interruptible(
++                      info->wq,
++                      info->waiting_resp || kthread_should_stop());
++
++              /* clear the flag before processing responses */
++              info->waiting_resp = 0;
++              smp_mb();
++
++              /* more responses pending: skip the sleep on the next pass */
++              if (scsifront_cmd_done(info))
++                      info->waiting_resp = 1;
++      }
++
++      return 0;
++}
++
++
++
++/*
++ * Grant the backend access to the data pages of a SCSI command and describe
++ * them in the ring request.
++ *
++ * info:     frontend instance; the grant refs are also stored in shadow[id]
++ * sc:       SCSI command whose scatterlist is walked
++ * ring_req: ring request whose seg[] array is filled in
++ * id:       index of the shadow entry belonging to this request
++ *
++ * Returns the number of segments filled in (0 for DMA_NONE or an empty
++ * buffer), -ENOMEM when no grant references could be allocated, or -E2BIG
++ * when the buffer needs more than VSCSIIF_SG_TABLESIZE pages.
++ */
++static int map_data_for_request(struct vscsifrnt_info *info,
++              struct scsi_cmnd *sc, vscsiif_request_t *ring_req, uint32_t id)
++{
++      grant_ref_t gref_head;
++      struct page *page;
++      int err, ref, ref_cnt = 0;
++      int write = (sc->sc_data_direction == DMA_TO_DEVICE);
++      unsigned int i, nr_pages, off, len, bytes;
++      unsigned long buffer_pfn;
++
++      if (sc->sc_data_direction == DMA_NONE)
++              return 0;
++
++      /* Reserve the maximum number of references one request can use. */
++      err = gnttab_alloc_grant_references(VSCSIIF_SG_TABLESIZE, &gref_head);
++      if (err) {
++              pr_err("scsifront: gnttab_alloc_grant_references() error\n");
++              return -ENOMEM;
++      }
++
++      if (scsi_bufflen(sc)) {
++              /* quoted scsi_lib.c/scsi_req_map_sg . */
++              struct scatterlist *sg, *sgl = scsi_sglist(sc);
++              unsigned int data_len = scsi_bufflen(sc);
++
++              /* Page estimate uses only the first entry's offset. */
++              nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
++              if (nr_pages > VSCSIIF_SG_TABLESIZE) {
++                      pr_err("scsifront: Unable to map request_buffer for command!\n");
++                      ref_cnt = (-E2BIG);
++                      goto big_to_sg;
++              }
++
++              for_each_sg (sgl, sg, scsi_sg_count(sc), i) {
++                      page = sg_page(sg);
++                      off = sg->offset;
++                      len = sg->length;
++
++                      buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
++
++                      /* Split the entry into per-page segments. */
++                      while (len > 0 && data_len > 0) {
++                              /*
++                               * sg sends a scatterlist that is larger than
++                               * the data_len it wants transferred for certain
++                               * IO sizes
++                               */
++                              bytes = min_t(unsigned int, len, PAGE_SIZE - off);
++                              bytes = min(bytes, data_len);
++                              
++                              ref = gnttab_claim_grant_reference(&gref_head);
++                              BUG_ON(ref == -ENOSPC);
++
++                              /*
++                               * The last argument is 'write' (set for
++                               * DMA_TO_DEVICE); presumably the grant API's
++                               * read-only flag - confirm.
++                               */
++                              gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
++                                      buffer_pfn, write);
++
++                              info->shadow[id].gref[ref_cnt]  = ref;
++                              ring_req->seg[ref_cnt].gref     = ref;
++                              ring_req->seg[ref_cnt].offset   = (uint16_t)off;
++                              ring_req->seg[ref_cnt].length   = (uint16_t)bytes;
++
++                              /* Only the first page of an entry has an offset. */
++                              buffer_pfn++;
++                              len -= bytes;
++                              data_len -= bytes;
++                              off = 0;
++                              ref_cnt++;
++                      }
++              }
++      }
++
++big_to_sg:
++
++      /* Hand back whatever is left of the reserved reference batch. */
++      gnttab_free_grant_references(gref_head);
++
++      return ref_cnt;
++}
++
++/*
++ * queuecommand entry point of the host template: turn a SCSI command into a
++ * VSCSIIF_ACT_SCSI_CDB ring request and submit it to the backend.
++ *
++ * Returns SCSI_MLQUEUE_HOST_BUSY when the ring is full or no grant
++ * references are available, otherwise 0.  For any other mapping failure the
++ * command is completed here with DID_ERROR and 0 is returned.
++ */
++static int scsifront_queuecommand(struct Scsi_Host *shost,
++                                struct scsi_cmnd *sc)
++{
++      struct vscsifrnt_info *info = shost_priv(shost);
++      vscsiif_request_t *ring_req;
++      unsigned long flags;
++      int ref_cnt;
++      uint16_t rqid;
++
++/* debug printk to identify more missing scsi commands
++      printk(KERN_INFO "scsicmd: len=%i, 0x%x,0x%x,0x%x,0x%x,0x%x,0x%x,0x%x,0x%x,0x%x,0x%x",sc->cmd_len,
++              sc->cmnd[0],sc->cmnd[1],sc->cmnd[2],sc->cmnd[3],sc->cmnd[4],
++              sc->cmnd[5],sc->cmnd[6],sc->cmnd[7],sc->cmnd[8],sc->cmnd[9]);
++*/
++      spin_lock_irqsave(shost->host_lock, flags);
++      if (RING_FULL(&info->ring)) {
++              spin_unlock_irqrestore(shost->host_lock, flags);
++              return SCSI_MLQUEUE_HOST_BUSY;
++      }
++
++      sc->result    = 0;
++
++      /* Obtain a ring request; its rqid also indexes info->shadow[]. */
++      ring_req          = scsifront_pre_request(info);
++      rqid              = ring_req->rqid;
++      ring_req->act     = VSCSIIF_ACT_SCSI_CDB;
++
++      ring_req->id      = sc->device->id;
++      ring_req->lun     = sc->device->lun;
++      ring_req->channel = sc->device->channel;
++      ring_req->cmd_len = sc->cmd_len;
++
++      /* The CDB must fit into the fixed-size cmnd[] of the request. */
++      BUG_ON(sc->cmd_len > VSCSIIF_MAX_COMMAND_SIZE);
++
++      if ( sc->cmd_len )
++              memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len);
++      else
++              memset(ring_req->cmnd, 0, VSCSIIF_MAX_COMMAND_SIZE);
++
++      ring_req->sc_data_direction   = (uint8_t)sc->sc_data_direction;
++      /* The request timeout is divided by HZ before being passed on. */
++      ring_req->timeout_per_command = (sc->request->timeout / HZ);
++
++      /* Record what the completion path needs to find the command again. */
++      info->shadow[rqid].req_scsi_cmnd     = (unsigned long)sc;
++      info->shadow[rqid].sc_data_direction = sc->sc_data_direction;
++      info->shadow[rqid].act               = ring_req->act;
++
++      ref_cnt = map_data_for_request(info, sc, ring_req, rqid);
++      if (ref_cnt < 0) {
++              /*
++               * NOTE(review): only the shadow id is released here; whether
++               * the slot taken by scsifront_pre_request() also needs to be
++               * given back cannot be told from this function - confirm.
++               */
++              add_id_to_freelist(info, rqid);
++              spin_unlock_irqrestore(shost->host_lock, flags);
++              if (ref_cnt == (-ENOMEM))
++                      return SCSI_MLQUEUE_HOST_BUSY;
++              sc->result = (DID_ERROR << 16);
++              sc->scsi_done(sc);
++              return 0;
++      }
++
++      ring_req->nr_segments          = (uint8_t)ref_cnt;
++      info->shadow[rqid].nr_segments = ref_cnt;
++
++      scsifront_do_request(info);
++      spin_unlock_irqrestore(shost->host_lock, flags);
++
++      return 0;
++}
++
++
++/* Abort handler of the host template: every abort request yields FAILED. */
++static int scsifront_eh_abort_handler(struct scsi_cmnd *sc)
++{
++      return FAILED;
++}
++
++/*
++ * vscsi only supports device reset, since it works per LUN.
++ *
++ * Sends a VSCSIIF_ACT_SCSI_RESET request for the device of sc, sleeps until
++ * the completion path sets shadow[rqid].wait_reset, and returns the result
++ * stored in shadow[rqid].rslt_reset.
++ */
++static int scsifront_dev_reset_handler(struct scsi_cmnd *sc)
++{
++      struct Scsi_Host *host = sc->device->host;
++      struct vscsifrnt_info *info = shost_priv(host);
++
++      vscsiif_request_t *ring_req;
++      uint16_t rqid;
++      int err;
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)
++      spin_lock_irq(host->host_lock);
++#endif
++
++      /* NOTE(review): unlike queuecommand, no RING_FULL() check is made. */
++      ring_req      = scsifront_pre_request(info);
++      ring_req->act = VSCSIIF_ACT_SCSI_RESET;
++
++      rqid          = ring_req->rqid;
++      info->shadow[rqid].act = VSCSIIF_ACT_SCSI_RESET;
++
++      ring_req->channel = sc->device->channel;
++      ring_req->id      = sc->device->id;
++      ring_req->lun     = sc->device->lun;
++      ring_req->cmd_len = sc->cmd_len;
++
++      if ( sc->cmd_len )
++              memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len);
++      else
++              memset(ring_req->cmnd, 0, VSCSIIF_MAX_COMMAND_SIZE);
++
++      ring_req->sc_data_direction   = (uint8_t)sc->sc_data_direction;
++      ring_req->timeout_per_command = (sc->request->timeout / HZ);
++      /* A reset carries no data segments. */
++      ring_req->nr_segments         = 0;
++
++      scsifront_do_request(info);     
++
++      /*
++       * Drop the host lock while sleeping.
++       * NOTE(review): the return value of wait_event_interruptible() is
++       * ignored, so an interrupted wait still reads rslt_reset and frees
++       * the id below - confirm whether that can happen in this context.
++       */
++      spin_unlock_irq(host->host_lock);
++      wait_event_interruptible(info->shadow[rqid].wq_reset,
++                       info->shadow[rqid].wait_reset);
++      spin_lock_irq(host->host_lock);
++
++      err = info->shadow[rqid].rslt_reset;
++
++      add_id_to_freelist(info, rqid);
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)
++      spin_unlock_irq(host->host_lock);
++#endif
++      return (err);
++}
++
++
++/*
++ * SCSI host template of the frontend.  Non-static because xenbus.c declares
++ * it extern and passes it to scsi_host_alloc().
++ */
++struct scsi_host_template scsifront_sht = {
++      .module                 = THIS_MODULE,
++      .name                   = "Xen SCSI frontend driver",
++      .queuecommand           = scsifront_queuecommand,
++      .eh_abort_handler       = scsifront_eh_abort_handler,
++      .eh_device_reset_handler= scsifront_dev_reset_handler,
++      .cmd_per_lun            = VSCSIIF_DEFAULT_CMD_PER_LUN,
++      .can_queue              = VSCSIIF_MAX_REQS,
++      .this_id                = -1,
++      /* Same limit map_data_for_request() enforces per request. */
++      .sg_tablesize           = VSCSIIF_SG_TABLESIZE,
++      .use_clustering         = DISABLE_CLUSTERING,
++      .proc_name              = "scsifront",
++};
++
++
++/*
++ * Module entry point.  Registers the frontend with xenbus when running on
++ * Xen and returns that result; returns -ENODEV otherwise.
++ */
++static int __init scsifront_init(void)
++{
++      if (is_running_on_xen())
++              return scsifront_xenbus_init();
++
++      return -ENODEV;
++}
++
++/* Module exit point: unregister the xenbus frontend driver. */
++static void __exit scsifront_exit(void)
++{
++      scsifront_xenbus_unregister();
++}
++
++/* Hook the entry and exit points into the module loader. */
++module_init(scsifront_init);
++module_exit(scsifront_exit);
++
++MODULE_DESCRIPTION("Xen SCSI frontend driver");
++MODULE_LICENSE("GPL");
diff --cc drivers/xen/scsifront/xenbus.c

index 0000000,0000000..2a0d88e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/scsifront/xenbus.c
@@@ -1,0 -1,0 +1,426 @@@
++/*
++ * Xen SCSI frontend driver
++ *
++ * Copyright (c) 2008, FUJITSU Limited
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++/*
++* Patched to support >2TB drives
++* 2010, Samuel Kvasnica, IMS Nanofabrication AG
++*/
++
++#include <linux/version.h>
++#include <linux/slab.h>
++#include "common.h"
++
++/*
++ * Size of the buffer holding the kthread name: TASK_COMM_LEN where the
++ * kernel is 2.6.11 or newer, a fixed 16 bytes on older kernels.
++ */
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
++  #define DEFAULT_TASK_COMM_LEN       16
++#else
++  #define DEFAULT_TASK_COMM_LEN       TASK_COMM_LEN
++#endif
++
++/* Host template defined in scsifront.c. */
++extern struct scsi_host_template scsifront_sht;
++
++/*
++ * Release everything attached to a frontend instance: remove the SCSI host
++ * unless it is already in SHOST_DEL state, end the grant on the shared ring,
++ * unbind the interrupt handler and drop a reference on the host.
++ *
++ * NOTE(review): info is embedded in the host's hostdata (see
++ * scsifront_probe()), so it presumably must not be touched after the final
++ * scsi_host_put() - confirm.
++ */
++static void scsifront_free(struct vscsifrnt_info *info)
++{
++      struct Scsi_Host *host = info->host;
++
++      /* shost_state is a plain value from 2.6.14 on, a bit mask before. */
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
++      if (host->shost_state != SHOST_DEL) {
++#else
++      if (!test_bit(SHOST_DEL, &host->shost_state)) {
++#endif
++              scsi_remove_host(info->host);
++      }
++
++      /* The ring page is passed along with the grant being ended. */
++      if (info->ring_ref != GRANT_INVALID_REF) {
++              gnttab_end_foreign_access(info->ring_ref,
++                                      (unsigned long)info->ring.sring);
++              info->ring_ref = GRANT_INVALID_REF;
++              info->ring.sring = NULL;
++      }
++
++      /* irq == 0 means no handler is bound. */
++      if (info->irq)
++              unbind_from_irqhandler(info->irq, info);
++      info->irq = 0;
++
++      scsi_host_put(info->host);
++}
++
++
++/*
++ * Allocate the shared ring page, grant it to the backend and bind the event
++ * channel interrupt handler.
++ *
++ * On success info->ring, info->ring_ref and info->irq are set and 0 is
++ * returned.  On failure a negative errno is returned; except for the page
++ * allocation failure, scsifront_free() has already been called on info.
++ */
++static int scsifront_alloc_ring(struct vscsifrnt_info *info)
++{
++      struct xenbus_device *dev = info->dev;
++      struct vscsiif_sring *sring;
++      int err = -ENOMEM;
++
++
++      info->ring_ref = GRANT_INVALID_REF;
++
++      /***** Frontend to Backend ring start *****/
++      sring = (struct vscsiif_sring *) __get_free_page(GFP_KERNEL);
++      if (!sring) {
++              xenbus_dev_fatal(dev, err, "fail to allocate shared ring (Front to Back)");
++              return err;
++      }
++      SHARED_RING_INIT(sring);
++      FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
++
++      err = xenbus_grant_ring(dev, virt_to_mfn(sring));
++      if (err < 0) {
++              /* Not granted yet, so the page can be freed directly. */
++              free_page((unsigned long) sring);
++              info->ring.sring = NULL;
++              xenbus_dev_fatal(dev, err, "fail to grant shared ring (Front to Back)");
++              goto free_sring;
++      }
++      info->ring_ref = err;
++
++      err = bind_listening_port_to_irqhandler(
++                      dev->otherend_id, scsifront_intr,
++                      IRQF_SAMPLE_RANDOM, "scsifront", info);
++
++      if (err <= 0) {
++              /*
++               * A result of 0 is treated as a failure here, so it must not
++               * reach the caller as "success" after the teardown below.
++               */
++              if (err == 0)
++                      err = -EINVAL;
++              xenbus_dev_fatal(dev, err, "bind_listening_port_to_irqhandler");
++              goto free_sring;
++      }
++      info->irq = err;
++
++      return 0;
++
++/* free resource */
++free_sring:
++      scsifront_free(info);
++
++      return err;
++}
++
++
++/*
++ * Set up the shared ring and publish "ring-ref" and "event-channel" under
++ * the device's xenstore node in a single transaction.  The transaction is
++ * restarted when ending it returns -EAGAIN.
++ *
++ * Returns 0 on success or a negative errno.  Failures after the ring was
++ * allocated release the instance through scsifront_free().
++ */
++static int scsifront_init_ring(struct vscsifrnt_info *info)
++{
++      struct xenbus_device *dev = info->dev;
++      struct xenbus_transaction xbt;
++      int err;
++
++      DPRINTK("%s\n",__FUNCTION__);
++
++      err = scsifront_alloc_ring(info);
++      if (err)
++              return err;
++      DPRINTK("%u %u\n", info->ring_ref, info->evtchn);
++
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "starting transaction");
++              /* No transaction was opened: nothing to write to or abort. */
++              goto free_sring;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u",
++                              info->ring_ref);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "%s", "writing ring-ref");
++              goto fail;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
++                              irq_to_evtchn_port(info->irq));
++
++      if (err) {
++              xenbus_dev_fatal(dev, err, "%s", "writing event-channel");
++              goto fail;
++      }
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err) {
++              if (err == -EAGAIN)
++                      goto again;
++              xenbus_dev_fatal(dev, err, "completing transaction");
++              goto free_sring;
++      }
++
++      return 0;
++
++fail:
++      /* Abort the open transaction before releasing resources. */
++      xenbus_transaction_end(xbt, 1);
++free_sring:
++      /* free resource */
++      scsifront_free(info);
++      
++      return err;
++}
++
++
++/*
++ * xenbus probe callback: allocate the Scsi_Host with the driver-private
++ * vscsifrnt_info embedded in it, set up the shared ring and event channel,
++ * start the response-processing thread and register the host with the SCSI
++ * midlayer.
++ *
++ * Returns 0 on success or a negative errno on failure.
++ */
++static int scsifront_probe(struct xenbus_device *dev,
++                              const struct xenbus_device_id *id)
++{
++      struct vscsifrnt_info *info;
++      struct Scsi_Host *host;
++      int i, err = -ENOMEM;
++      char name[DEFAULT_TASK_COMM_LEN];
++
++      host = scsi_host_alloc(&scsifront_sht, sizeof(*info));
++      if (!host) {
++              xenbus_dev_fatal(dev, err, "fail to allocate scsi host");
++              return err;
++      }
++      info = (struct vscsifrnt_info *) host->hostdata;
++      info->host = host;
++
++
++      dev_set_drvdata(&dev->dev, info);
++      info->dev  = dev;
++
++      /* Chain all shadow entries into the free list. */
++      for (i = 0; i < VSCSIIF_MAX_REQS; i++) {
++              info->shadow[i].next_free = i + 1;
++              init_waitqueue_head(&(info->shadow[i].wq_reset));
++              info->shadow[i].wait_reset = 0;
++      }
++      /* Last entry gets 0x0fff, presumably the end-of-list marker. */
++      info->shadow[VSCSIIF_MAX_REQS - 1].next_free = 0x0fff;
++
++      /*
++       * These must be usable before scsifront_init_ring() binds the
++       * interrupt handler, which may fire as soon as it is bound.
++       */
++      init_waitqueue_head(&info->wq);
++      spin_lock_init(&info->io_lock);
++      spin_lock_init(&info->shadow_lock);
++
++      err = scsifront_init_ring(info);
++      if (err) {
++              /*
++               * NOTE(review): most failure paths of scsifront_init_ring()
++               * already dropped a host reference via scsifront_free(); this
++               * put is only balanced for the ring page allocation failure.
++               */
++              scsi_host_put(host);
++              return err;
++      }
++
++      snprintf(name, DEFAULT_TASK_COMM_LEN, "vscsiif.%d", info->host->host_no);
++
++      info->kthread = kthread_run(scsifront_schedule, info, name);
++      if (IS_ERR(info->kthread)) {
++              err = PTR_ERR(info->kthread);
++              info->kthread = NULL;
++              pr_err("scsifront: kthread start err %d\n", err);
++              goto free_sring;
++      }
++
++      host->max_id      = VSCSIIF_MAX_TARGET;
++      host->max_channel = 0;
++      host->max_lun     = VSCSIIF_MAX_LUN;
++      host->max_sectors = (VSCSIIF_SG_TABLESIZE - 1) * PAGE_SIZE / 512;
++      host->max_cmd_len = VSCSIIF_MAX_COMMAND_SIZE;
++
++      err = scsi_add_host(host, &dev->dev);
++      if (err) {
++              pr_err("scsifront: fail to add scsi host %d\n", err);
++              goto stop_kthread;
++      }
++
++      xenbus_switch_state(dev, XenbusStateInitialised);
++
++      return 0;
++
++stop_kthread:
++      /* The thread uses info, so stop it before info is released. */
++      kthread_stop(info->kthread);
++      info->kthread = NULL;
++free_sring:
++      /* free resource */
++      scsifront_free(info);
++      return err;
++}
++
++/*
++ * xenbus remove callback: stop the response thread if one is running, then
++ * release the instance.  Always returns 0.
++ */
++static int scsifront_remove(struct xenbus_device *dev)
++{
++      struct vscsifrnt_info *info;
++
++      info = dev_get_drvdata(&dev->dev);
++
++      DPRINTK("%s: %s removed\n",__FUNCTION__ ,dev->nodename);
++
++      if (info->kthread != NULL) {
++              kthread_stop(info->kthread);
++              info->kthread = NULL;
++      }
++
++      scsifront_free(info);
++
++      return 0;
++}
++
++
++/*
++ * Handle the backend going away: remove the SCSI host and mark the
++ * frontend closed on xenbus.  Always returns 0.
++ */
++static int scsifront_disconnect(struct vscsifrnt_info *info)
++{
++      struct xenbus_device *dev = info->dev;
++
++      DPRINTK("%s: %s disconnect\n",__FUNCTION__ ,dev->nodename);
++
++      /*
++       * By the time this runs all frontend devices have already been
++       * deleted, so I/O does not need to be blocked before the host is
++       * removed.
++       */
++      scsi_remove_host(info->host);
++      xenbus_frontend_closed(dev);
++
++      return 0;
++}
++
++/* Values for the 'op' argument of scsifront_do_lun_hotplug(). */
++#define VSCSIFRONT_OP_ADD_LUN 1
++#define VSCSIFRONT_OP_DEL_LUN 2
++
++/*
++ * Walk the backend's "vscsi-devs" directory and add or remove the SCSI
++ * devices listed there.
++ *
++ * op == VSCSIFRONT_OP_ADD_LUN: each entry in state Initialised is added to
++ *   the host and its frontend state written as Connected.  Closed is
++ *   written instead when the device already exists or cannot be added.
++ * op == VSCSIFRONT_OP_DEL_LUN: each entry in state Closing that exists on
++ *   the host is removed and its frontend state written as Closed.
++ *
++ * Entries whose state or address cannot be read completely are skipped.
++ */
++static void scsifront_do_lun_hotplug(struct vscsifrnt_info *info, int op)
++{
++      struct xenbus_device *dev = info->dev;
++      unsigned int i;
++      int err = 0;
++      char str[64], state_str[64];
++      char **dir;
++      unsigned int dir_n = 0;
++      unsigned int device_state;
++      unsigned int hst, chn, tgt, lun;
++      struct scsi_device *sdev;
++
++      dir = xenbus_directory(XBT_NIL, dev->otherend, "vscsi-devs", &dir_n);
++      if (IS_ERR(dir))
++              return;
++
++      for (i = 0; i < dir_n; i++) {
++              /*
++               * read status; anything other than exactly one parsed field
++               * would leave device_state unset, so skip the entry
++               */
++              snprintf(str, sizeof(str), "vscsi-devs/%s/state", dir[i]);
++              err = xenbus_scanf(XBT_NIL, dev->otherend, str, "%u",
++                      &device_state);
++              if (err != 1)
++                      continue;
++
++              /* virtual SCSI device; all four address fields are needed */
++              snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", dir[i]);
++              err = xenbus_scanf(XBT_NIL, dev->otherend, str,
++                      "%u:%u:%u:%u", &hst, &chn, &tgt, &lun);
++              if (err != 4)
++                      continue;
++
++              /* front device state path */
++              snprintf(state_str, sizeof(state_str), "vscsi-devs/%s/state", dir[i]);
++
++              switch (op) {
++              case VSCSIFRONT_OP_ADD_LUN:
++                      if (device_state != XenbusStateInitialised)
++                              break;
++
++                      sdev = scsi_device_lookup(info->host, chn, tgt, lun);
++                      if (sdev) {
++                              pr_err("scsifront: Device already in use.\n");
++                              scsi_device_put(sdev);
++                              xenbus_printf(XBT_NIL, dev->nodename,
++                                      state_str, "%d", XenbusStateClosed);
++                      } else if (scsi_add_device(info->host, chn, tgt, lun)) {
++                              /* Do not report a device we failed to add. */
++                              pr_err("scsifront: scsi_add_device() failed.\n");
++                              xenbus_printf(XBT_NIL, dev->nodename,
++                                      state_str, "%d", XenbusStateClosed);
++                      } else {
++                              xenbus_printf(XBT_NIL, dev->nodename,
++                                      state_str, "%d", XenbusStateConnected);
++                      }
++                      break;
++              case VSCSIFRONT_OP_DEL_LUN:
++                      if (device_state != XenbusStateClosing)
++                              break;
++
++                      sdev = scsi_device_lookup(info->host, chn, tgt, lun);
++                      if (sdev) {
++                              scsi_remove_device(sdev);
++                              scsi_device_put(sdev);
++                              xenbus_printf(XBT_NIL, dev->nodename,
++                                      state_str, "%d", XenbusStateClosed);
++                      }
++                      break;
++              default:
++                      break;
++              }
++      }
++
++      kfree(dir);
++      return;
++}
++
++
++
++
++/*
++ * xenbus otherend_changed callback: react to a state change of the backend.
++ * States with no work to do are listed at the end of the switch.
++ */
++static void scsifront_backend_changed(struct xenbus_device *dev,
++                              enum xenbus_state backend_state)
++{
++      struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev);
++
++      DPRINTK("%p %u %u\n", dev, dev->state, backend_state);
++
++      switch (backend_state) {
++      case XenbusStateConnected:
++              /* Initial connect: pick up the devices offered so far. */
++              if (xenbus_read_driver_state(dev->nodename) ==
++                      XenbusStateInitialised)
++                      scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN);
++
++              if (dev->state != XenbusStateConnected)
++                      xenbus_switch_state(dev, XenbusStateConnected);
++              break;
++
++      case XenbusStateClosing:
++              scsifront_disconnect(info);
++              break;
++
++      case XenbusStateReconfiguring:
++              /* Remove departing LUNs, then mirror the backend's state. */
++              scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_DEL_LUN);
++              xenbus_switch_state(dev, XenbusStateReconfiguring);
++              break;
++
++      case XenbusStateReconfigured:
++              /* Add new LUNs, then return to Connected. */
++              scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN);
++              xenbus_switch_state(dev, XenbusStateConnected);
++              break;
++
++      case XenbusStateUnknown:
++      case XenbusStateInitialising:
++      case XenbusStateInitWait:
++      case XenbusStateInitialised:
++      case XenbusStateClosed:
++              break;
++      }
++}
++
++
++/* Device types handled by this driver; the empty string ends the table. */
++static const struct xenbus_device_id scsifront_ids[] = {
++      { "vscsi" },
++      { "" }
++};
++MODULE_ALIAS("xen:vscsi");
++
++/* xenbus driver description; the resume callback is commented out below. */
++static struct xenbus_driver scsifront_driver = {
++      .name                   = "vscsi",
++      .ids                    = scsifront_ids,
++      .probe                  = scsifront_probe,
++      .remove                 = scsifront_remove,
++/*    .resume                 = scsifront_resume, */
++      .otherend_changed       = scsifront_backend_changed,
++};
++
++/*
++ * Register the frontend driver with xenbus.  Returns whatever
++ * xenbus_register_frontend() returns.
++ */
++int scsifront_xenbus_init(void)
++{
++      return xenbus_register_frontend(&scsifront_driver);
++}
++
++/* Unregister the frontend driver from xenbus. */
++void scsifront_xenbus_unregister(void)
++{
++      xenbus_unregister_driver(&scsifront_driver);
++}
++
diff --cc drivers/xen/sfc_netback/Makefile

index 0000000,0000000..1286c3a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/Makefile
@@@ -1,0 -1,0 +1,12 @@@
++# Include paths for this driver, the shared netutil code, netback and sfc.
++EXTRA_CFLAGS += -Idrivers/xen/sfc_netback -Idrivers/xen/sfc_netutil -Idrivers/xen/netback -Idrivers/net/sfc -Idrivers/net/sfc/sfc_resource
++EXTRA_CFLAGS += -D__ci_driver__ 
++EXTRA_CFLAGS += -DEFX_USE_KCOMPAT
++# Warnings are fatal for this driver.
++EXTRA_CFLAGS += -Werror
++
++# Optional coverage build; EFX_GCOV enables the gcov hooks in accel.c.
++ifdef GCOV
++EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV
++endif
++
++obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND) := sfc_netback.o
++
++# Objects linked into sfc_netback.o.
++sfc_netback-objs   := accel.o accel_fwd.o accel_msg.o accel_solarflare.o accel_xenbus.o accel_debugfs.o
diff --cc drivers/xen/sfc_netback/accel.c

index 0000000,0000000..21367f2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel.c
@@@ -1,0 -1,0 +1,147 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include "accel.h"
++#include "accel_msg_iface.h"
++#include "accel_solarflare.h"
++
++#include <linux/notifier.h>
++
++#ifdef EFX_GCOV
++#include "gcov.h"
++#endif
++
++/*
++ * Netdevice notifier callback.  For NETDEV_UP, NETDEV_DOWN and
++ * NETDEV_CHANGE events, every backend on bend_list whose net_dev has the
++ * same ifindex as the reporting device gets its interface state updated.
++ * All other events are ignored.  Always returns NOTIFY_DONE.
++ */
++static int netback_accel_netdev_event(struct notifier_block *nb,
++                                    unsigned long event, void *ptr)
++{
++      struct net_device *net_dev = (struct net_device *)ptr;
++      struct netback_accel *bend;
++      int ok;
++
++      if (event != NETDEV_UP &&
++          event != NETDEV_DOWN &&
++          event != NETDEV_CHANGE)
++              return NOTIFY_DONE;
++
++      mutex_lock(&bend_list_mutex);
++      for (bend = bend_list; bend != NULL; bend = bend->next_bend) {
++              mutex_lock(&bend->bend_mutex);
++              /*
++               * shared_page is NULL once the shared pages have been
++               * unmapped but the bend is still on the list; skip those.
++               */
++              if (bend->shared_page != NULL &&
++                  bend->net_dev->ifindex == net_dev->ifindex) {
++                      if (event == NETDEV_CHANGE)
++                              ok = (netif_carrier_ok(net_dev) &&
++                                    (net_dev->flags & IFF_UP));
++                      else
++                              ok = (netif_carrier_ok(net_dev) &&
++                                    (event == NETDEV_UP));
++                      netback_accel_set_interface_state(bend, ok);
++              }
++              mutex_unlock(&bend->bend_mutex);
++      }
++      mutex_unlock(&bend_list_mutex);
++
++      return NOTIFY_DONE;
++}
++
++
++/* Notifier block registered in netback_accel_init(). */
++static struct notifier_block netback_accel_netdev_notifier = {
++      .notifier_call = netback_accel_netdev_event,
++};
++
++
++/* Exposed as module parameter "max_pages" with mode 0644. */
++unsigned sfc_netback_max_pages = NETBACK_ACCEL_DEFAULT_MAX_BUF_PAGES;
++module_param_named(max_pages, sfc_netback_max_pages, uint, 0644);
++MODULE_PARM_DESC(max_pages, 
++               "The number of buffer pages to enforce on each guest");
++
++/*
++ * Initialise the subsystems needed for the accelerated fast path: the
++ * forwarding layer, debugfs, the Solarflare layer and the netdevice
++ * notifier, in that order.
++ *
++ * Returns 0 on success.  On failure the steps already completed are undone
++ * and the failing step's return code is returned.
++ */
++static int __init netback_accel_init(void)
++{
++      int rc = 0;
++
++#ifdef EFX_GCOV
++      gcov_provider_init(THIS_MODULE);
++#endif
++
++      rc = netback_accel_init_fwd();
++      if (rc != 0)
++              goto fail0;
++
++      /* Return value, if any, is not checked. */
++      netback_accel_debugfs_init();
++
++      rc = netback_accel_sf_init();
++      if (rc != 0)
++              goto fail1;
++
++      rc = register_netdevice_notifier
++              (&netback_accel_netdev_notifier);
++      if (rc != 0)
++              goto fail2;
++
++      return 0;
++
++      /* Each label undoes one more step and falls through to the next. */
++ fail2:
++      netback_accel_sf_shutdown();
++ fail1:
++      netback_accel_debugfs_fini();
++      netback_accel_shutdown_fwd();
++ fail0:
++#ifdef EFX_GCOV
++      gcov_provider_fini(THIS_MODULE);
++#endif
++      return rc;
++}
++
++module_init(netback_accel_init);
++
++/*
++ * Module exit point: unregister the netdevice notifier first, then shut
++ * down the Solarflare layer, the backends, debugfs and the forwarding
++ * layer.
++ */
++static void __exit netback_accel_exit(void)
++{
++      unregister_netdevice_notifier(&netback_accel_netdev_notifier);
++
++      netback_accel_sf_shutdown();
++
++      netback_accel_shutdown_bends();
++
++      netback_accel_debugfs_fini();
++
++      netback_accel_shutdown_fwd();
++
++#ifdef EFX_GCOV
++      gcov_provider_fini(THIS_MODULE);
++#endif
++}
++
++module_exit(netback_accel_exit);
++
++MODULE_LICENSE("GPL");
diff --cc drivers/xen/sfc_netback/accel.h

index 0000000,0000000..f371a3e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel.h
@@@ -1,0 -1,0 +1,392 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#ifndef NETBACK_ACCEL_H
++#define NETBACK_ACCEL_H
++
++#include <linux/version.h>
++#include <linux/slab.h>
++#include <linux/ip.h>
++#include <linux/tcp.h>
++#include <linux/udp.h>
++#include <linux/in.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++#include <linux/mutex.h>
++#include <linux/wait.h>
++
++#include <xen/xenbus.h>
++
++#include "accel_shared_fifo.h"
++#include "accel_msg_iface.h"
++#include "accel_util.h"
++
++/**************************************************************************
++ * Datatypes
++ **************************************************************************/
++
++#define NETBACK_ACCEL_DEFAULT_MAX_FILTERS (8)
++#define NETBACK_ACCEL_DEFAULT_MAX_MCASTS (8)
++#define NETBACK_ACCEL_DEFAULT_MAX_BUF_PAGES (384)
++/* Variable to store module parameter for max_buf_pages */
++extern unsigned sfc_netback_max_pages;
++
++#define NETBACK_ACCEL_STATS 1
++
++#if NETBACK_ACCEL_STATS
++#define NETBACK_ACCEL_STATS_OP(x) x
++#else
++#define NETBACK_ACCEL_STATS_OP(x)
++#endif
++
++/*! Statistics for a given backend */
++struct netback_accel_stats {
++      /*! Number of eventq wakeup events */
++      u64 evq_wakeups;
++      /*! Number of eventq timeout events */
++      u64 evq_timeouts;
++      /*! Number of filters used */
++      u32 num_filters;
++      /*! Number of buffer pages registered */
++      u32 num_buffer_pages;
++};
++
++
++/* Debug fs nodes for each of the above stats */
++struct netback_accel_dbfs {
++      struct dentry *evq_wakeups;
++      struct dentry *evq_timeouts;
++      struct dentry *num_filters;
++      struct dentry *num_buffer_pages;
++};
++
++
++/*! Resource limits for a given NIC */
++struct netback_accel_limits {
++      int max_filters;            /*!< Max. number of filters to use. */
++      int max_mcasts;      /*!< Max. number  of mcast subscriptions */
++      int max_buf_pages;        /*!< Max. number of pages of NIC buffers */
++};
++
++
++/*! The state for an instance of the back end driver. */
++struct netback_accel {
++      /*! mutex to protect this state */
++      struct mutex bend_mutex;
++
++      /*! Watches on xenstore */
++      struct xenbus_watch domu_accel_watch;
++      struct xenbus_watch config_accel_watch;
++
++      /*! Pointer to whatever device cookie ties us in to the hypervisor */
++      void *hdev_data;
++
++      /*! FIFO indices. Next page is msg FIFOs */
++      struct net_accel_shared_page *shared_page;
++
++      /*! Defer control message processing */
++      struct work_struct handle_msg;
++
++      /*! Identifies other end VM and interface.*/
++      int far_end;
++      int vif_num;
++
++      /*! To unmap the shared pages */
++      void *sh_pages_unmap;
++
++      /* Resource tracking */
++      /*! Limits on H/W & Dom0 resources */
++      struct netback_accel_limits quotas;
++
++      /* Hardware resources */
++      /*! The H/W type of associated NIC */
++      enum net_accel_hw_type hw_type;
++      /*! State of allocation */             
++      int hw_state;
++      /*! How to set up the acceleration for this hardware */
++      int (*accel_setup)(struct netback_accel *); 
++      /*! And how to stop it. */
++      void (*accel_shutdown)(struct netback_accel *);
++
++      /*! The physical/real net_dev for this interface */
++      struct net_device *net_dev;
++
++      /*! Magic pointer to locate state in forwarding table */
++      void *fwd_priv;
++
++      /*! Message FIFO */
++      sh_msg_fifo2 to_domU;
++      /*! Message FIFO */
++      sh_msg_fifo2 from_domU;
++
++      /*! General notification channel id */
++      int msg_channel;
++      /*! General notification channel irq */
++      int msg_channel_irq;
++
++      /*! Event channel id dedicated to network packet interrupts. */
++      int net_channel; 
++      /*! Event channel irq dedicated to network packets interrupts */
++      int net_channel_irq; 
++
++      /*! The MAC address the frontend goes by. */
++      u8 mac[ETH_ALEN];
++      /*! Driver name of associated NIC */
++      char *nicname;    
++
++      /*! Array of pointers to buffer pages mapped */
++      grant_handle_t *buffer_maps; 
++      u64 *buffer_addrs;
++      /*! Index into buffer_maps */
++      int buffer_maps_index; 
++      /*! Max number of pages that domU is allowed/will request to map */
++      int max_pages; 
++
++      /*! Pointer to hardware specific private area */
++      void *accel_hw_priv; 
++
++      /*! Wait queue for changes in accelstate. */
++      wait_queue_head_t state_wait_queue;
++
++      /*! Current state of the frontend according to the xenbus
++       *  watch. */
++      XenbusState frontend_state;
++
++      /*! Current state of this backend. */
++      XenbusState backend_state;
++
++      /*! Non-zero if the backend is being removed. */
++      int removing;
++
++      /*! Non-zero if the setup_vnic has been called. */
++      int vnic_is_setup;
++
++#if NETBACK_ACCEL_STATS
++      struct netback_accel_stats stats;
++#endif        
++#if defined(CONFIG_DEBUG_FS)
++      char *dbfs_dir_name;
++      struct dentry *dbfs_dir;
++      struct netback_accel_dbfs dbfs;
++#endif
++
++      /*! List */
++      struct netback_accel *next_bend;
++};
++
++
++/*
++ * Values for netback_accel.hw_state.  States of resource allocation
++ * we can go through
++ */
++/*! No hardware has yet been allocated. */
++#define NETBACK_ACCEL_RES_NONE  (0)
++/*! Hardware has been allocated. */
++#define NETBACK_ACCEL_RES_ALLOC (1)
++#define NETBACK_ACCEL_RES_FILTER (2)
++#define NETBACK_ACCEL_RES_HWINFO (3)
++
++/*! Filtering specification. This assumes that for VNIC support we
++ *  will always want wildcard entries, so only specifies the
++ *  destination IP/port
++ */
++struct netback_accel_filter_spec {
++      /*! Internal, used to access efx_vi API */
++      void *filter_handle; 
++
++      /*! Destination IP in network order */
++      u32 destip_be;
++      /*! Destination port in network order */
++      u16 destport_be;
++      /*! Mac address */
++      u8  mac[ETH_ALEN];
++      /*! TCP or UDP */
++      u8  proto;      
++};
++
++
++/**************************************************************************
++ * From accel.c
++ **************************************************************************/
++
++/*! \brief Start up all the acceleration plugins 
++ *
++ * \return 0 on success, an errno on failure
++ */
++extern int netback_accel_init_accel(void);
++
++/*! \brief Shut down all the acceleration plugins 
++ */
++extern void netback_accel_shutdown_accel(void);
++
++
++/**************************************************************************
++ * From accel_fwd.c
++ **************************************************************************/
++
++/*! \brief Init the forwarding infrastructure
++ * \return 0 on success, or -ENOMEM if it couldn't get memory for the
++ * forward table 
++ */
++extern int netback_accel_init_fwd(void);
++
++/*! \brief Shut down the forwarding and free memory. */
++extern void netback_accel_shutdown_fwd(void);
++
++/*! Initialise each nic port's forwarding table */
++extern void *netback_accel_init_fwd_port(void);
++extern void netback_accel_shutdown_fwd_port(void *fwd_priv);
++
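++/*! \brief Add an entry to the forwarding table. 
++ * \param mac : MAC address, used as hash key
++ * \param context : value to associate with key (can be NULL, see
++ * netback_accel_fwd_set_context)
++ * \param fwd_priv : per-port forwarding state returned by
++ * netback_accel_init_fwd_port
++ * \return 0 on success, -ENOMEM if table was full and could not grow it,
++ * -EEXIST if the MAC address is already in the table
++ */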
++extern int netback_accel_fwd_add(const __u8 *mac, void *context,
++                               void *fwd_priv);
++
++/*! \brief Remove an entry from the forwarding table. 
++ * \param mac : the MAC address to remove
++ * \return nothing: it is not an error if the mac was not in the table
++ */
++extern void netback_accel_fwd_remove(const __u8 *mac, void *fwd_priv);
++
++/*! \brief Set the context pointer for an existing fwd table entry.
++ * \param mac : key that is already present in the table
++ * \param context : new value to associate with key
++ * \return 0 on success, -ENOENT if mac not present in table.
++ */
++extern int netback_accel_fwd_set_context(const __u8 *mac, void *context,
++                                       void *fwd_priv);
++
++/**************************************************************************
++ * From accel_msg.c
++ **************************************************************************/
++
++
++/*! \brief Send the start-of-day message that handshakes with the VNIC
++ *  and tells it its MAC address.
++ *
++ * \param bend The back end driver data structure
++ * \param version The version of communication to use, e.g. NET_ACCEL_MSG_VERSION
++ */
++extern void netback_accel_msg_tx_hello(struct netback_accel *bend,
++                                     unsigned version);
++
++/*! \brief Send a "there's a new local mac address" message 
++ *
++ * \param bend The back end driver data structure for the vnic to send
++ * the message to 
++ * \param mac Pointer to the new mac address
++ */
++extern void netback_accel_msg_tx_new_localmac(struct netback_accel *bend,
++                                            const void *mac);
++
++/*! \brief Send a "a mac address that was local has gone away" message 
++ *
++ * \param bend The back end driver data structure for the vnic to send
++ * the message to 
++ * \param mac Pointer to the old mac address
++ */
++extern void netback_accel_msg_tx_old_localmac(struct netback_accel *bend,
++                                            const void *mac);
++
++extern void netback_accel_set_interface_state(struct netback_accel *bend,
++                                            int up);
++
++/*! \brief Process the message queue for a bend that has just
++ * interrupted.
++ * 
++ * Demultiplexes an interrupt from the front end driver, taking
++ * messages from the fifo and taking appropriate action.
++ * 
++ * \param bend The back end driver data structure
++ */
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++extern void netback_accel_msg_rx_handler(struct work_struct *arg);
++#else
++extern void netback_accel_msg_rx_handler(void *bend_void);
++#endif
++
++/**************************************************************************
++ * From accel_xenbus.c
++ **************************************************************************/
++/*! List of all the bends currently in existence. */
++extern struct netback_accel *bend_list;
++extern struct mutex bend_list_mutex;
++
++/*! \brief Probe a new network interface. */
++extern int netback_accel_probe(struct xenbus_device *dev);
++
++/*! \brief Remove a network interface. */
++extern int netback_accel_remove(struct xenbus_device *dev);
++
++/*! \brief Shutdown all accelerator backends */
++extern void netback_accel_shutdown_bends(void);
++
++/*! \brief Initiate the xenbus state teardown handshake */
++extern void netback_accel_set_closing(struct netback_accel *bend);
++
++/**************************************************************************
++ * From accel_debugfs.c
++ **************************************************************************/
++/*! Global statistics */
++struct netback_accel_global_stats {
++      /*! Number of TX packets seen through driverlink */
++      u64 dl_tx_packets;
++      /*! Number of TX packets seen through driverlink we didn't like */
++      u64 dl_tx_bad_packets;
++      /*! Number of RX packets seen through driverlink */
++      u64 dl_rx_packets;
++      /*! Number of mac addresses we are forwarding to */
++      u32 num_fwds;
++};
++
++/*! Debug fs entries for each of the above stats */
++struct netback_accel_global_dbfs {
++      struct dentry *dl_tx_packets;
++      struct dentry *dl_tx_bad_packets;
++      struct dentry *dl_rx_packets;
++      struct dentry *num_fwds;
++};
++
++#if NETBACK_ACCEL_STATS
++extern struct netback_accel_global_stats global_stats;
++#endif
++
++/*! \brief Initialise the debugfs root and populate with global stats */
++extern void netback_accel_debugfs_init(void);
++
++/*! \brief Remove our debugfs root directory */
++extern void netback_accel_debugfs_fini(void);
++
++/*! \brief Add per-bend statistics to debug fs */
++extern int netback_accel_debugfs_create(struct netback_accel *bend);
++/*! \brief Remove per-bend statistics from debug fs */
++extern int netback_accel_debugfs_remove(struct netback_accel *bend);
++
++#endif /* NETBACK_ACCEL_H */
++
++
diff --cc drivers/xen/sfc_netback/accel_debugfs.c

index 0000000,0000000..6527c4b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_debugfs.c
@@@ -1,0 -1,0 +1,148 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <linux/fs.h>
++#include <linux/debugfs.h>
++
++#include "accel.h"
++
++#if defined(CONFIG_DEBUG_FS)
++static struct dentry *sfc_debugfs_root = NULL;
++#endif
++
++#if NETBACK_ACCEL_STATS
++struct netback_accel_global_stats global_stats;
++#if defined(CONFIG_DEBUG_FS)
++static struct netback_accel_global_dbfs  global_dbfs;
++#endif
++#endif
++
++/*
++ * Create the "sfc_netback" debugfs directory and expose each global
++ * statistics counter in it as a read-only file.  Compiles to nothing
++ * when CONFIG_DEBUG_FS is not set; if the directory cannot be created,
++ * no counter files are created either.
++ */
++void netback_accel_debugfs_init(void)
++{
++#if defined(CONFIG_DEBUG_FS)
++      sfc_debugfs_root = debugfs_create_dir("sfc_netback", NULL);
++      if (sfc_debugfs_root != NULL) {
++              global_dbfs.num_fwds =
++                      debugfs_create_u32("num_fwds",
++                                         S_IRUSR | S_IRGRP | S_IROTH,
++                                         sfc_debugfs_root,
++                                         &global_stats.num_fwds);
++              global_dbfs.dl_tx_packets =
++                      debugfs_create_u64("dl_tx_packets",
++                                         S_IRUSR | S_IRGRP | S_IROTH,
++                                         sfc_debugfs_root,
++                                         &global_stats.dl_tx_packets);
++              global_dbfs.dl_rx_packets =
++                      debugfs_create_u64("dl_rx_packets",
++                                         S_IRUSR | S_IRGRP | S_IROTH,
++                                         sfc_debugfs_root,
++                                         &global_stats.dl_rx_packets);
++              global_dbfs.dl_tx_bad_packets =
++                      debugfs_create_u64("dl_tx_bad_packets",
++                                         S_IRUSR | S_IRGRP | S_IROTH,
++                                         sfc_debugfs_root,
++                                         &global_stats.dl_tx_bad_packets);
++      }
++#endif
++}
++
++
++/*
++ * Remove the global statistics files and then the debugfs root directory
++ * created by netback_accel_debugfs_init().  Compiles to nothing when
++ * CONFIG_DEBUG_FS is not set.
++ */
++void netback_accel_debugfs_fini(void)
++{
++#if defined(CONFIG_DEBUG_FS)
++      /* Files first, so the directory is empty when it is removed */
++      debugfs_remove(global_dbfs.num_fwds);
++      debugfs_remove(global_dbfs.dl_tx_packets);
++      debugfs_remove(global_dbfs.dl_rx_packets);
++      debugfs_remove(global_dbfs.dl_tx_bad_packets);
++
++      debugfs_remove(sfc_debugfs_root);
++#endif
++}
++
++
++/*
++ * Create the per-backend debugfs directory "vif<far_end>.<vif_num>" under
++ * the driver's debugfs root and populate it with this backend's counters.
++ *
++ * Returns 0 on success (and always when CONFIG_DEBUG_FS is not set),
++ * -ENOENT if the debugfs root was never created, or -ENOMEM if the name
++ * or the directory could not be allocated.  On failure bend->dbfs_dir_name
++ * is left NULL.
++ */
++int netback_accel_debugfs_create(struct netback_accel *bend)
++{
++#if defined(CONFIG_DEBUG_FS)
++      if (sfc_debugfs_root == NULL)
++              return -ENOENT;
++
++      /*
++       * kasprintf() sizes the buffer from the formatted output, so the
++       * name always fits whatever far_end and vif_num hold (a negative
++       * value would have overflowed a hand-counted digit length).
++       */
++      bend->dbfs_dir_name = kasprintf(GFP_KERNEL, "vif%d.%d",
++                                      bend->far_end, bend->vif_num);
++      if (bend->dbfs_dir_name == NULL)
++              return -ENOMEM;
++
++      bend->dbfs_dir = debugfs_create_dir(bend->dbfs_dir_name,
++                                          sfc_debugfs_root);
++      if (bend->dbfs_dir == NULL) {
++              kfree(bend->dbfs_dir_name);
++              /*
++               * Clear the pointer: netback_accel_debugfs_remove() frees
++               * dbfs_dir_name whenever it is non-NULL, which would
++               * otherwise be a double free.
++               */
++              bend->dbfs_dir_name = NULL;
++              return -ENOMEM;
++      }
++
++#if NETBACK_ACCEL_STATS
++      bend->dbfs.evq_wakeups = debugfs_create_u64
++              ("evq_wakeups", S_IRUSR | S_IRGRP | S_IROTH,
++               bend->dbfs_dir, &bend->stats.evq_wakeups);
++      bend->dbfs.evq_timeouts = debugfs_create_u64
++              ("evq_timeouts", S_IRUSR | S_IRGRP | S_IROTH,
++               bend->dbfs_dir, &bend->stats.evq_timeouts);
++      bend->dbfs.num_filters = debugfs_create_u32
++              ("num_filters", S_IRUSR | S_IRGRP | S_IROTH,
++               bend->dbfs_dir, &bend->stats.num_filters);
++      bend->dbfs.num_buffer_pages = debugfs_create_u32
++              ("num_buffer_pages", S_IRUSR | S_IRGRP | S_IROTH,
++               bend->dbfs_dir, &bend->stats.num_buffer_pages);
++#endif
++#endif
++      return 0;
++}
++
++
++/*
++ * Remove a backend's debugfs files and directory and free the directory
++ * name.  The pointers are cleared afterwards, so calling this again (or
++ * on a backend whose debugfs setup failed part-way) is harmless.
++ *
++ * Always returns 0.
++ */
++int netback_accel_debugfs_remove(struct netback_accel *bend)
++{
++#if defined(CONFIG_DEBUG_FS)
++      if (bend->dbfs_dir != NULL) {
++#if NETBACK_ACCEL_STATS
++              /* Files first, so the directory is empty when removed */
++              debugfs_remove(bend->dbfs.evq_wakeups);
++              debugfs_remove(bend->dbfs.evq_timeouts);
++              debugfs_remove(bend->dbfs.num_filters);
++              debugfs_remove(bend->dbfs.num_buffer_pages);
++#endif
++              debugfs_remove(bend->dbfs_dir);
++              bend->dbfs_dir = NULL;
++      }
++
++      /* kfree(NULL) is a no-op, so no guard is needed */
++      kfree(bend->dbfs_dir_name);
++      bend->dbfs_dir_name = NULL;
++#endif
++      return 0;
++}
++
++
diff --cc drivers/xen/sfc_netback/accel_fwd.c

index 0000000,0000000..385855a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_fwd.c
@@@ -1,0 -1,0 +1,420 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include "accel.h"
++#include "accel_cuckoo_hash.h"
++#include "accel_util.h"
++#include "accel_solarflare.h"
++
++#include "driverlink_api.h"
++
++#include <linux/if_arp.h>
++#include <linux/skbuff.h>
++#include <linux/list.h>
++
++/* State stored in the forward table */
++struct fwd_struct {
++      struct list_head link; /* Forms list */
++      void * context;
++      __u8 valid;
++      __u8 mac[ETH_ALEN];
++};
++
++/* Max value we support */
++#define NUM_FWDS_BITS 8
++#define NUM_FWDS (1 << NUM_FWDS_BITS)
++#define FWD_MASK (NUM_FWDS - 1)
++
++struct port_fwd {
++      /* Make a list */
++      struct list_head link;
++      /* Hash table to store the fwd_structs */
++      cuckoo_hash_table fwd_hash_table;
++      /* The array of fwd_structs */
++      struct fwd_struct *fwd_array;
++      /* Linked list of entries in use. */
++      struct list_head fwd_list;
++      /* Could do something clever with a reader/writer lock. */
++      spinlock_t fwd_lock;
++      /* Make find_free_entry() a bit faster by caching this */
++      int last_free_index;
++};
++
++/*
++ * This is unlocked as it's only called from dl probe and remove,
++ * which are themselves synchronised.  Could get rid of it entirely as
++ * it's never iterated, but useful for debug
++ */
++static struct list_head port_fwds;
++
++
++/*
++ * Find an unused slot in fwd_array.  The scan starts at the cached
++ * last_free_index and wraps round, visiting every slot once.  Returns
++ * the slot index (also stored back in last_free_index) or -ENOMEM if
++ * every slot is in use.
++ */
++static int fwd_find_free_entry(struct port_fwd *fwd_set)
++{
++      const int start = fwd_set->last_free_index;
++      int step;
++
++      for (step = 0; step < NUM_FWDS; step++) {
++              /* NUM_FWDS is a power of two, so masking wraps the index */
++              int slot = (start + step) & FWD_MASK;
++
++              if (fwd_set->fwd_array[slot].valid)
++                      continue;
++
++              fwd_set->last_free_index = slot;
++              return slot;
++      }
++
++      return -ENOMEM;
++}
++
++
++/*
++ * Map a MAC address to its fwd_struct via the hash table, or return NULL
++ * if the MAC is not present.  Caller should hold the table lock.
++ */
++static inline struct fwd_struct *fwd_find_entry(const __u8 *mac,
++                                              struct port_fwd *fwd_set)
++{
++      cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
++      cuckoo_hash_value slot;
++      struct fwd_struct *fwd;
++
++      if (!cuckoo_hash_lookup(&fwd_set->fwd_hash_table,
++                              (cuckoo_hash_key *)(&key), &slot))
++              return NULL;
++
++      /* The hash value is the entry's index in fwd_array */
++      fwd = &fwd_set->fwd_array[slot];
++      DPRINTK_ON(memcmp(fwd->mac, mac, ETH_ALEN) != 0);
++      return fwd;
++}
++
++
++/*
++ * Initialise each nic port's forwarding table.
++ *
++ * Allocates the port_fwd state, its array of NUM_FWDS fwd_structs and the
++ * hash table that indexes them, then links the new set onto port_fwds.
++ *
++ * Returns an opaque handle to pass as fwd_priv to the other forwarding
++ * calls, or NULL on failure (anything already allocated is freed).
++ */
++void *netback_accel_init_fwd_port(void)
++{
++      struct port_fwd *fwd_set;
++
++      fwd_set = kzalloc(sizeof(*fwd_set), GFP_KERNEL);
++      if (fwd_set == NULL)
++              return NULL;
++
++      spin_lock_init(&fwd_set->fwd_lock);
++
++      /* kcalloc() zeroes the array and checks the size multiplication */
++      fwd_set->fwd_array = kcalloc(NUM_FWDS, sizeof(*fwd_set->fwd_array),
++                                   GFP_KERNEL);
++      if (fwd_set->fwd_array == NULL)
++              goto fail_set;
++
++      if (cuckoo_hash_init(&fwd_set->fwd_hash_table, NUM_FWDS_BITS, 8) != 0)
++              goto fail_array;
++
++      INIT_LIST_HEAD(&fwd_set->fwd_list);
++
++      list_add(&fwd_set->link, &port_fwds);
++
++      return fwd_set;
++
++ fail_array:
++      kfree(fwd_set->fwd_array);
++ fail_set:
++      kfree(fwd_set);
++      return NULL;
++}
++
++
++/*
++ * Destroy a per-port forwarding table created by
++ * netback_accel_init_fwd_port(): unlink it from port_fwds, destroy its
++ * hash table and free its memory.
++ *
++ * BUG()s if fwd_priv is NULL, if port_fwds is empty, or if the table
++ * still has entries in use.
++ */
++void netback_accel_shutdown_fwd_port(void *fwd_priv)
++{
++      struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
++
++      BUG_ON(fwd_priv == NULL);
++      
++      BUG_ON(list_empty(&port_fwds));
++      list_del(&fwd_set->link);
++
++      /* Every MAC must have been removed before the port goes away */
++      BUG_ON(!list_empty(&fwd_set->fwd_list));
++
++      cuckoo_hash_destroy(&fwd_set->fwd_hash_table);
++      kfree(fwd_set->fwd_array);
++      kfree(fwd_set);
++}
++
++
++/*
++ * Initialise the (empty) list of per-port forwarding tables.
++ * Always returns 0.
++ */
++int netback_accel_init_fwd(void)
++{
++      INIT_LIST_HEAD(&port_fwds);
++      return 0;
++}
++
++
++/*
++ * Shut down the forwarding infrastructure.  Nothing is freed here; it
++ * only asserts that every per-port table has already been shut down.
++ */
++void netback_accel_shutdown_fwd(void)
++{
++      BUG_ON(!list_empty(&port_fwds));
++}
++
++
++/*
++ * Add a MAC address, and the context to associate with it, to a port's
++ * forwarding table.
++ *
++ * Returns 0 on success, -ENOMEM if no free slot could be found, -EEXIST
++ * if the MAC is already in the table, or the error from cuckoo_hash_add().
++ */
++int netback_accel_fwd_add(const __u8 *mac, void *context, void *fwd_priv)
++{
++      struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
++      cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
++      struct fwd_struct *fwd;
++      unsigned long flags;
++      int rc = 0, index;
++
++      BUG_ON(fwd_priv == NULL);
++
++      DPRINTK("Adding mac %pM\n", mac);
++
++      spin_lock_irqsave(&fwd_set->fwd_lock, flags);
++
++      index = fwd_find_free_entry(fwd_set);
++      if (index < 0) {
++              spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
++              return index;
++      }
++
++      /* Shouldn't already be in the table (rc is only scratch here) */
++      if (cuckoo_hash_lookup(&fwd_set->fwd_hash_table,
++                             (cuckoo_hash_key *)(&key), &rc) != 0) {
++              spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
++              EPRINTK("MAC address %pM already accelerated.\n", mac);
++              return -EEXIST;
++      }
++
++      rc = cuckoo_hash_add(&fwd_set->fwd_hash_table,
++                           (cuckoo_hash_key *)(&key), index, 1);
++      if (rc == 0) {
++              /* Hash insert succeeded: claim the slot and fill it in */
++              fwd = &fwd_set->fwd_array[index];
++              fwd->valid = 1;
++              fwd->context = context;
++              memcpy(fwd->mac, mac, ETH_ALEN);
++              list_add(&fwd->link, &fwd_set->fwd_list);
++              NETBACK_ACCEL_STATS_OP(global_stats.num_fwds++);
++      }
++
++      spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
++
++      /*
++       * No need to tell frontend that this mac address is local -
++       * it should auto-discover through packets on fastpath what is
++       * local and what is not, and just being on same server
++       * doesn't make it local (it could be on a different
++       * bridge)
++       */
++
++      return rc;
++}
++
++
++/*
++ * Drop a MAC address from a port's forwarding table.  A MAC that is not
++ * in the table is silently ignored.
++ */
++void netback_accel_fwd_remove(const __u8 *mac, void *fwd_priv)
++{
++      struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
++      cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
++      struct fwd_struct *fwd;
++      unsigned long flags;
++
++      DPRINTK("Removing mac %pM\n", mac);
++
++      BUG_ON(fwd_priv == NULL);
++
++      spin_lock_irqsave(&fwd_set->fwd_lock, flags);
++
++      fwd = fwd_find_entry(mac, fwd_set);
++      if (fwd == NULL)
++              goto out_unlock;
++
++      BUG_ON(list_empty(&fwd_set->fwd_list));
++      list_del(&fwd->link);
++
++      /* Release the array slot, then the hash entry that points at it */
++      fwd->valid = 0;
++      cuckoo_hash_remove(&fwd_set->fwd_hash_table,
++                         (cuckoo_hash_key *)(&key));
++      NETBACK_ACCEL_STATS_OP(global_stats.num_fwds--);
++
++ out_unlock:
++      spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
++
++      /*
++       * No need to tell frontend that this is no longer present -
++       * the frontend is currently only interested in remote
++       * addresses and it works these out (mostly) by itself
++       */
++}
++
++
++/*
++ * Replace the context pointer stored against a MAC that is already in
++ * the forwarding table.  Returns 0 on success or -ENOENT if the MAC is
++ * not in the table.
++ */
++int netback_accel_fwd_set_context(const __u8 *mac, void *context, 
++                                void *fwd_priv)
++{
++      struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
++      struct fwd_struct *fwd;
++      unsigned long flags;
++      int rc = 0;
++
++      BUG_ON(fwd_priv == NULL);
++
++      spin_lock_irqsave(&fwd_set->fwd_lock, flags);
++
++      fwd = fwd_find_entry(mac, fwd_set);
++      if (fwd == NULL)
++              rc = -ENOENT;
++      else
++              fwd->context = context;
++
++      spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
++
++      return rc;
++}
++
++
++/**************************************************************************
++ * Process a received packet
++ **************************************************************************/
++
++/*
++ * Look up the address at the start of the packet's MAC header in the
++ * forwarding table.  Returns the context stored for it, or NULL if
++ * there is no matching entry.  Must be called with the appropriate
++ * fwd_lock already held.
++ */
++static struct netback_accel *for_a_vnic(struct netback_pkt_buf *skb, 
++                                      struct port_fwd *fwd_set)
++{
++      struct fwd_struct *fwd = fwd_find_entry(skb->mac.raw, fwd_set);
++
++      if (fwd == NULL)
++              return NULL;
++
++      return fwd->context;
++}
++
++
++/*
++ * Return non-zero if the skb carries an ARP packet whose opcode is
++ * ARPOP_REPLY.
++ *
++ * skb->protocol and ar_op are in network byte order, so the host-order
++ * constants are converted with htons(), matching
++ * netback_accel_can_filter().
++ */
++static inline int packet_is_arp_reply(struct sk_buff *skb)
++{
++      return skb->protocol == htons(ETH_P_ARP)
++              && arp_hdr(skb)->ar_op == htons(ARPOP_REPLY);
++}
++
++
++/*
++ * Fill in a filter spec from a packet's headers: protocol and destination
++ * address from the IP header, MAC from the Ethernet source address, and
++ * destination port from the TCP or UDP header that follows the IP header.
++ * Anything that is not TCP is read as UDP (and flagged via EPRINTK_ON if
++ * it is not actually UDP).
++ */
++static inline void hdr_to_filt(struct ethhdr *ethhdr, struct iphdr *ip,
++                             struct netback_accel_filter_spec *spec)
++{
++      /* ihl counts 32-bit words, so the L4 header is 4 * ihl bytes in */
++      char *l4_hdr = (char *)ip + 4 * ip->ihl;
++
++      spec->proto = ip->protocol;
++      spec->destip_be = ip->daddr;
++      memcpy(spec->mac, ethhdr->h_source, ETH_ALEN);
++
++      if (ip->protocol == IPPROTO_TCP) {
++              spec->destport_be = ((struct tcphdr *)l4_hdr)->dest;
++              return;
++      }
++
++      EPRINTK_ON(ip->protocol != IPPROTO_UDP);
++      spec->destport_be = ((struct udphdr *)l4_hdr)->dest;
++}
++
++
++/*
++ * Return 1 if the packet is IPv4 carrying TCP or UDP, 0 otherwise.  The
++ * IP header is only examined once the packet is known to be IP.
++ */
++static inline int netback_accel_can_filter(struct netback_pkt_buf *skb) 
++{
++      if (skb->protocol != htons(ETH_P_IP))
++              return 0;
++
++      switch (skb->nh.iph->protocol) {
++      case IPPROTO_TCP:
++      case IPPROTO_UDP:
++              return 1;
++      default:
++              return 0;
++      }
++}
++
++
++/*
++ * Build a filter spec from the packet's Ethernet and IP headers and pass
++ * it to netback_accel_filter_check_add() for the given backend.
++ */
++static inline void netback_accel_filter_packet(struct netback_accel *bend,
++                                             struct netback_pkt_buf *skb)
++{
++      struct netback_accel_filter_spec fs;
++
++      hdr_to_filt((struct ethhdr *)(skb->mac.raw), skb->nh.iph, &fs);
++
++      netback_accel_filter_check_add(bend, &fs);
++}
++
++
++/*
++ * Inspect a received packet.  If it is unicast, addressed to a MAC in
++ * this port's forwarding table that has a context set, and is IPv4
++ * TCP/UDP, a filter spec built from its headers is passed to
++ * netback_accel_filter_check_add() for that backend.
++ *
++ * The function returns nothing, so it cannot claim ownership of the
++ * packet: broadcast, multicast and every other frame simply continue
++ * along the slow path.  This is verging on solarflare specific.
++ */
++void netback_accel_rx_packet(struct netback_pkt_buf *skb, void *fwd_priv)
++{
++      struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
++      struct netback_accel *bend;
++      unsigned long flags;
++
++      BUG_ON(fwd_priv == NULL);
++
++      /* Broadcast and multicast pass through the slow path untouched */
++      if (is_broadcast_ether_addr(skb->mac.raw) ||
++          is_multicast_ether_addr(skb->mac.raw))
++              return;
++
++      /* It is unicast: insert a filter to pass it off to a VNIC */
++      spin_lock_irqsave(&fwd_set->fwd_lock, flags);
++      bend = for_a_vnic(skb, fwd_set);
++      if (bend != NULL && netback_accel_can_filter(skb))
++              netback_accel_filter_packet(bend, skb);
++      spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
++}
++
++
++/*
++ * Inspect a transmitted packet.  If it is a broadcast ARP reply (a
++ * gratuitous ARP), send a "new local mac" message carrying the packet's
++ * source MAC to every backend in this port's forwarding table that has
++ * a context set.  All other packets are ignored.
++ */
++void netback_accel_tx_packet(struct sk_buff *skb, void *fwd_priv) 
++{
++      struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv;
++      struct fwd_struct *fwd;
++      unsigned long flags;
++      __u8 *mac;
++
++      BUG_ON(fwd_priv == NULL);
++
++      if (!(is_broadcast_ether_addr(skb_mac_header(skb))
++            && packet_is_arp_reply(skb)))
++              return;
++
++      /*
++       * update our fast path forwarding to reflect this
++       * gratuitous ARP; the source MAC follows the destination
++       * MAC in the Ethernet header
++       */
++      mac = skb_mac_header(skb)+ETH_ALEN;
++
++      DPRINTK("%s: found gratuitous ARP for %pM\n",
++              __FUNCTION__, mac);
++
++      spin_lock_irqsave(&fwd_set->fwd_lock, flags);
++      /*
++       * Might not be local, but let's tell them all it is,
++       * and they can restore the fastpath if they continue
++       * to get packets that way
++       */
++      list_for_each_entry(fwd, &fwd_set->fwd_list, link) {
++              struct netback_accel *bend = fwd->context;
++
++              if (bend == NULL)
++                      continue;
++              netback_accel_msg_tx_new_localmac(bend, mac);
++      }
++      spin_unlock_irqrestore(&fwd_set->fwd_lock, flags);
++}
diff --cc drivers/xen/sfc_netback/accel_msg.c

index 0000000,0000000..b8982a7

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_msg.c
@@@ -1,0 -1,0 +1,391 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <xen/evtchn.h>
++
++#include "accel.h"
++#include "accel_msg_iface.h"
++#include "accel_util.h"
++#include "accel_solarflare.h"
++
++/*
++ * Send a HELLO to the front end to start things off.
++ *
++ * The message carries the protocol version passed in and our buffer
++ * page quota (bend->quotas.max_buf_pages).  Once queued, the front end
++ * is notified over bend->msg_channel_irq.  If no message slot could be
++ * obtained the condition is reported via EPRINTK_ON and nothing is sent.
++ */
++void netback_accel_msg_tx_hello(struct netback_accel *bend, unsigned version)
++{
++      unsigned long lock_state;
++      struct net_accel_msg *msg;
++
++      msg = net_accel_msg_start_send(bend->shared_page, &bend->to_domU,
++                                     &lock_state);
++
++      /* The queue _cannot_ be full, we're the first users. */
++      EPRINTK_ON(msg == NULL);
++      if (msg == NULL)
++              return;
++
++      net_accel_msg_init(msg, NET_ACCEL_MSG_HELLO);
++      msg->u.hello.version = version;
++      msg->u.hello.max_pages = bend->quotas.max_buf_pages;
++
++      VPRINTK("Sending hello to channel %d\n", bend->msg_channel);
++      net_accel_msg_complete_send_notify(bend->shared_page, &bend->to_domU,
++                                         &lock_state,
++                                         bend->msg_channel_irq);
++}
++
++/*
++ * Send a LOCALMAC message to the vnic.
++ *
++ * bend: backend instance whose to_domU queue is used; must not be NULL
++ * type: value stored in the message's localmac.flags field
++ * mac:  ETH_ALEN bytes of MAC address to copy in; must not be NULL
++ *
++ * If the queue has no free slot the message is dropped and an error is
++ * logged.
++ */
++static void netback_accel_msg_tx_localmac(struct netback_accel *bend,
++                                        int type, const void *mac)
++{
++      struct net_accel_msg *msg;
++      unsigned long lock_state;
++
++      BUG_ON(bend == NULL || mac == NULL);
++
++      VPRINTK("Sending local mac message: %pM\n", mac);
++
++      msg = net_accel_msg_start_send(bend->shared_page, &bend->to_domU,
++                                     &lock_state);
++      if (msg == NULL) {
++              /*
++               * TODO if this happens we may leave a domU
++               * fastpathing packets when they should be delivered
++               * locally.  Solution is get domU to timeout entries
++               * in its fastpath lookup table when it receives no RX
++               * traffic
++               */
++              EPRINTK("%s: saw full queue, may need ARP timer to recover\n",
++                      __FUNCTION__);
++              return;
++      }
++
++      net_accel_msg_init(msg, NET_ACCEL_MSG_LOCALMAC);
++      msg->u.localmac.flags = type;
++      memcpy(msg->u.localmac.mac, mac, ETH_ALEN);
++      net_accel_msg_complete_send_notify(bend->shared_page, &bend->to_domU,
++                                         &lock_state,
++                                         bend->msg_channel_irq);
++}
++
++/*
++ * Tell the vnic that @mac is now a local MAC address: sends a LOCALMAC
++ * message whose flags field is NET_ACCEL_MSG_ADD.
++ */
++void netback_accel_msg_tx_new_localmac(struct netback_accel *bend,
++                                     const void *mac)
++{
++      const int add_flag = NET_ACCEL_MSG_ADD;
++
++      netback_accel_msg_tx_localmac(bend, add_flag, mac);
++}
++
++
++/*
++ * Handle a frontend request to map buffer pages.
++ *
++ * The page count in msg->u.mapbufs.pages must be a power of two and no
++ * larger than NET_ACCEL_MSG_MAX_PAGE_REQ; if so the grants are handed to
++ * netback_accel_add_buffers().
++ *
++ * The message is turned into a reply in place: NET_ACCEL_MSG_REPLY is
++ * always ORed into msg->id, and NET_ACCEL_MSG_ERROR as well on failure.
++ *
++ * Returns 0 on success, or a negative error code.
++ */
++static int netback_accel_msg_rx_buffer_map(struct netback_accel *bend,
++                                         struct net_accel_msg *msg)
++{
++      int order, rc;
++
++      /* Can only allocate in power of two */
++      order = log2_ge(msg->u.mapbufs.pages, 0);
++
++      if (msg->u.mapbufs.pages != pow2(order)) {
++              EPRINTK("%s: Can only alloc bufs in power of 2 sizes (%d)\n",
++                      __FUNCTION__, msg->u.mapbufs.pages);
++              rc = -EINVAL;
++      } else if (msg->u.mapbufs.pages > NET_ACCEL_MSG_MAX_PAGE_REQ) {
++              /*
++               * Sanity.  Assumes NET_ACCEL_MSG_MAX_PAGE_REQ is same
++               * for both directions/domains
++               */
++              EPRINTK("%s: too many pages in a single message: %d %d\n",
++                      __FUNCTION__, msg->u.mapbufs.pages,
++                      NET_ACCEL_MSG_MAX_PAGE_REQ);
++              rc = -EINVAL;
++      } else {
++              rc = netback_accel_add_buffers(bend, msg->u.mapbufs.pages,
++                                             order, msg->u.mapbufs.grants,
++                                             &msg->u.mapbufs.buf);
++              if (rc >= 0) {
++                      /* Success: plain reply, callers always see 0 */
++                      msg->id |= NET_ACCEL_MSG_REPLY;
++                      return 0;
++              }
++      }
++
++      /* Every failure above funnels through here with rc < 0 */
++      EPRINTK("%s: err_out\n", __FUNCTION__);
++      msg->id |= NET_ACCEL_MSG_ERROR | NET_ACCEL_MSG_REPLY;
++      return rc;
++}
++
++
++/*
++ * Hint from the frontend that one of our filters is out of date.
++ *
++ * Only messages with NET_ACCEL_MSG_REMOVE set in fastpath.flags are
++ * acted on: a filter spec is built from the message's mac/port/ip/proto
++ * and passed to netback_accel_filter_remove_spec().  Always returns 0.
++ */
++static int netback_accel_process_fastpath(struct netback_accel *bend,
++                                        struct net_accel_msg *msg)
++{
++      struct netback_accel_filter_spec spec;
++
++      if (!(msg->u.fastpath.flags & NET_ACCEL_MSG_REMOVE))
++              return 0;
++
++      /*
++       * Would be nice to BUG() this but would leave us vulnerable
++       * to naughty frontend
++       */
++      EPRINTK_ON(msg->u.fastpath.flags & NET_ACCEL_MSG_ADD);
++
++      memcpy(spec.mac, msg->u.fastpath.mac, ETH_ALEN);
++      spec.destport_be = msg->u.fastpath.port;
++      spec.destip_be = msg->u.fastpath.ip;
++      spec.proto = msg->u.fastpath.proto;
++
++      netback_accel_filter_remove_spec(bend, &spec);
++
++      return 0;
++}
++
++
++/*
++ * Flow control for message queues: raise the QUEUEUNOTFULL flag in the
++ * shared page and notify the remote end over the message channel IRQ,
++ * unless the flag was already raised.
++ */
++inline void set_queue_not_full(struct netback_accel *bend)
++{
++      unsigned long *aflags = (unsigned long *)&bend->shared_page->aflags;
++
++      if (test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B, aflags)) {
++              VPRINTK("queue not full bit already set, not signalling\n");
++              return;
++      }
++
++      notify_remote_via_irq(bend->msg_channel_irq);
++}
++
++
++/*
++ * Flow control for message queues: raise the QUEUE0FULL flag in the
++ * shared page and notify the remote end over the message channel IRQ,
++ * unless the flag was already raised.
++ */
++inline void set_queue_full(struct netback_accel *bend)
++{
++      unsigned long *aflags = (unsigned long *)&bend->shared_page->aflags;
++
++      if (test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B, aflags)) {
++              VPRINTK("queue full bit already set, not signalling\n");
++              return;
++      }
++
++      notify_remote_via_irq(bend->msg_channel_irq);
++}
++
++
++/*
++ * Publish the interface up/down state to the frontend: store @up in the
++ * shared page, raise the NETUPDOWN flag and, if the flag was not already
++ * pending, notify the remote end over the message channel IRQ.
++ */
++void netback_accel_set_interface_state(struct netback_accel *bend, int up)
++{
++      unsigned long *aflags;
++
++      /* The state must be written before the flag is raised */
++      bend->shared_page->net_dev_up = up;
++
++      aflags = (unsigned long *)&bend->shared_page->aflags;
++      if (test_and_set_bit(NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B, aflags)) {
++              VPRINTK("interface up/down bit already set, not signalling\n");
++              return;
++      }
++
++      notify_remote_via_irq(bend->msg_channel_irq);
++}
++
++
++/*
++ * Decide whether a frontend protocol version that differs from ours is
++ * acceptable.  Must only be called after a version mismatch.
++ *
++ * Returns -EPROTO for any differing version (newer or older), i.e. no
++ * other version is currently accepted.  -EINVAL is only reachable if
++ * the versions are equal and BUG_ON() did not stop execution.
++ */
++static int check_rx_hello_version(unsigned version)
++{
++      /* Should only happen if there's been a version mismatch */
++      BUG_ON(version == NET_ACCEL_MSG_VERSION);
++
++      /*
++       * version > ours: newer protocol, we must refuse.
++       * version < ours: we are newer, so have discretion to accept
++       * if we wish.  For now however, just reject.
++       */
++      if (version != NET_ACCEL_MSG_VERSION)
++              return -EPROTO;
++
++      return -EINVAL;
++}
++
++
++/*
++ * Dispatch one message received from the frontend, keyed on msg->id.
++ *
++ * Handled ids: HELLO reply, HELLO error reply, MAPBUF request and
++ * FASTPATH hint.  Returns 0 on success or a negative error code:
++ * -EPROTO for an unknown id or a message that arrives in the wrong
++ * bend->hw_state, otherwise whatever the per-message handler returned.
++ */
++static int process_rx_msg(struct netback_accel *bend,
++                        struct net_accel_msg *msg)
++{
++      int err = 0;
++                    
++      switch (msg->id) {
++      case NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_HELLO:
++              /* Reply to a HELLO; mark ourselves as connected */
++              DPRINTK("got Hello reply, version %.8x\n",
++                      msg->u.hello.version);
++              
++              /*
++               * Check that we've not successfully done this
++               * already.  NB no check at the moment that this reply
++               * comes after we've actually sent a HELLO as that's
++               * not possible with the current code structure
++               */
++              if (bend->hw_state != NETBACK_ACCEL_RES_NONE)
++                      return -EPROTO;
++
++              /* Store max_pages for accel_setup */
++              if (msg->u.hello.max_pages > bend->quotas.max_buf_pages) {
++                      EPRINTK("More pages than quota allows (%d > %d)\n",
++                              msg->u.hello.max_pages, 
++                              bend->quotas.max_buf_pages);
++                      /* Force it down to the quota */
++                      msg->u.hello.max_pages = bend->quotas.max_buf_pages;
++              }
++              bend->max_pages = msg->u.hello.max_pages;
++              
++              /* Set up the hardware visible to the other end */
++              err = bend->accel_setup(bend);
++              if (err) {
++                      /* This is fatal */
++                      DPRINTK("Hello gave accel_setup error %d\n", err);
++                      netback_accel_set_closing(bend);
++              } else {
++                      /*
++                       * Now add the context so that packet
++                       * forwarding will commence
++                       */
++                      netback_accel_fwd_set_context(bend->mac, bend, 
++                                                    bend->fwd_priv);
++              }
++              break;
++      case NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_ERROR:
++              EPRINTK("got Hello error, versions us:%.8x them:%.8x\n",
++                      NET_ACCEL_MSG_VERSION, msg->u.hello.version);
++
++              /* As for a successful reply: only valid before setup */
++              if (bend->hw_state != NETBACK_ACCEL_RES_NONE)
++                      return -EPROTO;
++
++              if (msg->u.hello.version != NET_ACCEL_MSG_VERSION) {
++                      /* Error is due to version mismatch */
++                      err = check_rx_hello_version(msg->u.hello.version);
++                      if (err == 0) {
++                              /*
++                               * It's OK to be compatible, send
++                               * another hello with compatible version
++                               */
++                              netback_accel_msg_tx_hello
++                                      (bend, msg->u.hello.version);
++                      } else {
++                              /*
++                               * Tell frontend that we're not going to
++                               * send another HELLO by going to Closing.
++                               */
++                              netback_accel_set_closing(bend);
++                      }
++              } 
++              break;
++      case NET_ACCEL_MSG_MAPBUF:
++              VPRINTK("Got mapped buffers request %d\n",
++                      msg->u.mapbufs.reqid);
++
++              /* Buffers can only be mapped once hardware is set up */
++              if (bend->hw_state == NETBACK_ACCEL_RES_NONE)
++                      return -EPROTO;
++
++              /*
++               * Frontend wants a buffer table entry for the
++               * supplied pages
++               */
++              err = netback_accel_msg_rx_buffer_map(bend, msg);
++              /*
++               * The reply is sent whether or not the mapping worked;
++               * a mapping failure is still returned in err below.
++               */
++              if (net_accel_msg_reply_notify(bend->shared_page,
++                                             bend->msg_channel_irq, 
++                                             &bend->to_domU, msg)) {
++                      /*
++                       * This is fatal as we can't tell the frontend
++                       * about the problem through the message
++                       * queue, and so would otherwise stalemate
++                       */
++                      netback_accel_set_closing(bend);
++              }
++              break;
++      case NET_ACCEL_MSG_FASTPATH:
++              DPRINTK("Got fastpath request\n");
++
++              if (bend->hw_state == NETBACK_ACCEL_RES_NONE)
++                      return -EPROTO;
++
++              err = netback_accel_process_fastpath(bend, msg);
++              break;
++      default:
++              EPRINTK("Huh? Message code is %x\n", msg->id);
++              err = -EPROTO;
++              break;
++      }
++      return err;
++}
++
++
++/*
++ * Demultiplex an IRQ from the frontend driver.
++ *
++ * Runs with bend->bend_mutex held for its whole body.  It first
++ * acknowledges the flag bits addressed to dom0 in the shared page, then
++ * drains the from_domU queue, handing each message to process_rx_msg().
++ * Draining stops at the first message that fails.  If the frontend had
++ * flagged the queue as full, it is told afterwards that there is space.
++ *
++ * The signature depends on the kernel version: from 2.6.20 the argument
++ * is the work_struct embedded in struct netback_accel (handle_msg),
++ * before that it is the struct netback_accel pointer itself.
++ */
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++void netback_accel_msg_rx_handler(struct work_struct *arg)
++#else
++void netback_accel_msg_rx_handler(void *bend_void)
++#endif
++{
++      struct net_accel_msg msg;
++      int err, queue_was_full = 0;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++      struct netback_accel *bend = 
++              container_of(arg, struct netback_accel, handle_msg);
++#else
++      struct netback_accel *bend = (struct netback_accel *)bend_void;
++#endif
++
++      mutex_lock(&bend->bend_mutex);
++
++      /*
++       * This happens when the shared pages have been unmapped, but
++       * the workqueue not flushed yet
++       */
++      if (bend->shared_page == NULL)
++              goto done;
++
++      if ((bend->shared_page->aflags &
++           NET_ACCEL_MSG_AFLAGS_TO_DOM0_MASK) != 0) {
++              if (bend->shared_page->aflags &
++                  NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL) {
++                      /* We've been told there may now be space. */
++                      clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B, 
++                                (unsigned long *)&bend->shared_page->aflags);
++              }
++
++              if (bend->shared_page->aflags &
++                  NET_ACCEL_MSG_AFLAGS_QUEUEUFULL) {
++                      /* Remember it so we can signal "not full" later */
++                      clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B, 
++                                (unsigned long *)&bend->shared_page->aflags);
++                      queue_was_full = 1;
++              }
++      }
++
++      /* Copy each pending message into msg and process it */
++      while ((err = net_accel_msg_recv(bend->shared_page, &bend->from_domU,
++                                       &msg)) == 0) {
++              err = process_rx_msg(bend, &msg);
++              
++              if (err != 0) {
++                      EPRINTK("%s: Error %d\n", __FUNCTION__, err);
++                      goto err;
++              }
++      }
++
++ err:
++      /* There will be space now if we can make any. */
++      if (queue_was_full) 
++              set_queue_not_full(bend);
++ done:
++      mutex_unlock(&bend->bend_mutex);
++
++      return;
++}
diff --cc drivers/xen/sfc_netback/accel_solarflare.c

index 0000000,0000000..f5809a2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_solarflare.c
@@@ -1,0 -1,0 +1,1292 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include "common.h"
++
++#include "accel.h"
++#include "accel_solarflare.h"
++#include "accel_msg_iface.h"
++#include "accel_util.h"
++
++#include "accel_cuckoo_hash.h"
++
++#include "ci/driver/resource/efx_vi.h"
++
++#include "ci/efrm/nic_table.h" 
++#include "ci/efhw/public.h"
++
++#include <xen/evtchn.h>
++#include <linux/list.h>
++#include <linux/mutex.h>
++
++#include "driverlink_api.h"
++
++#define SF_XEN_RX_USR_BUF_SIZE 2048
++
++/*
++ * Per-backend hardware state, allocated by alloc_page_state() and hung
++ * off struct netback_accel as accel_hw_priv.
++ */
++struct falcon_bend_accel_priv {
++      /*! Handle filled in by efx_vi_alloc(), released by efx_vi_free() */
++      struct efx_vi_state *efx_vih;
++
++      /*! Array of pointers to dma_map state, used so VNIC can
++       *  request their removal in a single message
++       */
++      struct efx_vi_dma_map_state **dma_maps;
++      /*! Index into dma_maps */
++      int dma_maps_index; 
++
++      /*! Serialises access to filters */
++      spinlock_t filter_lock;      
++      /*! Bitmap of which filters are free */
++      unsigned long free_filters;      
++      /*! Used for index normalisation */
++      u32 filter_idx_mask;            
++      /*! NOTE(review): presumably the table of filter specs - confirm */
++      struct netback_accel_filter_spec *fspecs; 
++      /*! NOTE(review): presumably maps filter keys to indices - confirm */
++      cuckoo_hash_table filter_hash_table;
++
++      /*! Grants for the TX and RX DMA queues (ungranted on release) */
++      u32 txdmaq_gnt;
++      u32 rxdmaq_gnt;
++      /*! Grant for the doorbell (ungranted on release) */
++      u32 doorbell_gnt;
++      /*! Event queue rptr grant; only ungranted for Falcon A hardware */
++      u32 evq_rptr_gnt;
++      /*! NOTE(review): presumably grants for the event queue pages */
++      u32 evq_mem_gnts[EF_HW_FALCON_EVQ_PAGES];
++      /*! NOTE(review): presumably how many evq_mem_gnts are in use */
++      u32 evq_npages;
++};
++
++/* Forward declaration */
++static int netback_accel_filter_init(struct netback_accel *);
++static void netback_accel_filter_shutdown(struct netback_accel *);
++
++/**************************************************************************
++ * 
++ * Driverlink stuff
++ *
++ **************************************************************************/
++
++/* One entry per driverlink device probed by bend_dl_probe() */
++struct driverlink_port {
++      /* Linkage on the global dl_ports list */
++      struct list_head link;
++      /* Hardware type, derived from the silicon revision at probe time */
++      enum net_accel_hw_type type;
++      /* Network device handed to the probe callback */
++      struct net_device *net_dev;
++      /* Driverlink device this port belongs to; its priv points back here */
++      struct efx_dl_device *efx_dl_dev;
++      /* Forwarding state returned by netback_accel_init_fwd_port() */
++      void *fwd_priv;
++};
++
++/* All probed driverlink ports; initialised in netback_accel_sf_init() */
++static struct list_head dl_ports;
++
++/* This mutex protects global state, such as the dl_ports list */
++DEFINE_MUTEX(accel_mutex);
++
++/*
++ * Non-zero once efx_dl_register_driver() has succeeded, so that
++ * netback_accel_sf_shutdown() knows whether there is anything to undo
++ */
++static int init_done = 0;
++
++/* The DL callbacks */
++
++
++/*
++ * Driverlink TX callback.  Frames whose MAC header has been set are
++ * passed to netback_accel_tx_packet(); others are counted as bad and
++ * ignored.  The packet is never vetoed.
++ */
++#if defined(EFX_USE_FASTCALL)
++static enum efx_veto fastcall
++#else
++static enum efx_veto
++#endif
++bend_dl_tx_packet(struct efx_dl_device *efx_dl_dev,
++                struct sk_buff *skb)
++{
++      struct driverlink_port *dl_port = efx_dl_dev->priv;
++
++      BUG_ON(dl_port == NULL);
++
++      NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_packets++);
++
++      if (!skb_mac_header_was_set(skb)) {
++              DPRINTK("Ignoring packet with missing mac address\n");
++              NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_bad_packets++);
++      } else {
++              netback_accel_tx_packet(skb, dl_port->fwd_priv);
++      }
++
++      return EFX_ALLOW_PACKET;
++}
++
++/* EFX_USE_FASTCALL */
++/*
++ * Driverlink RX callback.  Wraps the raw receive buffer in a
++ * struct netback_pkt_buf (MAC header at the start of the buffer,
++ * network header ETH_HLEN bytes in, protocol taken from the Ethernet
++ * header) and hands it to netback_accel_rx_packet().  The packet is
++ * never vetoed.  pkt_len is not used.
++ */
++#if defined(EFX_USE_FASTCALL)
++static enum efx_veto fastcall
++#else
++static enum efx_veto
++#endif
++bend_dl_rx_packet(struct efx_dl_device *efx_dl_dev,
++                const char *pkt_buf, int pkt_len)
++{
++      struct driverlink_port *dl_port = efx_dl_dev->priv;
++      struct ethhdr *eth_hdr = (struct ethhdr *)pkt_buf;
++      struct netback_pkt_buf pkt;
++
++      BUG_ON(dl_port == NULL);
++
++      pkt.mac.raw = (char *)pkt_buf;
++      pkt.nh.raw = (char *)pkt_buf + ETH_HLEN;
++      pkt.protocol = eth_hdr->h_proto;
++
++      NETBACK_ACCEL_STATS_OP(global_stats.dl_rx_packets++);
++      netback_accel_rx_packet(&pkt, dl_port->fwd_priv);
++
++      return EFX_ALLOW_PACKET;
++}
++
++
++/*
++ * Callbacks we'd like to get from the netdriver through driverlink.
++ * Registered per device in bend_dl_probe() and unregistered again on
++ * removal or on a failed reset resume.
++ */
++struct efx_dl_callbacks bend_dl_callbacks =
++      {
++              .tx_packet = bend_dl_tx_packet,
++              .rx_packet = bend_dl_rx_packet,
++      };
++
++
++/*
++ * Hooks handed to netback_connect_accelerator().  Initialised
++ * positionally: owning module, then the probe and remove entry points.
++ */
++static struct netback_accel_hooks accel_hooks = {
++      THIS_MODULE,
++      &netback_accel_probe,
++      &netback_accel_remove
++};
++
++
++/*
++ * Driver link probe - register our callbacks.
++ *
++ * Steps, each undone in reverse order by the fail labels at the bottom:
++ *   1. map silicon_rev to a hardware type (unknown silicon: -EINVAL)
++ *   2. allocate the driverlink_port and tie it to efx_dl_dev->priv
++ *   3. create the forwarding state for the port
++ *   4. register the TX/RX driverlink callbacks
++ *   5. add the port to dl_ports (under accel_mutex)
++ *   6. connect to netback as an accelerator for this net device
++ *
++ * Returns 0 on success, otherwise the error code of the failing step.
++ */
++static int bend_dl_probe(struct efx_dl_device *efx_dl_dev,
++                       const struct net_device *net_dev,
++                       const struct efx_dl_device_info *dev_info,
++                       const char* silicon_rev)
++{
++      int rc;
++      enum net_accel_hw_type type;
++      struct driverlink_port *port;
++
++      DPRINTK("%s: %s\n", __FUNCTION__, silicon_rev);
++
++      if (strcmp(silicon_rev, "falcon/a1") == 0)
++              type = NET_ACCEL_MSG_HWTYPE_FALCON_A;
++      else if (strcmp(silicon_rev, "falcon/b0") == 0)
++              type = NET_ACCEL_MSG_HWTYPE_FALCON_B;
++      else if (strcmp(silicon_rev, "siena/a0") == 0)
++              type = NET_ACCEL_MSG_HWTYPE_SIENA_A;
++      else {
++              EPRINTK("%s: unsupported silicon %s\n", __FUNCTION__,
++                      silicon_rev);
++              rc = -EINVAL;
++              goto fail1;
++      }
++      
++      port = kmalloc(sizeof(struct driverlink_port), GFP_KERNEL);
++      if (port == NULL) {
++              EPRINTK("%s: no memory for dl probe\n", __FUNCTION__);
++              rc = -ENOMEM;
++              goto fail1;
++      }
++
++      /* Cross-link the port and the driverlink device */
++      port->efx_dl_dev = efx_dl_dev;
++      efx_dl_dev->priv = port;
++
++      port->fwd_priv = netback_accel_init_fwd_port();
++      if (port->fwd_priv == NULL) {
++              EPRINTK("%s: failed to set up forwarding for port\n",
++                      __FUNCTION__);
++              rc = -ENOMEM;
++              goto fail2;
++      }
++
++      rc = efx_dl_register_callbacks(efx_dl_dev, &bend_dl_callbacks);
++      if (rc != 0) {
++              EPRINTK("%s: register_callbacks failed\n", __FUNCTION__);
++              goto fail3;
++      }
++
++      port->type = type;
++      /* The const is cast away because the port stores a non-const pointer */
++      port->net_dev = (struct net_device *)net_dev;
++
++      mutex_lock(&accel_mutex);
++      list_add(&port->link, &dl_ports);
++      mutex_unlock(&accel_mutex);
++
++      rc = netback_connect_accelerator(NETBACK_ACCEL_VERSION, 0,
++                                       port->net_dev->name, &accel_hooks);
++
++      /* Any non-zero result, of either sign, is treated as a failure */
++      if (rc < 0) {
++              EPRINTK("Xen netback accelerator version mismatch\n");
++              goto fail4;
++      } else if (rc > 0) {
++              /*
++               * In future may want to add backwards compatibility
++               * and accept certain subsets of previous versions
++               */
++              EPRINTK("Xen netback accelerator version mismatch\n");
++              goto fail4;
++      } 
++
++      return 0;
++
++ fail4:
++      mutex_lock(&accel_mutex);
++      list_del(&port->link);
++      mutex_unlock(&accel_mutex);
++
++      efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks);
++ fail3: 
++      netback_accel_shutdown_fwd_port(port->fwd_priv);
++ fail2:
++      efx_dl_dev->priv = NULL;
++      kfree(port);
++ fail1:
++      return rc;
++}
++
++
++/*
++ * Driverlink remove callback: disconnect from netback, take the port
++ * off dl_ports (both under accel_mutex), then unregister the driverlink
++ * callbacks, shut down the port's forwarding state and free the port.
++ */
++static void bend_dl_remove(struct efx_dl_device *efx_dl_dev)
++{
++      struct driverlink_port *dl_port;
++
++      DPRINTK("Unregistering driverlink callbacks.\n");
++
++      mutex_lock(&accel_mutex);
++
++      dl_port = efx_dl_dev->priv;
++
++      BUG_ON(list_empty(&dl_ports));
++      BUG_ON(dl_port == NULL);
++      BUG_ON(dl_port->efx_dl_dev != efx_dl_dev);
++
++      netback_disconnect_accelerator(0, dl_port->net_dev->name);
++      list_del(&dl_port->link);
++
++      mutex_unlock(&accel_mutex);
++
++      /* The remaining teardown runs without the global mutex held */
++      efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks);
++      netback_accel_shutdown_fwd_port(dl_port->fwd_priv);
++
++      efx_dl_dev->priv = NULL;
++      kfree(dl_port);
++}
++
++
++/*
++ * Driverlink reset-suspend callback: disconnect the port's net device
++ * from netback under accel_mutex.  The port itself stays on dl_ports.
++ */
++static void bend_dl_reset_suspend(struct efx_dl_device *efx_dl_dev)
++{
++      struct driverlink_port *dl_port;
++
++      DPRINTK("Driverlink reset suspend.\n");
++
++      mutex_lock(&accel_mutex);
++
++      dl_port = efx_dl_dev->priv;
++
++      BUG_ON(list_empty(&dl_ports));
++      BUG_ON(dl_port == NULL);
++      BUG_ON(dl_port->efx_dl_dev != efx_dl_dev);
++
++      netback_disconnect_accelerator(0, dl_port->net_dev->name);
++
++      mutex_unlock(&accel_mutex);
++}
++
++
++/*
++ * Driverlink reset-resume callback.  Does nothing unless @ok is set.
++ * Otherwise reconnects the port's net device to netback; if that fails
++ * the port is torn down completely (removed from dl_ports, callbacks
++ * unregistered, forwarding state shut down, port freed).
++ */
++static void bend_dl_reset_resume(struct efx_dl_device *efx_dl_dev, int ok)
++{
++      struct driverlink_port *dl_port;
++      int rc;
++
++      DPRINTK("Driverlink reset resume.\n");
++
++      if (!ok)
++              return;
++
++      dl_port = efx_dl_dev->priv;
++
++      BUG_ON(list_empty(&dl_ports));
++      BUG_ON(dl_port == NULL);
++      BUG_ON(dl_port->efx_dl_dev != efx_dl_dev);
++
++      rc = netback_connect_accelerator(NETBACK_ACCEL_VERSION, 0,
++                                       dl_port->net_dev->name,
++                                       &accel_hooks);
++      if (rc == 0)
++              return;
++
++      EPRINTK("Xen netback accelerator version mismatch\n");
++
++      mutex_lock(&accel_mutex);
++      list_del(&dl_port->link);
++      mutex_unlock(&accel_mutex);
++
++      efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks);
++      netback_accel_shutdown_fwd_port(dl_port->fwd_priv);
++
++      efx_dl_dev->priv = NULL;
++      kfree(dl_port);
++}
++
++
++/*
++ * Driverlink driver description: registered in netback_accel_sf_init()
++ * and unregistered in netback_accel_sf_shutdown().
++ */
++static struct efx_dl_driver bend_dl_driver = 
++      {
++              .name = "SFC Xen backend",
++              .probe = bend_dl_probe,
++              .remove = bend_dl_remove,
++              .reset_suspend = bend_dl_reset_suspend,
++              .reset_resume = bend_dl_reset_resume
++      };
++
++
++/*
++ * Module-level initialisation of the Solarflare backend: set up the
++ * port list and register with driverlink.  On success every NIC gets
++ * its RX user buffer size set to SF_XEN_RX_USR_BUF_SIZE.
++ *
++ * Returns the result of efx_dl_register_driver(); init_done records
++ * whether registration succeeded (left untouched on -ENOENT).
++ */
++int netback_accel_sf_init(void)
++{
++      struct efhw_nic *nic;
++      int nic_i;
++      int rc;
++
++      INIT_LIST_HEAD(&dl_ports);
++
++      rc = efx_dl_register_driver(&bend_dl_driver);
++
++      /* If we couldn't find the NET driver, give up */
++      if (rc == -ENOENT)
++              return rc;
++
++      if (rc != 0) {
++              init_done = 0;
++              return rc;
++      }
++
++      EFRM_FOR_EACH_NIC(nic_i, nic)
++              falcon_nic_set_rx_usr_buf_size(nic, SF_XEN_RX_USR_BUF_SIZE);
++
++      init_done = 1;
++      return 0;
++}
++
++
++/*
++ * Undo netback_accel_sf_init().  A no-op unless driverlink registration
++ * succeeded (init_done set).
++ */
++void netback_accel_sf_shutdown(void)
++{
++      if (init_done) {
++              DPRINTK("Unregistering driverlink driver\n");
++
++              /*
++               * This will trigger removal callbacks for all the
++               * devices, which will unregister their callbacks,
++               * disconnect from netfront, etc.
++               */
++              efx_dl_unregister_driver(&bend_dl_driver);
++      }
++}
++
++
++/*
++ * Bind a backend to the driverlink port whose net device name equals
++ * bend->nicname.  On a match the backend picks up the port's hardware
++ * type, forwarding state and net device, and gets its accel_setup /
++ * accel_shutdown hooks installed.
++ *
++ * Returns 0 on a match, -ENOENT (after logging) if no port matches.
++ */
++int netback_accel_sf_hwtype(struct netback_accel *bend)
++{
++      struct driverlink_port *candidate;
++      int rc = -ENOENT;
++
++      mutex_lock(&accel_mutex);
++
++      list_for_each_entry(candidate, &dl_ports, link) {
++              if (strcmp(bend->nicname, candidate->net_dev->name) != 0)
++                      continue;
++
++              bend->hw_type = candidate->type;
++              bend->accel_setup = netback_accel_setup_vnic_hw;
++              bend->accel_shutdown = netback_accel_shutdown_vnic_hw;
++              bend->fwd_priv = candidate->fwd_priv;
++              bend->net_dev = candidate->net_dev;
++              rc = 0;
++              break;
++      }
++
++      mutex_unlock(&accel_mutex);
++
++      if (rc != 0)
++              EPRINTK("Failed to identify backend device '%s' with a NIC\n",
++                      bend->nicname);
++
++      return rc;
++}
++
++
++/****************************************************************************
++ * Resource management code
++ ***************************************************************************/
++
++/*
++ * Allocate the per-backend hardware state together with the arrays
++ * used to track mapped buffer pages.  All allocations are zero-filled.
++ *
++ * bend:      backend to attach the state to
++ * max_pages: number of buffer pages to size the arrays for; must be
++ *            between 0 and bend->quotas.max_buf_pages inclusive
++ *
++ * On success bend->accel_hw_priv, bend->buffer_maps and
++ * bend->buffer_addrs are valid.  On failure everything allocated here
++ * has been freed again, bend->accel_hw_priv is untouched and
++ * bend->buffer_maps is not left pointing at freed memory.
++ *
++ * Returns 0, -EINVAL for an out-of-range max_pages, or -ENOMEM.
++ */
++static int alloc_page_state(struct netback_accel *bend, int max_pages)
++{
++      struct falcon_bend_accel_priv *accel_hw_priv;
++
++      if (max_pages < 0 || max_pages > bend->quotas.max_buf_pages) {
++              EPRINTK("%s: invalid max_pages: %d\n", __FUNCTION__, max_pages);
++              return -EINVAL;
++      }
++
++      accel_hw_priv = kzalloc(sizeof(*accel_hw_priv), GFP_KERNEL);
++      if (accel_hw_priv == NULL) {
++              EPRINTK("%s: no memory for accel_hw_priv\n", __FUNCTION__);
++              return -ENOMEM;
++      }
++
++      /*
++       * One slot per NET_ACCEL_MSG_MAX_PAGE_REQ pages.  Sizing by the
++       * element the pointer refers to keeps this correct if the
++       * declaration of dma_maps ever changes.
++       */
++      accel_hw_priv->dma_maps = kzalloc
++              (sizeof(*accel_hw_priv->dma_maps) *
++               (max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ), GFP_KERNEL);
++      if (accel_hw_priv->dma_maps == NULL) {
++              EPRINTK("%s: no memory for dma_maps\n", __FUNCTION__);
++              goto fail_priv;
++      }
++
++      bend->buffer_maps = kzalloc(sizeof(struct vm_struct *) * max_pages,
++                                  GFP_KERNEL);
++      if (bend->buffer_maps == NULL) {
++              EPRINTK("%s: no memory for buffer_maps\n", __FUNCTION__);
++              goto fail_dma_maps;
++      }
++
++      bend->buffer_addrs = kzalloc(sizeof(u64) * max_pages, GFP_KERNEL);
++      if (bend->buffer_addrs == NULL) {
++              EPRINTK("%s: no memory for buffer_addrs\n", __FUNCTION__);
++              goto fail_buffer_maps;
++      }
++
++      bend->accel_hw_priv = accel_hw_priv;
++
++      return 0;
++
++ fail_buffer_maps:
++      kfree(bend->buffer_maps);
++      /* Don't leave a stale pointer behind in the backend */
++      bend->buffer_maps = NULL;
++ fail_dma_maps:
++      kfree(accel_hw_priv->dma_maps);
++ fail_priv:
++      kfree(accel_hw_priv);
++      return -ENOMEM;
++}
++
++
++/*
++ * Release everything alloc_page_state() set up for this backend and
++ * reset bend->accel_hw_priv and bend->max_pages.  Safe to call when no
++ * state is attached.  Always returns 0.
++ */
++static int free_page_state(struct netback_accel *bend)
++{
++      struct falcon_bend_accel_priv *priv;
++
++      DPRINTK("%s: %p\n", __FUNCTION__, bend);
++
++      priv = bend->accel_hw_priv;
++      if (priv == NULL)
++              return 0;
++
++      kfree(priv->dma_maps);
++      kfree(bend->buffer_maps);
++      kfree(bend->buffer_addrs);
++      kfree(priv);
++
++      bend->accel_hw_priv = NULL;
++      bend->max_pages = 0;
++
++      return 0;
++}
++
++
++/*
++ * The timeout event callback for the event q.
++ *
++ * context:    the struct netback_accel registered with the callback
++ * is_timeout: non-zero for a timeout event, zero for a wakeup event
++ *
++ * Either kind of event is counted and then forwarded to the front end
++ * over the net channel IRQ.
++ */
++static void bend_evq_timeout(void *context, int is_timeout)
++{
++      struct netback_accel *bend = context;
++
++      if (!is_timeout) {
++              /* It's a wakeup event, used by Falcon */
++              VPRINTK("wakeup to %d\n", bend->net_channel);
++              NETBACK_ACCEL_STATS_OP(bend->stats.evq_wakeups++);
++      } else {
++              /* Pass event to vnic front end driver */
++              VPRINTK("timeout event to %d\n", bend->net_channel);
++              NETBACK_ACCEL_STATS_OP(bend->stats.evq_timeouts++);
++      }
++
++      notify_remote_via_irq(bend->net_channel_irq);
++}
++
++
++/*
++ * Create the eventq and associated gubbins for communication with the
++ * front end vnic driver.
++ *
++ * Allocates the page state, then a virtual interface for the backend's
++ * net device, then registers bend_evq_timeout() as its event queue
++ * callback.  On success hw_state moves from RES_NONE to RES_ALLOC.
++ *
++ * Returns 0, or the error of the failing step with the earlier steps
++ * undone.
++ */
++static int ef_get_vnic(struct netback_accel *bend)
++{
++      struct falcon_bend_accel_priv *priv;
++      int rc;
++
++      BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_NONE);
++
++      /* Allocate page related state and accel_hw_priv */
++      rc = alloc_page_state(bend, bend->max_pages);
++      if (rc != 0) {
++              EPRINTK("Failed to allocate page state: %d\n", rc);
++              return rc;
++      }
++
++      priv = bend->accel_hw_priv;
++
++      rc = efx_vi_alloc(&priv->efx_vih, bend->net_dev->ifindex);
++      if (rc != 0) {
++              EPRINTK("%s: efx_vi_alloc failed %d\n", __FUNCTION__, rc);
++              goto fail_page_state;
++      }
++
++      rc = efx_vi_eventq_register_callback(priv->efx_vih, bend_evq_timeout,
++                                           bend);
++      if (rc != 0) {
++              EPRINTK("%s: register_callback failed %d\n", __FUNCTION__, rc);
++              goto fail_vi;
++      }
++
++      bend->hw_state = NETBACK_ACCEL_RES_ALLOC;
++
++      return 0;
++
++ fail_vi:
++      efx_vi_free(priv->efx_vih);
++ fail_page_state:
++      free_page_state(bend);
++      return rc;
++}
++
++
++/*
++ * Undo ef_get_vnic(): remove the event queue callback, free the
++ * virtual interface and the page state, and return the backend to
++ * NETBACK_ACCEL_RES_NONE.
++ */
++static void ef_free_vnic(struct netback_accel *bend)
++{
++      struct falcon_bend_accel_priv *priv;
++
++      BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_ALLOC);
++
++      priv = bend->accel_hw_priv;
++
++      /* The callback is removed first, then the VI itself is freed */
++      efx_vi_eventq_kill_callback(priv->efx_vih);
++
++      DPRINTK("Hardware is freeable. Will proceed.\n");
++
++      efx_vi_free(priv->efx_vih);
++      priv->efx_vih = NULL;
++
++      VPRINTK("Free page state...\n");
++      free_page_state(bend);
++
++      bend->hw_state = NETBACK_ACCEL_RES_NONE;
++}
++
++
++/*
++ * Revoke a grant.  If revoking reports -EBUSY, shut down the remote
++ * domain instead.
++ */
++static inline void ungrant_or_crash(grant_ref_t gntref, int domain)
++{
++      if (net_accel_ungrant_page(gntref) != -EBUSY)
++              return;
++
++      net_accel_shutdown_remote(domain);
++}
++
++
++/*
++ * Revoke every grant recorded in accel_hw_priv (DMA queues, doorbell,
++ * event queue read pointer on Falcon A, event queue memory) and drop
++ * the backend back to NETBACK_ACCEL_RES_FILTER.
++ */
++static void netback_accel_release_hwinfo(struct netback_accel *bend)
++{
++      struct falcon_bend_accel_priv *priv = bend->accel_hw_priv;
++      int page;
++
++      /* DMA queues */
++      DPRINTK("Remove dma q grants %d %d\n", priv->txdmaq_gnt,
++              priv->rxdmaq_gnt);
++      ungrant_or_crash(priv->txdmaq_gnt, bend->far_end);
++      ungrant_or_crash(priv->rxdmaq_gnt, bend->far_end);
++
++      /* Doorbell page */
++      DPRINTK("Remove doorbell grant %d\n", priv->doorbell_gnt);
++      ungrant_or_crash(priv->doorbell_gnt, bend->far_end);
++
++      /* The read pointer grant is only released for Falcon A */
++      if (bend->hw_type == NET_ACCEL_MSG_HWTYPE_FALCON_A) {
++              DPRINTK("Remove rptr grant %d\n", priv->evq_rptr_gnt);
++              ungrant_or_crash(priv->evq_rptr_gnt, bend->far_end);
++      }
++
++      /* Event queue memory, one grant per page */
++      page = 0;
++      while (page < priv->evq_npages) {
++              DPRINTK("Remove evq grant %d\n", priv->evq_mem_gnts[page]);
++              ungrant_or_crash(priv->evq_mem_gnts[page], bend->far_end);
++              page++;
++      }
++
++      bend->hw_state = NETBACK_ACCEL_RES_FILTER;
++}
++
++
++/*
++ * Fill in the hardware description that is common to all supported
++ * NIC types and create grants for the pages described in it: the TX
++ * and RX DMA queues, the doorbell page and the event queue memory.
++ * Every grant reference is stored both in the outgoing message
++ * (hwinfo) and in accel_hw_priv, so that it can be revoked later.
++ *
++ * Returns 0 on success.  Otherwise returns the error from
++ * efx_vi_hw_resource_get_phys() or net_accel_grant_page(); grants
++ * already made by this call are revoked before returning.
++ */
++static int ef_bend_hwinfo_falcon_common(struct netback_accel *bend, 
++                                      struct net_accel_hw_falcon_b *hwinfo)
++{
++      struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
++      struct efx_vi_hw_resource_metadata res_mdata;
++      struct efx_vi_hw_resource res_array[EFX_VI_HW_RESOURCE_MAXSIZE];
++      int rc, len = EFX_VI_HW_RESOURCE_MAXSIZE, i, pfn = 0;
++      unsigned long txdmaq_pfn = 0, rxdmaq_pfn = 0;
++
++      /* len is in/out: array capacity on entry, used as entry count below */
++      rc = efx_vi_hw_resource_get_phys(accel_hw_priv->efx_vih, &res_mdata,
++                                       res_array, &len);
++      if (rc != 0) {
++              DPRINTK("%s: resource_get_phys returned %d\n",
++                      __FUNCTION__, rc);
++              return rc;
++      }
++
++      /* Copy the NIC and queue metadata straight into the message */
++      hwinfo->nic_arch = res_mdata.nic_arch;
++      hwinfo->nic_variant = res_mdata.nic_variant;
++      hwinfo->nic_revision = res_mdata.nic_revision;
++
++      hwinfo->evq_order = res_mdata.evq_order;
++      hwinfo->evq_offs = res_mdata.evq_offs;
++      hwinfo->evq_capacity = res_mdata.evq_capacity;
++      hwinfo->instance = res_mdata.instance;
++      hwinfo->rx_capacity = res_mdata.rx_capacity;
++      hwinfo->tx_capacity = res_mdata.tx_capacity;
++
++      VPRINTK("evq_order %d evq_offs %d evq_cap %d inst %d rx_cap %d tx_cap %d\n",
++              hwinfo->evq_order, hwinfo->evq_offs, hwinfo->evq_capacity,
++              hwinfo->instance, hwinfo->rx_capacity, hwinfo->tx_capacity);
++
++      /*
++       * Walk the returned resources and remember the ones that have
++       * to be granted.  NOTE(review): a resource type that is absent
++       * from the array leaves its pfn at the initial 0 - confirm the
++       * driver always reports TXDMAQ, RXDMAQ and EVQMEMKVA.
++       */
++      for (i = 0; i < len; i++) {
++              struct efx_vi_hw_resource *res = &(res_array[i]);
++              switch (res->type) {
++              case EFX_VI_HW_RESOURCE_TXDMAQ:
++                      txdmaq_pfn = page_to_pfn(virt_to_page(res->address));
++                      break;
++              case EFX_VI_HW_RESOURCE_RXDMAQ: 
++                      rxdmaq_pfn = page_to_pfn(virt_to_page(res->address));
++                      break;
++              case EFX_VI_HW_RESOURCE_EVQTIMER:
++                      break;
++              case EFX_VI_HW_RESOURCE_EVQRPTR:
++              case EFX_VI_HW_RESOURCE_EVQRPTR_OFFSET:
++                      hwinfo->evq_rptr = res->address;
++                      break;
++              case EFX_VI_HW_RESOURCE_EVQMEMKVA: 
++                      /* pfn is the first of 2^evq_order event queue pages */
++                      accel_hw_priv->evq_npages =  1 << res_mdata.evq_order;
++                      pfn = page_to_pfn(virt_to_page(res->address));
++                      break;
++              case EFX_VI_HW_RESOURCE_BELLPAGE:
++                      hwinfo->doorbell_mfn  = res->address;
++                      break;
++              default:
++                      EPRINTK("%s: Unknown hardware resource type %d\n",
++                              __FUNCTION__, res->type);
++                      break;
++              }
++      }
++
++      VPRINTK("Passing txdmaq page pfn %lx\n", txdmaq_pfn);
++      rc = net_accel_grant_page(bend->hdev_data, pfn_to_mfn(txdmaq_pfn), 0);
++      if (rc < 0)
++              goto fail0;
++      accel_hw_priv->txdmaq_gnt = hwinfo->txdmaq_gnt = rc;
++
++      VPRINTK("Passing rxdmaq page pfn %lx\n", rxdmaq_pfn);
++      rc = net_accel_grant_page(bend->hdev_data, pfn_to_mfn(rxdmaq_pfn), 0);
++      if (rc < 0)
++              goto fail1;
++      accel_hw_priv->rxdmaq_gnt = hwinfo->rxdmaq_gnt = rc;
++
++      VPRINTK("Passing doorbell page mfn %x\n", hwinfo->doorbell_mfn);
++      /* Make the relevant H/W pages mappable by the far end */
++      /*
++       * NOTE(review): the last argument is 1 only for the doorbell
++       * page - presumably it marks a device rather than memory page;
++       * confirm against net_accel_grant_page().
++       */
++      rc = net_accel_grant_page(bend->hdev_data, hwinfo->doorbell_mfn, 1);
++      if (rc < 0)
++              goto fail2;
++      accel_hw_priv->doorbell_gnt = hwinfo->doorbell_gnt = rc;
++      
++      /* Now do the same for the memory pages */
++      /* Convert the page + length we got back for the evq to grants. */
++      for (i = 0; i < accel_hw_priv->evq_npages; i++) {
++              rc = net_accel_grant_page(bend->hdev_data, pfn_to_mfn(pfn), 0);
++              if (rc < 0)
++                      goto fail3;
++              accel_hw_priv->evq_mem_gnts[i] = hwinfo->evq_mem_gnts[i] = rc;
++
++              VPRINTK("Got grant %u for evq pfn %x\n", hwinfo->evq_mem_gnts[i], 
++                      pfn);
++              pfn++;
++      }
++
++      return 0;
++
++      /*
++       * Error unwinding: each label revokes what was granted before
++       * the step that failed, in reverse order of creation.
++       */
++ fail3:
++      /* i is the index of the evq page whose grant failed */
++      for (i = i - 1; i >= 0; i--) {
++              ungrant_or_crash(accel_hw_priv->evq_mem_gnts[i], bend->far_end);
++      }
++      ungrant_or_crash(accel_hw_priv->doorbell_gnt, bend->far_end);
++ fail2:
++      ungrant_or_crash(accel_hw_priv->rxdmaq_gnt, bend->far_end);
++ fail1:
++      ungrant_or_crash(accel_hw_priv->txdmaq_gnt, bend->far_end);     
++ fail0:
++      return rc;
++}
++
++
++/*
++ * Falcon A hardware info: the common resources plus one extra grant
++ * for the page that holds the event queue read pointer.  If that
++ * extra grant fails, the grants made by the common code are revoked
++ * and the error is returned.
++ */
++static int ef_bend_hwinfo_falcon_a(struct netback_accel *bend, 
++                                 struct net_accel_hw_falcon_a *hwinfo)
++{
++      struct falcon_bend_accel_priv *priv = bend->accel_hw_priv;
++      int rc, page;
++
++      rc = ef_bend_hwinfo_falcon_common(bend, &hwinfo->common);
++      if (rc != 0)
++              return rc;
++
++      /*
++       * For the other resources the message field is a page number.
++       * evq_rptr is different: it carries the entire address, because
++       * it is currently a pointer into the densely mapped timer page,
++       * so it is shifted down to a page number for the grant.
++       */
++      VPRINTK("Passing evq_rptr pfn %x for rptr %x\n", 
++              hwinfo->common.evq_rptr >> PAGE_SHIFT,
++              hwinfo->common.evq_rptr);
++      rc = net_accel_grant_page(bend->hdev_data,
++                                hwinfo->common.evq_rptr >> PAGE_SHIFT, 0);
++      if (rc >= 0) {
++              priv->evq_rptr_gnt = hwinfo->evq_rptr_gnt = rc;
++              VPRINTK("evq_rptr_gnt got %d\n", hwinfo->evq_rptr_gnt);
++              return 0;
++      }
++
++      /* Grant failed: undo ef_bend_hwinfo_falcon_common() */
++      ungrant_or_crash(priv->txdmaq_gnt, bend->far_end);
++      ungrant_or_crash(priv->rxdmaq_gnt, bend->far_end);
++      ungrant_or_crash(priv->doorbell_gnt, bend->far_end);
++      for (page = 0; page < priv->evq_npages; page++)
++              ungrant_or_crash(priv->evq_mem_gnts[page], bend->far_end);
++
++      return rc;
++}
++
++
++/*
++ * Falcon B hardware info: nothing beyond the common resources is
++ * needed, so this simply forwards to ef_bend_hwinfo_falcon_common()
++ * and returns its result.
++ */
++static int ef_bend_hwinfo_falcon_b(struct netback_accel *bend, 
++                                 struct net_accel_hw_falcon_b *hwinfo)
++{
++      return ef_bend_hwinfo_falcon_common(bend, hwinfo);
++}
++
++
++/*
++ * Describe the hardware resources in the outgoing message according
++ * to the backend's H/W type.  Must be called in state
++ * NETBACK_ACCEL_RES_FILTER; on success the backend advances to
++ * NETBACK_ACCEL_RES_HWINFO.  Returns 0 or the error reported by the
++ * type specific helper.
++ */
++static int netback_accel_hwinfo(struct netback_accel *bend, 
++                              struct net_accel_msg_hw *msgvi)
++{
++      int rc = 0;
++
++      BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_FILTER);
++
++      msgvi->type = bend->hw_type;
++
++      switch (bend->hw_type) {
++      case NET_ACCEL_MSG_HWTYPE_NONE:
++              /* Nothing to do. The slow path should just work. */
++              break;
++      case NET_ACCEL_MSG_HWTYPE_FALCON_A:
++              rc = ef_bend_hwinfo_falcon_a(bend, &msgvi->resources.falcon_a);
++              break;
++      case NET_ACCEL_MSG_HWTYPE_FALCON_B:
++      case NET_ACCEL_MSG_HWTYPE_SIENA_A:
++              /* Siena A shares the Falcon B description */
++              rc = ef_bend_hwinfo_falcon_b(bend, &msgvi->resources.falcon_b);
++              break;
++      }
++
++      if (rc != 0)
++              return rc;
++
++      bend->hw_state = NETBACK_ACCEL_RES_HWINFO;
++      return 0;
++}
++
++
++/*
++ * Allocate hardware resources and make them available to the client
++ * domain.
++ *
++ * The steps are: allocate the VI and event queue (ef_get_vnic), set
++ * up filter management, build the SETHW message describing and
++ * granting the hardware resources, and send that message to the
++ * front end.  If any step fails, the steps already completed are
++ * undone in reverse order and the error is returned; 0 is returned
++ * on success.
++ */
++int netback_accel_setup_vnic_hw(struct netback_accel *bend)
++{
++      struct net_accel_msg msg;
++      int err;
++
++      /* Allocate the event queue, VI and so on. */
++      err = ef_get_vnic(bend);
++      if (err) {
++              /* Adjacent literals are concatenated: keep the space */
++              EPRINTK("Failed to allocate hardware resource for bend: "
++                      "error %d\n", err);
++              return err;
++      }
++
++      /* Set up the filter management */
++      err = netback_accel_filter_init(bend);
++      if (err) {
++              /* Newline terminated so the next log message starts clean */
++              EPRINTK("Filter setup failed, error %d\n", err);
++              ef_free_vnic(bend);
++              return err;
++      }
++
++      net_accel_msg_init(&msg, NET_ACCEL_MSG_SETHW);
++
++      /*
++       * Extract the low-level hardware info we will actually pass to the
++       * other end, and set up the grants/ioremap permissions needed
++       */
++      err = netback_accel_hwinfo(bend, &msg.u.hw);
++
++      if (err != 0) {
++              netback_accel_filter_shutdown(bend);
++              ef_free_vnic(bend);
++              return err;
++      }
++
++      /* Send the message, this is a reply to a hello-reply */
++      err = net_accel_msg_reply_notify(bend->shared_page, 
++                                       bend->msg_channel_irq, 
++                                       &bend->to_domU, &msg);
++
++      /*
++       * The message should succeed as it's logically a reply and we
++       * guarantee space for replies, but a misbehaving frontend
++       * could result in that behaviour, so be tolerant
++       */
++      if (err != 0) {
++              netback_accel_release_hwinfo(bend);
++              netback_accel_filter_shutdown(bend);
++              ef_free_vnic(bend);
++      }
++
++      return err;
++}
++
++
++/*
++ * Free hardware resources.
++ *
++ * bend->hw_state records how far setup got.  The switch enters at that
++ * state and falls through every lower one, so the teardown steps run
++ * in the reverse order of netback_accel_setup_vnic_hw().  Each helper
++ * lowers hw_state by one step, which is what the next helper's own
++ * state check expects.  Any unknown state is a BUG().
++ */
++void netback_accel_shutdown_vnic_hw(struct netback_accel *bend)
++{
++      /*
++       * Only try and release resources if accel_hw_priv was setup,
++       * otherwise there is nothing to do as we're on "null-op"
++       * acceleration
++       */
++      switch (bend->hw_state) {
++      case NETBACK_ACCEL_RES_HWINFO:
++              /* Revoke the grants handed to the front end */
++              VPRINTK("Release hardware resources\n");
++              netback_accel_release_hwinfo(bend);
++              /* deliberate drop through */
++      case NETBACK_ACCEL_RES_FILTER:          
++              VPRINTK("Free filters...\n");
++              netback_accel_filter_shutdown(bend);
++              /* deliberate drop through */
++      case NETBACK_ACCEL_RES_ALLOC:
++              VPRINTK("Free vnic...\n");
++              ef_free_vnic(bend);
++              /* deliberate drop through */
++      case NETBACK_ACCEL_RES_NONE:
++              break;
++      default:
++              BUG();
++      }
++}
++
++/**************************************************************************
++ * 
++ * Buffer table stuff
++ *
++ **************************************************************************/
++
++/*
++ * Roll back a partially completed netback_accel_add_buffers() call:
++ * unmap the i most recently mapped device pages, newest first, and
++ * wind bend->buffer_maps_index back accordingly.
++ */
++static inline void buffer_map_cleanup(struct netback_accel *bend, int i)
++{
++      int slot;
++
++      for (; i > 0; i--) {
++              slot = --bend->buffer_maps_index;
++              net_accel_unmap_device_page(bend->hdev_data,
++                                          bend->buffer_maps[slot],
++                                          bend->buffer_addrs[slot]);
++      }
++}
++
++
++/*
++ * Map a batch of pages granted by the front end and enter them in the
++ * NIC buffer table.
++ *
++ * bend:         backend instance
++ * pages:        number of entries in grants; must be between 0 and
++ *               NET_ACCEL_MSG_MAX_PAGE_REQ
++ * log2_pages:   not used by this function
++ * grants:       grant references of the pages to map
++ * buf_addr_out: receives the result of efx_vi_dma_get_map_addr() for
++ *               the new DMA mapping
++ *
++ * Returns 0 on success, -EINVAL if a limit would be exceeded, or the
++ * error from net_accel_map_device_page() / efx_vi_dma_map_addrs().
++ * On failure, pages mapped by this call are unmapped again.
++ */
++int netback_accel_add_buffers(struct netback_accel *bend, int pages, int log2_pages,
++                            u32 *grants, u32 *buf_addr_out)
++{
++      struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
++      unsigned long long addr_array[NET_ACCEL_MSG_MAX_PAGE_REQ];
++      int rc, i, index;
++      u64 dev_bus_addr;
++
++      /*
++       * addr_array lives on the stack and holds at most
++       * NET_ACCEL_MSG_MAX_PAGE_REQ entries, so refuse anything larger
++       * (or negative) here rather than relying on every caller.
++       */
++      if (pages < 0 || pages > NET_ACCEL_MSG_MAX_PAGE_REQ) {
++              EPRINTK("%s: bad page count in request: %d\n",
++                      __FUNCTION__, pages);
++              return -EINVAL;
++      }
++
++      /* Make sure we can't overflow the dma_maps array */
++      if (accel_hw_priv->dma_maps_index >= 
++          bend->max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ) {
++              EPRINTK("%s: too many buffer table allocations: %d %d\n",
++                      __FUNCTION__, accel_hw_priv->dma_maps_index, 
++                      bend->max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ);
++              return -EINVAL;
++      }
++
++      /* Make sure we can't overflow the buffer_maps array */
++      if (bend->buffer_maps_index + pages > bend->max_pages) {
++              EPRINTK("%s: too many pages mapped: %d + %d > %d\n", 
++                      __FUNCTION__, bend->buffer_maps_index,
++                      pages, bend->max_pages);
++              return -EINVAL;
++      }
++
++      /* Map each granted page and collect its bus address */
++      for (i = 0; i < pages; i++) {
++              VPRINTK("%s: mapping page %d\n", __FUNCTION__, i);
++              rc = net_accel_map_device_page
++                      (bend->hdev_data, grants[i],
++                       &bend->buffer_maps[bend->buffer_maps_index],
++                       &dev_bus_addr);
++
++              if (rc != 0) {
++                      EPRINTK("error in net_accel_map_device_page\n");
++                      /* Pages 0..i-1 were mapped; undo exactly those */
++                      buffer_map_cleanup(bend, i);
++                      return rc;
++              }
++
++              bend->buffer_addrs[bend->buffer_maps_index] = dev_bus_addr;
++
++              bend->buffer_maps_index++;
++
++              addr_array[i] = dev_bus_addr;
++      }
++
++      VPRINTK("%s: mapping dma addresses to vih %p\n", __FUNCTION__, 
++              accel_hw_priv->efx_vih);
++
++      index = accel_hw_priv->dma_maps_index;
++      if ((rc = efx_vi_dma_map_addrs(accel_hw_priv->efx_vih, addr_array, pages,
++                                     &(accel_hw_priv->dma_maps[index]))) < 0) {
++              EPRINTK("error in dma_map_pages\n");
++              /* i == pages here, so every page mapped above is undone */
++              buffer_map_cleanup(bend, i);
++              return rc;
++      }
++
++      accel_hw_priv->dma_maps_index++;
++      NETBACK_ACCEL_STATS_OP(bend->stats.num_buffer_pages += pages);
++
++      *buf_addr_out = efx_vi_dma_get_map_addr(accel_hw_priv->efx_vih, 
++                                              accel_hw_priv->dma_maps[index]);
++
++      return 0;
++}
++
++
++/*
++ * Release everything netback_accel_add_buffers() set up: reset the
++ * VI, drop all DMA mappings and unmap all granted pages, newest
++ * first.  Does nothing when no hardware state was ever allocated.
++ * Always returns 0.
++ */
++int netback_accel_remove_buffers(struct netback_accel *bend)
++{
++      struct falcon_bend_accel_priv *priv;
++      int slot;
++
++      /* Only try to free buffers if accel_hw_priv was setup */
++      if (bend->hw_state == NETBACK_ACCEL_RES_NONE)
++              return 0;
++
++      priv = bend->accel_hw_priv;
++
++      efx_vi_reset(priv->efx_vih);
++
++      while (priv->dma_maps_index > 0) {
++              slot = --priv->dma_maps_index;
++              efx_vi_dma_unmap_addrs(priv->efx_vih, priv->dma_maps[slot]);
++      }
++
++      while (bend->buffer_maps_index > 0) {
++              VPRINTK("Unmapping granted buffer %d\n", 
++                      bend->buffer_maps_index);
++              slot = --bend->buffer_maps_index;
++              net_accel_unmap_device_page(bend->hdev_data,
++                                          bend->buffer_maps[slot],
++                                          bend->buffer_addrs[slot]);
++      }
++
++      NETBACK_ACCEL_STATS_OP(bend->stats.num_buffer_pages = 0);
++
++      return 0;
++}
++
++/**************************************************************************
++ * 
++ * Filter stuff
++ *
++ **************************************************************************/
++
++/*
++ * Set up filter management for a backend in state
++ * NETBACK_ACCEL_RES_ALLOC: initialise the lock and the hash table,
++ * allocate one zeroed filter spec per permitted filter, mark every
++ * filter slot as free and derive the index mask used when picking a
++ * victim.  On success the backend advances to
++ * NETBACK_ACCEL_RES_FILTER and 0 is returned; otherwise the error
++ * from cuckoo_hash_init() or -ENOMEM is returned.
++ */
++static int netback_accel_filter_init(struct netback_accel *bend)
++{
++      struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
++      int i, rc;
++
++      BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_ALLOC);
++
++      spin_lock_init(&accel_hw_priv->filter_lock);
++
++      if ((rc = cuckoo_hash_init(&accel_hw_priv->filter_hash_table, 
++                                 5 /* space for 32 filters */, 8)) != 0) {
++              EPRINTK("Failed to initialise filter hash table\n");
++              return rc;
++      }
++
++      /* kcalloc zeroes the array and checks count * size for overflow */
++      accel_hw_priv->fspecs = kcalloc(bend->quotas.max_filters,
++                                      sizeof(struct netback_accel_filter_spec),
++                                      GFP_KERNEL);
++
++      if (accel_hw_priv->fspecs == NULL) {
++              EPRINTK("No memory for filter specs.\n");
++              cuckoo_hash_destroy(&accel_hw_priv->filter_hash_table);
++              return -ENOMEM;
++      }
++
++      /*
++       * free_filters is used as an unsigned long bitmap (see the
++       * __ffs()/clear_bit() users), so build the bits as unsigned
++       * long: an int "1 << 31" would overflow and sign-extend into
++       * the upper half of the word.
++       */
++      for (i = 0; i < bend->quotas.max_filters; i++) {
++              accel_hw_priv->free_filters |= (1UL << i);
++      }
++
++      /* Base mask on highest set bit in max_filters  */
++      accel_hw_priv->filter_idx_mask = (1 << fls(bend->quotas.max_filters)) - 1;
++      VPRINTK("filter setup: max is %x mask is %x\n",
++              bend->quotas.max_filters, accel_hw_priv->filter_idx_mask);
++
++      bend->hw_state = NETBACK_ACCEL_RES_FILTER;
++
++      return 0;
++}
++
++
++/*
++ * Build the hash table key for a filter spec from its destination IP
++ * address, destination port and protocol.  The spec's other fields
++ * (for example the MAC address) are not part of the key.
++ */
++static inline void make_filter_key(cuckoo_hash_ip_key *key,  
++                                 struct netback_accel_filter_spec *filt)
++
++{
++      key->local_ip = filt->destip_be;
++      key->local_port = filt->destport_be;
++      key->proto = filt->proto;
++}
++
++
++/*
++ * Stop the hardware filter in the given slot and remove its key from
++ * the hash table.  Slots marked free are left alone.  The free bit
++ * itself is not changed here.  A key that cannot be found in the
++ * table is treated as a BUG().
++ */
++static inline 
++void netback_accel_free_filter(struct falcon_bend_accel_priv *accel_hw_priv,
++                             int filter)
++{
++      struct netback_accel_filter_spec *spec;
++      cuckoo_hash_ip_key key;
++
++      /* Slot is free, so there is nothing to stop or unhash */
++      if (accel_hw_priv->free_filters & (1 << filter))
++              return;
++
++      spec = &accel_hw_priv->fspecs[filter];
++
++      efx_vi_filter_stop(accel_hw_priv->efx_vih, spec->filter_handle);
++
++      make_filter_key(&key, spec);
++      if (cuckoo_hash_remove(&accel_hw_priv->filter_hash_table,
++                             (cuckoo_hash_key *)&key)) {
++              EPRINTK("%s: Couldn't find filter to remove from table\n",
++                      __FUNCTION__);
++              BUG();
++      }
++}
++
++
++/*
++ * Tear down filter management: stop and unhash every filter that is
++ * still installed, free the spec array and the hash table, and step
++ * the backend back to NETBACK_ACCEL_RES_ALLOC.
++ */
++static void netback_accel_filter_shutdown(struct netback_accel *bend)
++{
++      struct falcon_bend_accel_priv *priv = bend->accel_hw_priv;
++      unsigned long irq_flags;
++      int slot;
++
++      BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_FILTER);
++
++      spin_lock_irqsave(&priv->filter_lock, irq_flags);
++
++      BUG_ON(priv->fspecs == NULL);
++
++      /* Free slots are skipped inside netback_accel_free_filter() */
++      slot = 0;
++      while (slot < bend->quotas.max_filters) {
++              netback_accel_free_filter(priv, slot);
++              slot++;
++      }
++
++      kfree(priv->fspecs);
++      priv->fspecs = NULL;
++      priv->free_filters = 0;
++
++      cuckoo_hash_destroy(&priv->filter_hash_table);
++
++      spin_unlock_irqrestore(&priv->filter_lock, irq_flags);
++
++      bend->hw_state = NETBACK_ACCEL_RES_ALLOC;
++}
++
++
++/*! Suggest a filter to replace when we want to insert a new one and have
++ *  none free.
++ *
++ *  Replacement is random: cheap, and with few pathological worst
++ *  cases.  The randomness comes from the cycle counter.
++ */
++static unsigned get_victim_filter(struct netback_accel *bend)
++{
++      struct falcon_bend_accel_priv *priv = bend->accel_hw_priv;
++      unsigned tsc, victim;
++
++      rdtscl(tsc);
++
++      /*
++       * The bottom few bits of the counter are of doubtful quality,
++       * so discard them before masking down to the index range.
++       */
++      victim = (tsc >> 4) & priv->filter_idx_mask;
++
++      /*
++       * The number of filters need not be a power of two, but the
++       * mask leaves us at most one subtraction away from a valid
++       * index.
++       */
++      if (victim >= bend->quotas.max_filters)
++              victim -= bend->quotas.max_filters;
++
++      DPRINTK("backend %s->%d has no free filters. Filter %d will be evicted\n",
++              bend->nicname, bend->far_end, victim);
++
++      return victim;
++}
++
++
++/*
++ * Add a filter for the specified IP/port to the backend.
++ *
++ * filt->proto must be IPPROTO_TCP or IPPROTO_UDP (anything else is a
++ * BUG()).  The whole operation runs under filter_lock.
++ *
++ * Returns the index of the filter slot used on success.  Returns -1
++ * if a filter with the same key is already in the table or if the
++ * hardware filter could not be inserted, and the cuckoo_hash_add()
++ * error code if the key could not be added to the hash table.
++ *
++ * When no slot is free, an existing filter chosen by
++ * get_victim_filter() is stopped and replaced.
++ */
++int 
++netback_accel_filter_check_add(struct netback_accel *bend, 
++                             struct netback_accel_filter_spec *filt)
++{
++      struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
++      struct netback_accel_filter_spec *fs;
++      unsigned filter_index;
++      unsigned long flags;
++      int rc, recycling = 0;
++      cuckoo_hash_ip_key filter_key, evict_key;
++
++      BUG_ON(filt->proto != IPPROTO_TCP && filt->proto != IPPROTO_UDP);
++
++      DPRINTK("Will add %s filter for dst ip %08x and dst port %d\n", 
++              (filt->proto == IPPROTO_TCP) ? "TCP" : "UDP",
++              be32_to_cpu(filt->destip_be), be16_to_cpu(filt->destport_be));
++
++      spin_lock_irqsave(&accel_hw_priv->filter_lock, flags);
++      /*
++       * Check to see if we're already filtering this IP address and
++       * port. Happens if you insert a filter mid-stream as there
++       * are many packets backed up to be delivered to dom0 already
++       */
++      make_filter_key(&filter_key, filt);
++      if (cuckoo_hash_lookup(&accel_hw_priv->filter_hash_table, 
++                             (cuckoo_hash_key *)(&filter_key), 
++                             &filter_index)) {
++              DPRINTK("Found matching filter %d already in table\n", 
++                      filter_index);
++              rc = -1;
++              goto out;
++      }
++
++      /* Pick a slot: the lowest free one, or a victim if none is free */
++      if (accel_hw_priv->free_filters == 0) {
++              filter_index = get_victim_filter(bend);
++              recycling = 1;
++      } else {
++              filter_index = __ffs(accel_hw_priv->free_filters);
++              clear_bit(filter_index, &accel_hw_priv->free_filters);
++      }
++
++      fs = &accel_hw_priv->fspecs[filter_index];
++
++      if (recycling) {
++              /* Evict the old occupant from the NIC and the hash table */
++              DPRINTK("Removing filter index %d handle %p\n", filter_index,
++                      fs->filter_handle);
++
++              /* A failure to stop the old filter is logged but not fatal */
++              if ((rc = efx_vi_filter_stop(accel_hw_priv->efx_vih, 
++                                           fs->filter_handle)) != 0) {
++                      EPRINTK("Couldn't clear NIC filter table entry %d\n", rc);
++              }
++
++              make_filter_key(&evict_key, fs);
++              if (cuckoo_hash_remove(&accel_hw_priv->filter_hash_table,
++                                     (cuckoo_hash_key *)&evict_key)) {
++                      EPRINTK("Couldn't find filter to remove from table\n");
++                      BUG();
++              }
++              NETBACK_ACCEL_STATS_OP(bend->stats.num_filters--);
++      }
++
++      /* Update the filter spec with new details */
++      *fs = *filt;
++
++      if ((rc = cuckoo_hash_add(&accel_hw_priv->filter_hash_table, 
++                                (cuckoo_hash_key *)&filter_key, filter_index,
++                                1)) != 0) {
++              EPRINTK("Error (%d) adding filter to table\n", rc);
++              /* Give the slot back; nothing is installed in it now */
++              accel_hw_priv->free_filters |= (1 << filter_index);
++              goto out;
++      }
++
++      /* On success the hardware filter handle is stored in the spec */
++      rc = efx_vi_filter(accel_hw_priv->efx_vih, filt->proto, filt->destip_be,
++                         filt->destport_be, 
++                         (struct filter_resource_t **)&fs->filter_handle);
++
++      if (rc != 0) {
++              EPRINTK("Hardware filter insertion failed. Error %d\n", rc);
++              /* Undo the hash table entry and release the slot */
++              accel_hw_priv->free_filters |= (1 << filter_index);
++              cuckoo_hash_remove(&accel_hw_priv->filter_hash_table, 
++                                 (cuckoo_hash_key *)&filter_key);
++              rc = -1;
++              goto out;
++      }
++
++      NETBACK_ACCEL_STATS_OP(bend->stats.num_filters++);
++
++      VPRINTK("%s: success index %d handle %p\n", __FUNCTION__, filter_index, 
++              fs->filter_handle);
++
++      rc = filter_index;
++ out:
++      spin_unlock_irqrestore(&accel_hw_priv->filter_lock, flags);
++      return rc;
++}
++
++
++/*
++ * Remove the filter installed in slot filter_index: stop it in
++ * hardware, drop its hash table entry, mark the slot free and keep
++ * the filter count statistic in step.  The slot must currently be in
++ * use (a free slot is a BUG()).  The caller in this file holds
++ * filter_lock.
++ */
++static void netback_accel_filter_remove(struct netback_accel *bend, 
++                                      int filter_index)
++{
++      struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
++
++      BUG_ON(accel_hw_priv->free_filters & (1 << filter_index));
++      netback_accel_free_filter(accel_hw_priv, filter_index);
++      accel_hw_priv->free_filters |= (1 << filter_index);
++      /*
++       * num_filters is bumped on every successful add and dropped on
++       * eviction; drop it on explicit removal too so it does not drift.
++       */
++      NETBACK_ACCEL_STATS_OP(bend->stats.num_filters--);
++}
++
++
++/*
++ * Remove the filter that matches the given spec, if there is one.
++ * The spec is looked up by key in the hash table and then compared
++ * field by field (including the MAC address) with the stored spec;
++ * the filter is only removed when everything matches.
++ */
++void netback_accel_filter_remove_spec(struct netback_accel *bend, 
++                                    struct netback_accel_filter_spec *filt)
++{
++      struct falcon_bend_accel_priv *priv = bend->accel_hw_priv;
++      struct netback_accel_filter_spec *entry;
++      cuckoo_hash_ip_key key;
++      unsigned long irq_flags;
++      unsigned slot;
++
++      switch (filt->proto) {
++      case IPPROTO_TCP:
++              DPRINTK("Remove TCP filter for dst ip %08x and dst port %d\n",
++                      be32_to_cpu(filt->destip_be),
++                      be16_to_cpu(filt->destport_be));
++              break;
++      case IPPROTO_UDP:
++              DPRINTK("Remove UDP filter for dst ip %08x and dst port %d\n",
++                      be32_to_cpu(filt->destip_be),
++                      be16_to_cpu(filt->destport_be));
++              break;
++      default:
++              /*
++               * An evil frontend could provoke this, so it must not
++               * BUG(); it is harmless because the checks below
++               * should fail for such a spec.
++               */
++              DPRINTK("Non-TCP/UDP filter dst ip %08x and dst port %d\n",
++                      be32_to_cpu(filt->destip_be),
++                      be16_to_cpu(filt->destport_be));
++              break;
++      }
++
++      spin_lock_irqsave(&priv->filter_lock, irq_flags);
++
++      make_filter_key(&key, filt);
++      if (!cuckoo_hash_lookup(&priv->filter_hash_table,
++                              (cuckoo_hash_key *)&key, &slot)) {
++              EPRINTK("Couldn't find matching filter already in table\n");
++              goto unlock;
++      }
++
++      /* Full comparison, in case the lookup hit a hash collision */
++      entry = &priv->fspecs[slot];
++      if (entry->destip_be != filt->destip_be ||
++          entry->destport_be != filt->destport_be ||
++          entry->proto != filt->proto ||
++          memcmp(entry->mac, filt->mac, ETH_ALEN) != 0) {
++              EPRINTK("Entry in hash table does not match filter spec\n");
++              goto unlock;
++      }
++
++      netback_accel_filter_remove(bend, slot);
++
++ unlock:
++      spin_unlock_irqrestore(&priv->filter_lock, irq_flags);
++}
diff --cc drivers/xen/sfc_netback/accel_solarflare.h

index 0000000,0000000..84d2146

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_solarflare.h
@@@ -1,0 -1,0 +1,88 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#ifndef NETBACK_ACCEL_SOLARFLARE_H
++#define NETBACK_ACCEL_SOLARFLARE_H
++
++#include "accel.h"
++#include "accel_msg_iface.h"
++
++#include "driverlink_api.h"
++
++#define MAX_NICS 5
++#define MAX_PORTS 2
++
++
++extern int netback_accel_sf_init(void);
++extern void netback_accel_sf_shutdown(void);
++extern int netback_accel_sf_hwtype(struct netback_accel *bend);
++
++extern int netback_accel_sf_char_init(void);
++extern void netback_accel_sf_char_shutdown(void);
++
++extern int netback_accel_setup_vnic_hw(struct netback_accel *bend);
++extern void netback_accel_shutdown_vnic_hw(struct netback_accel *bend);
++
++extern int netback_accel_add_buffers(struct netback_accel *bend, int pages, 
++                                   int log2_pages, u32 *grants,
++                                   u32 *buf_addr_out);
++extern int netback_accel_remove_buffers(struct netback_accel *bend);
++
++
++/* Add a filter for the specified IP/port to the backend */
++extern int
++netback_accel_filter_check_add(struct netback_accel *bend, 
++                             struct netback_accel_filter_spec *filt);
++/* Remove a filter entry for the specific device and IP/port */
++extern
++void netback_accel_filter_remove_index(struct netback_accel *bend, 
++                                     int filter_index);
++extern
++void netback_accel_filter_remove_spec(struct netback_accel *bend, 
++                                    struct netback_accel_filter_spec *filt);
++
++/*
++ * This is designed to look a bit like a skb: a minimal packet
++ * descriptor holding header pointers and a protocol value.
++ */
++struct netback_pkt_buf {
++      /* NOTE(review): presumably the start of the link-layer header */
++      union {
++              unsigned char *raw;
++      } mac;
++      /* One header pointer, viewable as IP header, ARP header or bytes */
++      union {
++              struct iphdr  *iph;
++              struct arphdr *arph;
++              unsigned char *raw;
++      } nh;
++      /* NOTE(review): presumably selects which nh view applies - confirm */
++      int protocol;
++};
++
++/*! \brief Handle a received packet: insert fast path filters as necessary
++ * \param skb The packet buffer
++ */
++extern void netback_accel_rx_packet(struct netback_pkt_buf *skb, void *fwd_priv);
++
++/*! \brief Handle a transmitted packet: update fast path filters as necessary
++ * \param skb The packet buffer
++ */
++extern void netback_accel_tx_packet(struct sk_buff *skb, void *fwd_priv);
++
++#endif /* NETBACK_ACCEL_SOLARFLARE_H */
diff --cc drivers/xen/sfc_netback/accel_xenbus.c

index 0000000,0000000..4fb82d8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_xenbus.c
@@@ -1,0 -1,0 +1,831 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <xen/evtchn.h>
++#include <linux/mutex.h>
++#include <linux/delay.h>
++
++/* drivers/xen/netback/common.h */
++#include "common.h"
++
++#include "accel.h"
++#include "accel_solarflare.h"
++#include "accel_util.h"
++
++#define NODENAME_PATH_FMT "backend/vif/%d/%d"
++
++/*
++ * Map a xenbus_device to its netback_accel state, which is kept in the
++ * netback_accel_priv field of the backend_info stored as the device's
++ * driver data.  The whole expansion is parenthesised so the macro can be
++ * used safely inside a larger expression, e.g. MACRO(dev)->field.
++ */
++#define NETBACK_ACCEL_FROM_XENBUS_DEVICE(_dev) ((struct netback_accel *) \
++      ((struct backend_info *)dev_get_drvdata(&(_dev)->dev))->netback_accel_priv)
++
++/* List of all the bends currently in existence. */
++struct netback_accel *bend_list = NULL;
++DEFINE_MUTEX(bend_list_mutex);
++
++/*
++ * Push bend onto the head of the global bend_list.
++ * The caller must hold bend_list_mutex.
++ */
++static void link_bend(struct netback_accel *bend)
++{
++      struct netback_accel *old_head = bend_list;
++
++      bend->next_bend = old_head;
++      bend_list = bend;
++}
++
++/*
++ * Unlink bend from the global bend_list.  A bend that is not on the list
++ * is left untouched.  The caller must hold bend_list_mutex.
++ */
++static void unlink_bend(struct netback_accel *bend)
++{
++      struct netback_accel **link;
++
++      /* Walk the link pointers themselves so the head needs no special case */
++      for (link = &bend_list; *link != NULL; link = &(*link)->next_bend) {
++              if (*link != bend)
++                      continue;
++              *link = bend->next_bend;
++              return;
++      }
++}
++
++
++/*
++ * IRQ handler for the message event channel.  The context is the
++ * xenbus_device the channel was bound for; the handler looks up the
++ * matching bend and defers the real work to its handle_msg work item.
++ */
++static irqreturn_t msgirq_from_frontend(int irq, void *context)
++{
++      struct xenbus_device *xdev = context;
++      struct netback_accel *bend;
++
++      bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(xdev);
++      VPRINTK("irq %d from device %s\n", irq, xdev->nodename);
++      schedule_work(&bend->handle_msg);
++
++      return IRQ_HANDLED;
++}
++
++
++/*
++ * Demultiplex an IRQ from the frontend driver.  This is never used
++ * functionally, but we need it to pass to the bind function, and it may
++ * get called spuriously.
++ *
++ * The context is the xenbus_device supplied at bind time; it is only
++ * used for the verbose trace.  Always returns IRQ_HANDLED.
++ */
++static irqreturn_t netirq_from_frontend(int irq, void *context)
++{
++      VPRINTK("netirq %d from device %s\n", irq,
++              ((struct xenbus_device *)context)->nodename);
++      
++      return IRQ_HANDLED;
++}
++
++
++/*
++ * Load the resource limits for this bend from the "limits/..." keys
++ * below the backend's xenbus node.  If they cannot be gathered, the
++ * default values are applied instead.
++ */
++static 
++void cfg_hw_quotas(struct xenbus_device *dev, struct netback_accel *bend)
++{
++      int rc;
++
++      rc = xenbus_gather(XBT_NIL, dev->nodename,
++                         "limits/max-filters", "%d",
++                         &bend->quotas.max_filters,
++                         "limits/max-buf-pages", "%d",
++                         &bend->quotas.max_buf_pages,
++                         "limits/max-mcasts", "%d",
++                         &bend->quotas.max_mcasts,
++                         NULL);
++      if (rc == 0)
++              return;
++
++      /*
++       * TODO: if the user had set these earlier, they are overwritten
++       * with the defaults here.  That may not be what we want, but it
++       * is useful in the startup case.
++       */
++      DPRINTK("Failed to read quotas from xenbus, using defaults\n");
++      bend->quotas.max_filters = NETBACK_ACCEL_DEFAULT_MAX_FILTERS;
++      bend->quotas.max_buf_pages = sfc_netback_max_pages;
++      bend->quotas.max_mcasts = NETBACK_ACCEL_DEFAULT_MAX_MCASTS;
++}
++
++
++/*
++ * Xenbus watch callback for the backend's "limits" node: re-reads the
++ * hardware quotas when the configuration changes.  All work is done
++ * under bend_mutex; nothing happens if the watch has no node or the
++ * watched path no longer exists.
++ */
++static void bend_config_accel_change(struct xenbus_watch *watch,
++                                   const char **vec, unsigned int len)
++{
++      struct netback_accel *bend =
++              container_of(watch, struct netback_accel, config_accel_watch);
++      struct xenbus_device *dev;
++
++      mutex_lock(&bend->bend_mutex);
++
++      if (bend->config_accel_watch.node == NULL)
++              goto unlock;
++
++      dev = (struct xenbus_device *)bend->hdev_data;
++      DPRINTK("Watch matched, got dev %p otherend %p\n",
++              dev, dev->otherend);
++
++      if (xenbus_exists(XBT_NIL, watch->node, ""))
++              cfg_hw_quotas(dev, bend);
++      else
++              DPRINTK("Ignoring watch as otherend seems invalid\n");
++
++ unlock:
++      mutex_unlock(&bend->bend_mutex);
++}
++
++
++/*
++ * Register a xenbus watch on "<nodename>/limits" so that we learn when
++ * the configuration has been set.
++ *
++ * Returns 0 on success.  On failure the watch's node pointer is cleared
++ * and the error from xenbus_watch_path2() is returned.
++ */
++static int setup_config_accel_watch(struct xenbus_device *dev,
++                                  struct netback_accel *bend)
++{
++      int rc;
++
++      VPRINTK("Setting watch on %s/%s\n", dev->nodename, "limits");
++
++      rc = xenbus_watch_path2(dev, dev->nodename, "limits",
++                              &bend->config_accel_watch,
++                              bend_config_accel_change);
++      if (rc == 0)
++              return 0;
++
++      EPRINTK("%s: Failed to register xenbus watch: %d\n",
++              __FUNCTION__, rc);
++      bend->config_accel_watch.node = NULL;
++      return rc;
++}
++
++
++/*
++ * Fetch the frontend's event channels and shared-memory grant
++ * references from its xenbus node.
++ *
++ * grants[0] receives "accel-ctrl-page" and grants[1] "accel-msg-page";
++ * the channels are stored in bend.  Returns 0 on success or the error
++ * from xenbus_gather().
++ */
++static int 
++cfg_frontend_info(struct xenbus_device *dev, struct netback_accel *bend,
++                int *grants)
++{
++      int rc;
++
++      rc = xenbus_gather(XBT_NIL, dev->otherend,
++                         "accel-msg-channel", "%u", &bend->msg_channel,
++                         "accel-ctrl-page", "%d", &grants[0],
++                         "accel-msg-page", "%d", &grants[1],
++                         "accel-net-channel", "%u", &bend->net_channel,
++                         NULL);
++      if (rc) {
++              EPRINTK("failed to read event channels or shmem grant: %d\n",
++                      rc);
++              return rc;
++      }
++
++      DPRINTK("got event chan %d and net chan %d from frontend\n",
++              bend->msg_channel, bend->net_channel);
++      return 0;
++}
++
++
++/* Setup all the comms needed to chat with the front end driver */
++static int setup_vnic(struct xenbus_device *dev)
++{
++      struct netback_accel *bend;
++      int grants[2], err, msgs_per_queue;
++
++      bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
++
++      err = cfg_frontend_info(dev, bend, grants);
++      if (err)
++              goto fail1;
++
++      /*
++       * If we get here, both frontend Connected and configuration
++       * options available.  All is well.
++       */
++
++      /* Get the hardware quotas for the VNIC in question.  */
++      cfg_hw_quotas(dev, bend);
++
++      /* Set up the deferred work handlers */
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++      INIT_WORK(&bend->handle_msg, 
++                netback_accel_msg_rx_handler);
++#else
++      INIT_WORK(&bend->handle_msg, 
++                netback_accel_msg_rx_handler,
++                (void*)bend);
++#endif
++
++      /* Request the frontend mac */
++      err = net_accel_xen_net_read_mac(dev, bend->mac);
++      if (err)
++              goto fail2;
++
++      /* Set up the shared page. */
++      bend->shared_page = net_accel_map_grants_contig(dev, grants, 2, 
++                                                      &bend->sh_pages_unmap);
++
++      if (bend->shared_page == NULL) {
++              EPRINTK("failed to map shared page for %s\n", dev->otherend);
++              err = -ENOMEM;
++              goto fail2;
++      }
++
++      /* Initialise the shared page(s) used for comms */
++      net_accel_msg_init_page(bend->shared_page, PAGE_SIZE, 
++                              (bend->net_dev->flags & IFF_UP) && 
++                              (netif_carrier_ok(bend->net_dev)));
++
++      msgs_per_queue = (PAGE_SIZE/2) / sizeof(struct net_accel_msg);
++
++      net_accel_msg_init_queue
++              (&bend->to_domU, &bend->shared_page->queue0,
++               (struct net_accel_msg *)((__u8*)bend->shared_page + PAGE_SIZE),
++               msgs_per_queue);
++
++      net_accel_msg_init_queue
++              (&bend->from_domU, &bend->shared_page->queue1, 
++               (struct net_accel_msg *)((__u8*)bend->shared_page + 
++                                        (3 * PAGE_SIZE / 2)),
++               msgs_per_queue);
++
++      /* Bind the message event channel to a handler
++       *
++       * Note that we will probably get a spurious interrupt when we
++       * do this, so it must not be done until we have set up
++       * everything we need to handle it.
++       */
++      err = bind_interdomain_evtchn_to_irqhandler(dev->otherend_id,
++                                                  bend->msg_channel,
++                                                  msgirq_from_frontend,
++                                                  0,
++                                                  "netback_accel",
++                                                  dev);
++      if (err < 0) {
++              EPRINTK("failed to bind event channel: %d\n", err);
++              goto fail3;
++      }
++      else
++              bend->msg_channel_irq = err;
++
++      /* TODO: No need to bind this evtchn to an irq. */
++      err = bind_interdomain_evtchn_to_irqhandler(dev->otherend_id,
++                                                  bend->net_channel,
++                                                  netirq_from_frontend,
++                                                  0,
++                                                  "netback_accel",
++                                                  dev);
++      if (err < 0) {
++              EPRINTK("failed to bind net channel: %d\n", err);
++              goto fail4;
++      }  
++      else
++              bend->net_channel_irq = err;
++
++      /*
++       * Grab ourselves an entry in the forwarding hash table. We do
++       * this now so we don't have the embarassmesnt of sorting out
++       * an allocation failure while at IRQ. Because we pass NULL as
++       * the context, the actual hash lookup will succeed for this
++       * NIC, but the check for somewhere to forward to will
++       * fail. This is necessary to prevent forwarding before
++       * hardware resources are set up
++       */
++      err = netback_accel_fwd_add(bend->mac, NULL, bend->fwd_priv);
++      if (err) {
++              EPRINTK("failed to add to fwd hash table\n");
++              goto fail5;
++      }
++
++      /*
++       * Say hello to frontend.  Important to do this straight after
++       * obtaining the message queue as otherwise we are vulnerable
++       * to an evil frontend sending a HELLO-REPLY before we've sent
++       * the HELLO and confusing us
++       */
++      netback_accel_msg_tx_hello(bend, NET_ACCEL_MSG_VERSION);
++      return 0;
++
++ fail5:
++      unbind_from_irqhandler(bend->net_channel_irq, dev);
++ fail4:
++      unbind_from_irqhandler(bend->msg_channel_irq, dev);
++ fail3:
++      net_accel_unmap_grants_contig(dev, bend->sh_pages_unmap);
++      bend->shared_page = NULL;
++      bend->sh_pages_unmap = NULL;
++ fail2:
++ fail1:
++      return err;
++}
++
++
++/*
++ * Read the "accel" key from the backend's xenbus node; it names the NIC
++ * used to select the interface for acceleration.
++ *
++ * On success bend->nicname holds the buffer returned by xenbus_read()
++ * (released with kfree() by the callers in this file) and 0 is
++ * returned.  On failure bend->nicname is set to NULL, so an ERR_PTR
++ * value is never left in the structure, and the error is returned.
++ */
++static int read_nicname(struct xenbus_device *dev, struct netback_accel *bend)
++{
++      void *name;
++      int len;
++
++      /* nic name used to select interface used for acceleration */
++      name = xenbus_read(XBT_NIL, dev->nodename, "accel", &len);
++      if (IS_ERR(name)) {
++              bend->nicname = NULL;
++              return PTR_ERR(name);
++      }
++
++      bend->nicname = name;
++      return 0;
++}
++
++static const char *frontend_name = "sfc_netfront";
++
++/*
++ * Write the frontend driver name to "<nodename>/accel-frontend" inside
++ * a xenbus transaction, retrying for as long as the commit reports
++ * -EAGAIN.  Returns 0 on success or the first error encountered.
++ */
++static int publish_frontend_name(struct xenbus_device *dev)
++{
++      struct xenbus_transaction xbt;
++      int rc;
++
++      for (;;) {
++              rc = xenbus_transaction_start(&xbt);
++              if (rc) {
++                      EPRINTK("%s: transaction start failed\n", __FUNCTION__);
++                      return rc;
++              }
++
++              rc = xenbus_printf(xbt, dev->nodename, "accel-frontend",
++                                 "%s", frontend_name);
++              if (rc) {
++                      EPRINTK("%s: xenbus_printf failed\n", __FUNCTION__);
++                      /* Abort the transaction before bailing out */
++                      xenbus_transaction_end(xbt, 1);
++                      return rc;
++              }
++
++              rc = xenbus_transaction_end(xbt, 0);
++              if (rc != -EAGAIN)
++                      break;
++      }
++
++      if (rc)
++              EPRINTK("failed to end frontend name transaction\n");
++
++      return rc;
++}
++
++
++/*
++ * Remove "<nodename>/accel-frontend" inside a xenbus transaction,
++ * retrying for as long as the commit reports -EAGAIN.  Returns the
++ * result of the last xenbus operation performed (0 on success).
++ */
++static int unpublish_frontend_name(struct xenbus_device *dev)
++{
++      struct xenbus_transaction xbt;
++      int rc;
++
++      for (;;) {
++              rc = xenbus_transaction_start(&xbt);
++              if (rc)
++                      return rc;
++
++              rc = xenbus_rm(xbt, dev->nodename, "accel-frontend");
++              if (rc) {
++                      /* Abort the transaction before giving up */
++                      xenbus_transaction_end(xbt, 1);
++                      return rc;
++              }
++
++              rc = xenbus_transaction_end(xbt, 0);
++              if (rc != -EAGAIN)
++                      return rc;
++      }
++}
++
++
++/*
++ * Tear down what setup_vnic() established: drop the MAC from the
++ * forwarding table, free the buffer table allocations, run the
++ * hardware shutdown hook if one is set, unbind both event-channel IRQs
++ * and unmap the shared pages.  Each handle is cleared once released.
++ */
++static void cleanup_vnic(struct netback_accel *bend)
++{
++      struct xenbus_device *xdev = (struct xenbus_device *)bend->hdev_data;
++
++      DPRINTK("%s: bend %p dev %p\n", __FUNCTION__, bend, xdev);
++
++      DPRINTK("%s: Remove %p's mac from fwd table...\n",
++              __FUNCTION__, bend);
++      netback_accel_fwd_remove(bend->mac, bend->fwd_priv);
++
++      /* Free buffer table allocations */
++      netback_accel_remove_buffers(bend);
++
++      DPRINTK("%s: Release hardware resources...\n", __FUNCTION__);
++      if (bend->accel_shutdown != NULL)
++              bend->accel_shutdown(bend);
++
++      if (bend->net_channel_irq != 0) {
++              unbind_from_irqhandler(bend->net_channel_irq, xdev);
++              bend->net_channel_irq = 0;
++      }
++
++      if (bend->msg_channel_irq != 0) {
++              unbind_from_irqhandler(bend->msg_channel_irq, xdev);
++              bend->msg_channel_irq = 0;
++      }
++
++      if (bend->sh_pages_unmap == NULL)
++              return;
++
++      DPRINTK("%s: Unmap grants %p\n", __FUNCTION__,
++              bend->sh_pages_unmap);
++      net_accel_unmap_grants_contig(xdev, bend->sh_pages_unmap);
++      bend->sh_pages_unmap = NULL;
++      bend->shared_page = NULL;
++}
++
++
++/*************************************************************************/
++
++/*
++ * The following code handles accelstate changes between the frontend
++ * and the backend.  It calls setup_vnic and cleanup_vnic in matching
++ * pairs in response to transitions.
++ *
++ * Valid state transitions for Dom0 are as follows:
++ *
++ * Closed->Init       on probe or in response to Init from domU
++ * Closed->Closing    on error/remove
++ *
++ * Init->Connected    in response to Connected from domU
++ * Init->Closing      on error/remove or in response to Closing from domU
++ *
++ * Connected->Closing on error/remove or in response to Closing from domU
++ *
++ * Closing->Closed    in response to Closed from domU
++ *
++ */
++
++
++/*
++ * Advance the backend accelstate in response to a new frontend state.
++ *
++ * Records frontend_state in the bend, works out the backend state that
++ * follows from it, sets up or tears down the vnic where required, and
++ * publishes the new backend state if it changed.  Anyone sleeping on
++ * state_wait_queue is woken at the end.
++ *
++ * The caller visible in this file (bend_domu_accel_change) holds
++ * bend->bend_mutex around this call.
++ */
++static void netback_accel_frontend_changed(struct xenbus_device *dev,
++                                         XenbusState frontend_state)
++{
++      struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
++      XenbusState backend_state;
++
++      DPRINTK("%s: changing from %s to %s. nodename %s, otherend %s\n",
++              __FUNCTION__, xenbus_strstate(bend->frontend_state),
++              xenbus_strstate(frontend_state),dev->nodename, dev->otherend);
++
++      /*
++       * Ignore duplicate state changes.  This can happen if the
++       * frontend changes state twice in quick succession and the
++       * first watch fires in the backend after the second
++       * transition has completed.
++       */
++      if (bend->frontend_state == frontend_state)
++              return;
++
++      bend->frontend_state = frontend_state;
++      /* Start from the current backend state; the cases below may replace it */
++      backend_state = bend->backend_state;
++
++      switch (frontend_state) {
++      case XenbusStateInitialising:
++              /* Only restart from Closed, and never while being removed */
++              if (backend_state == XenbusStateClosed &&
++                  !bend->removing)
++                      backend_state = XenbusStateInitialising;
++              break;
++
++      case XenbusStateConnected:
++              if (backend_state == XenbusStateInitialising) {
++                      /*
++                       * Connect only if the vnic was not already set
++                       * up and setup_vnic() succeeds; otherwise start
++                       * closing.
++                       */
++                      if (!bend->vnic_is_setup &&
++                          setup_vnic(dev) == 0) {
++                              bend->vnic_is_setup = 1;
++                              backend_state = XenbusStateConnected;
++                      } else {
++                              backend_state = XenbusStateClosing;
++                      }
++              }
++              break;
++
++      case XenbusStateInitWait:
++      case XenbusStateInitialised:
++      default:
++              DPRINTK("Unknown state %s (%d) from frontend.\n",
++                      xenbus_strstate(frontend_state), frontend_state);
++              /* Unknown state.  Fall through. */
++      case XenbusStateClosing:
++              if (backend_state != XenbusStateClosed)
++                      backend_state = XenbusStateClosing;
++
++              /*
++               * The bend will now persist (with watches active) in
++               * case the frontend comes back again, eg. after
++               * frontend module reload or suspend/resume
++               */
++
++              break;
++
++      case XenbusStateUnknown:
++      case XenbusStateClosed:
++              /* The frontend is gone: release the vnic if we had one */
++              if (bend->vnic_is_setup) {
++                      bend->vnic_is_setup = 0;
++                      cleanup_vnic(bend);
++              }
++
++              if (backend_state == XenbusStateClosing)
++                      backend_state = XenbusStateClosed;
++              break;
++      }
++
++      /* Publish the new backend state only if it actually changed */
++      if (backend_state != bend->backend_state) {
++              DPRINTK("Switching from state %s (%d) to %s (%d)\n",
++                      xenbus_strstate(bend->backend_state),
++                      bend->backend_state,
++                      xenbus_strstate(backend_state), backend_state);
++              bend->backend_state = backend_state;
++              net_accel_update_state(dev, backend_state);
++      }
++
++      /* netback_accel_remove() waits on this queue for the Closed state */
++      wake_up(&bend->state_wait_queue);
++}
++
++
++/*
++ * Xenbus watch callback: accelstate on the frontend's xenbus node has
++ * changed.  Reads the new state and feeds it to the state machine with
++ * bend_mutex held.
++ *
++ * The watch is ignored if it has no node, if the device has no
++ * otherend, if the watched path no longer exists, or if the path that
++ * fired does not start with the otherend path.
++ */
++static void bend_domu_accel_change(struct xenbus_watch *watch,
++                                 const char **vec, unsigned int len)
++{
++      /*
++       * Pre-seed with Unknown so that a failed read below is handled
++       * as "frontend state unknown" rather than passing on an
++       * indeterminate value (netback_accel_remove() does the same).
++       */
++      int state = (int)XenbusStateUnknown;
++      struct netback_accel *bend;
++
++      bend = container_of(watch, struct netback_accel, domu_accel_watch);
++      if (bend->domu_accel_watch.node != NULL) {
++              struct xenbus_device *dev = 
++                      (struct xenbus_device *)bend->hdev_data;
++              VPRINTK("Watch matched, got dev %p otherend %p\n",
++                      dev, dev->otherend);
++              /*
++               * dev->otherend != NULL check to protect against
++               * watch firing when domain goes away and we haven't
++               * yet cleaned up
++               */
++              if (!dev->otherend ||
++                  !xenbus_exists(XBT_NIL, watch->node, "") ||
++                  strncmp(dev->otherend, vec[XS_WATCH_PATH],
++                          strlen(dev->otherend))) {
++                      DPRINTK("Ignoring watch as otherend seems invalid\n");
++                      return;
++              }
++
++              mutex_lock(&bend->bend_mutex);
++
++              xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d", 
++                           &state);
++              netback_accel_frontend_changed(dev, state);
++
++              mutex_unlock(&bend->bend_mutex);
++      }
++}
++
++/*
++ * Register a xenbus watch on the frontend's "accelstate" key.
++ *
++ * Returns 0 on success.  On failure the watch's node pointer is cleared
++ * and the error from xenbus_watch_path2() is returned.
++ */
++static int setup_domu_accel_watch(struct xenbus_device *dev,
++                                struct netback_accel *bend)
++{
++      int rc;
++
++      VPRINTK("Setting watch on %s/%s\n", dev->otherend, "accelstate");
++
++      rc = xenbus_watch_path2(dev, dev->otherend, "accelstate",
++                              &bend->domu_accel_watch,
++                              bend_domu_accel_change);
++      if (rc == 0)
++              return 0;
++
++      EPRINTK("%s: Failed to register xenbus watch: %d\n",
++              __FUNCTION__, rc);
++      bend->domu_accel_watch.node = NULL;
++      return rc;
++}
++
++
++/*
++ * Probe hook for the accelerated backend.
++ *
++ * Allocates and initialises the per-device netback_accel ("bend"),
++ * attaches it to the device's backend_info, reads the NIC name from
++ * xenbus, matches it to a hardware type, publishes the frontend driver
++ * name, creates the debugfs entries, registers the "limits" and
++ * "accelstate" watches, advertises XenbusStateInitialising (unless a
++ * watch has already moved the state on) and finally adds the bend to
++ * bend_list.
++ *
++ * Returns 0 on success.  On failure the steps completed so far are
++ * undone in reverse order, the bend is freed and the error is returned.
++ */
++int netback_accel_probe(struct xenbus_device *dev)
++{
++      struct netback_accel *bend;
++      struct backend_info *binfo;
++      int err;
++
++      DPRINTK("%s: passed device %s\n", __FUNCTION__, dev->nodename);
++
++      /* Allocate structure to store all our state... */
++      bend = kzalloc(sizeof(struct netback_accel), GFP_KERNEL);
++      if (bend == NULL) {
++              DPRINTK("%s: no memory for bend\n", __FUNCTION__);
++              return -ENOMEM;
++      }
++      
++      mutex_init(&bend->bend_mutex);
++
++      /* Held until the debugfs entries exist; the watches are set up unlocked */
++      mutex_lock(&bend->bend_mutex);
++
++      /* ...and store it where we can get at it */
++      binfo = dev_get_drvdata(&dev->dev);
++      binfo->netback_accel_priv = bend;
++      /* And vice-versa */
++      bend->hdev_data = dev;
++
++      DPRINTK("%s: Adding bend %p to list\n", __FUNCTION__, bend);
++      
++      init_waitqueue_head(&bend->state_wait_queue);
++      bend->vnic_is_setup = 0;
++      bend->frontend_state = XenbusStateUnknown;
++      bend->backend_state = XenbusStateClosed;
++      bend->removing = 0;
++
++      /*
++       * NOTE(review): the sscanf() result is not checked; if the
++       * nodename does not match NODENAME_PATH_FMT the fields keep the
++       * zero values from kzalloc().
++       */
++      sscanf(dev->nodename, NODENAME_PATH_FMT, &bend->far_end, 
++             &bend->vif_num);
++
++      err = read_nicname(dev, bend);
++      if (err) {
++              /*
++               * Technically not an error, just means we're not 
++               * supposed to accelerate this
++               */
++              DPRINTK("failed to get device name\n");
++              goto fail_nicname;
++      }
++
++      /*
++       * Look up the device name in the list of NICs provided by
++       * driverlink to get the hardware type.
++       */
++      err = netback_accel_sf_hwtype(bend);
++      if (err) {
++              /*
++               * Technically not an error, just means we're not
++               * supposed to accelerate this, probably belongs to
++               * some other backend
++               */
++              DPRINTK("failed to match device name\n");
++              goto fail_init_type;
++      }
++
++      err = publish_frontend_name(dev);
++      if (err)
++              goto fail_publish;
++
++      err = netback_accel_debugfs_create(bend);
++      if (err)
++              goto fail_debugfs;
++      
++      /* Both watch callbacks take bend_mutex themselves, so release it first */
++      mutex_unlock(&bend->bend_mutex);
++
++      err = setup_config_accel_watch(dev, bend);
++      if (err)
++              goto fail_config_watch;
++
++      err = setup_domu_accel_watch(dev, bend);
++      if (err)
++              goto fail_domu_watch;
++
++      /*
++       * Indicate to the other end that we're ready to start unless
++       * the watch has already fired.
++       */
++      mutex_lock(&bend->bend_mutex);
++      if (bend->backend_state == XenbusStateClosed) {
++              bend->backend_state = XenbusStateInitialising;
++              net_accel_update_state(dev, XenbusStateInitialising);
++      }
++      mutex_unlock(&bend->bend_mutex);
++
++      mutex_lock(&bend_list_mutex);
++      link_bend(bend);
++      mutex_unlock(&bend_list_mutex);
++
++      return 0;
++
++      /*
++       * Error unwinding.  The two watch labels are entered with
++       * bend_mutex released; the later labels are entered with it held.
++       */
++fail_domu_watch:
++
++      unregister_xenbus_watch(&bend->config_accel_watch);
++      kfree(bend->config_accel_watch.node);
++fail_config_watch:
++
++      /*
++       * Flush the scheduled work queue before freeing bend to get
++       * rid of any pending netback_accel_msg_rx_handler()
++       *
++       * NOTE(review): within this file handle_msg is only given
++       * INIT_WORK() in setup_vnic(), so on this path the work item
++       * may never have been initialised -- confirm that flushing it
++       * is safe.
++       */
++      flush_work_sync(&bend->handle_msg);
++
++      mutex_lock(&bend->bend_mutex);
++      net_accel_update_state(dev, XenbusStateUnknown);
++      netback_accel_debugfs_remove(bend);
++fail_debugfs:
++
++      unpublish_frontend_name(dev);
++fail_publish:
++
++      /* No need to reverse netback_accel_sf_hwtype. */
++fail_init_type:
++
++      kfree(bend->nicname);
++fail_nicname:
++      /* Detach from the backend_info so nobody can reach the freed bend */
++      binfo->netback_accel_priv = NULL;
++      mutex_unlock(&bend->bend_mutex);
++      kfree(bend);
++      return err;
++}
++
++
++/*
++ * Remove hook for the accelerated backend.
++ *
++ * Takes the bend off bend_list, moves the backend to Closing and, if
++ * the frontend is still alive, waits for the state machine to reach
++ * Closed.  It then unregisters both watches, flushes the message work
++ * item, tears down the vnic if it was set up, withdraws the published
++ * state and frontend name, and frees the bend.  Always returns 0.
++ *
++ * NOTE(review): BUG()s if no bend is attached to the device; confirm
++ * that callers never invoke this after a failed or skipped probe.
++ */
++int netback_accel_remove(struct xenbus_device *dev)
++{
++      struct backend_info *binfo;
++      struct netback_accel *bend; 
++      int frontend_state;
++
++      binfo = dev_get_drvdata(&dev->dev);
++      bend = (struct netback_accel *) binfo->netback_accel_priv;
++
++      DPRINTK("%s: dev %p bend %p\n", __FUNCTION__, dev, bend);
++      
++      BUG_ON(bend == NULL);
++      
++      mutex_lock(&bend_list_mutex);
++      unlink_bend(bend);
++      mutex_unlock(&bend_list_mutex);
++
++      mutex_lock(&bend->bend_mutex);
++
++      /* Reject any requests to connect. */
++      bend->removing = 1;
++
++      /*
++       * Switch to closing to tell the other end that we're going
++       * away.
++       */
++      if (bend->backend_state != XenbusStateClosing) {
++              bend->backend_state = XenbusStateClosing;
++              net_accel_update_state(dev, XenbusStateClosing);
++      }
++
++      /* Pre-seed with Unknown so a failed read counts as "frontend gone" */
++      frontend_state = (int)XenbusStateUnknown;
++      xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d",
++                   &frontend_state);
++
++      /* Drop the lock so the accelstate watch can run the state machine */
++      mutex_unlock(&bend->bend_mutex);
++
++      /*
++       * Wait until this end goes to the closed state.  This happens
++       * in response to the other end going to the closed state.
++       * Don't bother doing this if the other end is already closed
++       * because if it is then there is nothing to do.
++       */
++      if (frontend_state != (int)XenbusStateClosed &&
++          frontend_state != (int)XenbusStateUnknown)
++              wait_event(bend->state_wait_queue,
++                         bend->backend_state == XenbusStateClosed);
++
++      unregister_xenbus_watch(&bend->domu_accel_watch);
++      kfree(bend->domu_accel_watch.node);
++
++      unregister_xenbus_watch(&bend->config_accel_watch);
++      kfree(bend->config_accel_watch.node);
++
++      /*
++       * Flush the scheduled work queue before freeing bend to get
++       * rid of any pending netback_accel_msg_rx_handler()
++       */
++      flush_work_sync(&bend->handle_msg);
++
++      mutex_lock(&bend->bend_mutex);
++
++      /* Tear down the vnic if it was set up. */
++      if (bend->vnic_is_setup) {
++              bend->vnic_is_setup = 0;
++              cleanup_vnic(bend);
++      }
++
++      bend->backend_state = XenbusStateUnknown;
++      net_accel_update_state(dev, XenbusStateUnknown);
++
++      netback_accel_debugfs_remove(bend);
++
++      unpublish_frontend_name(dev);
++
++      kfree(bend->nicname);
++
++      /* Detach from the backend_info before the bend is freed */
++      binfo->netback_accel_priv = NULL;
++
++      mutex_unlock(&bend->bend_mutex);
++
++      kfree(bend);
++
++      return 0;
++}
++
++
++/*
++ * Sanity check that no bends remain on bend_list: BUG()s, with
++ * bend_list_mutex held, if the list is not empty.  It does not tear
++ * anything down itself.
++ */
++void netback_accel_shutdown_bends(void)
++{
++      mutex_lock(&bend_list_mutex);
++      /*
++       * I think we should have had a remove callback for all
++       * interfaces before being allowed to unload the module
++       */
++      BUG_ON(bend_list != NULL);
++      mutex_unlock(&bend_list_mutex);
++}
++
++
++/*
++ * Force the backend into XenbusStateClosing: record the state in the
++ * bend and publish it through the bend's xenbus device.
++ */
++void netback_accel_set_closing(struct netback_accel *bend) 
++{
++      struct xenbus_device *xdev = (struct xenbus_device *)bend->hdev_data;
++
++      bend->backend_state = XenbusStateClosing;
++      net_accel_update_state(xdev, XenbusStateClosing);
++}
diff --cc drivers/xen/sfc_netback/ci/compat.h

index 0000000,0000000..79f96f2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat.h
@@@ -1,0 -1,0 +1,53 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  djr
++ *  \brief  Compatibility layer.  Provides definitions of fundamental
++ *          types and definitions that are used throughout CI source
++ *          code.  It does not introduce any link time dependencies,
++ *          or include any unnecessary system headers.
++ */
++/*! \cidoxg_include_ci */
++
++#ifndef __CI_COMPAT_H__
++#define __CI_COMPAT_H__
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++#include <ci/compat/primitive.h>
++#include <ci/compat/sysdep.h>
++#include <ci/compat/utils.h>
++
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif  /* __CI_COMPAT_H__ */
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/compat/gcc.h

index 0000000,0000000..0cf77c4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/gcc.h
@@@ -1,0 -1,0 +1,158 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*! \cidoxg_include_ci_compat  */
++
++#ifndef __CI_COMPAT_GCC_H__
++#define __CI_COMPAT_GCC_H__
++
++
++#define CI_HAVE_INT64
++
++
++#if defined(__linux__) && defined(__KERNEL__)
++
++# include <linux/types.h>
++
++typedef __u64                 ci_uint64;
++typedef __s64                 ci_int64;
++# if BITS_PER_LONG == 32
++typedef __s32                 ci_ptr_arith_t;
++typedef __u32                 ci_uintptr_t;
++# else
++typedef __s64                 ci_ptr_arith_t;
++typedef __u64                 ci_uintptr_t;
++# endif
++
++
++/* it's not obvious to me why the below is wrong for x86_64, but
++ * gcc seems to complain on this platform
++ */
++# if defined(__ia64__)
++#  define CI_PRId64            "ld"
++#  define CI_PRIi64            "li"
++#  define CI_PRIo64            "lo"
++#  define CI_PRIu64            "lu"
++#  define CI_PRIx64            "lx"
++#  define CI_PRIX64            "lX"
++# else
++#  define CI_PRId64            "lld"
++#  define CI_PRIi64            "lli"
++#  define CI_PRIo64            "llo"
++#  define CI_PRIu64            "llu"
++#  define CI_PRIx64            "llx"
++#  define CI_PRIX64            "llX"
++# endif
++
++# define CI_PRId32            "d"
++# define CI_PRIi32            "i"
++# define CI_PRIo32            "o"
++# define CI_PRIu32            "u"
++# define CI_PRIx32            "x"
++# define CI_PRIX32            "X"
++
++#else
++
++# include <stdint.h>
++# include <inttypes.h>
++
++typedef uint64_t              ci_uint64;
++typedef int64_t               ci_int64;
++typedef intptr_t              ci_ptr_arith_t;
++typedef uintptr_t             ci_uintptr_t;
++
++# define CI_PRId64            PRId64
++# define CI_PRIi64            PRIi64
++# define CI_PRIo64            PRIo64
++# define CI_PRIu64            PRIu64
++# define CI_PRIx64            PRIx64
++# define CI_PRIX64            PRIX64
++
++# define CI_PRId32            PRId32
++# define CI_PRIi32            PRIi32
++# define CI_PRIo32            PRIo32
++# define CI_PRIu32            PRIu32
++# define CI_PRIx32            PRIx32
++# define CI_PRIX32            PRIX32
++
++#endif
++
++
++typedef ci_uint64                       ci_fixed_descriptor_t;
++
++#define from_fixed_descriptor(desc) ((ci_uintptr_t)(desc))
++#define to_fixed_descriptor(desc) ((ci_fixed_descriptor_t)(ci_uintptr_t)(desc))
++
++
++#if __GNUC__ >= 3 && !defined(__cplusplus)
++/*
++** Checks that [p_mbr] has the same type as [&c_type::mbr_name].
++*/
++# define CI_CONTAINER(c_type, mbr_name, p_mbr)                                \
++   __builtin_choose_expr(                                             \
++     __builtin_types_compatible_p(__typeof__(&((c_type*)0)->mbr_name),        \
++                               __typeof__(p_mbr)),                    \
++     __CI_CONTAINER(c_type, mbr_name, p_mbr), (void)0)
++
++# define ci_restrict  __restrict__
++#endif
++
++
++#if !defined(__KERNEL__) || defined(__unix__)
++#define CI_HAVE_NPRINTF  1
++#endif
++
++
++/* At what version was this introduced? */
++#if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ > 91)
++# define CI_LIKELY(t)    __builtin_expect((t), 1)
++# define CI_UNLIKELY(t)  __builtin_expect((t), 0)
++#endif
++
++/**********************************************************************
++ * Attributes
++ */
++#if __GNUC__ >= 3 && defined(NDEBUG)
++# define CI_HF __attribute__((visibility("hidden")))
++# define CI_HV __attribute__((visibility("hidden")))
++#else
++# define CI_HF
++# define CI_HV
++#endif
++
++#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
++# define ci_noinline  static __attribute__((__noinline__))
++/* (Linux 2.6 defines its own "noinline", so we use the "__noinline__" form) */
++#else
++# define ci_noinline  static
++#endif
++
++#define CI_ALIGN(x) __attribute__ ((aligned (x)))
++
++#define CI_PRINTF_LIKE(a,b) __attribute__((format(printf,a,b)))
++
++#endif  /* __CI_COMPAT_GCC_H__ */
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/compat/gcc_x86.h

index 0000000,0000000..438f0ba

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/gcc_x86.h
@@@ -1,0 -1,0 +1,115 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*! \cidoxg_include_ci_compat  */
++
++#ifndef __CI_COMPAT_GCC_X86_H__
++#define __CI_COMPAT_GCC_X86_H__
++
++/*
++** The facts:
++**
++**   SSE   sfence
++**   SSE2  lfence, mfence, pause
++*/
++
++/* 
++   Barriers to enforce ordering with respect to:
++
++   normal memory use: ci_wmb, ci_rmb, ci_mb
++   IO bus access use: ci_wiob, ci_riob, ci_iob
++*/
++#if defined(__x86_64__)
++# define ci_x86_mb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
++#else
++# define ci_x86_mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
++#endif
++
++/* ?? measure the impact of latency of sfence on a modern processor before we
++   take a decision on how to integrate with respect to writecombining */
++
++/* Every barrier below carries a "memory" clobber.  GCC documents that a
++** volatile asm may still be moved relative to other code; without the
++** clobber it may also keep values cached in registers or reorder loads and
++** stores across the asm, so the macro would not be a compiler barrier.
++*/
++#define ci_gcc_fence()    __asm__ __volatile__ ("":::"memory")
++
++#if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
++# define ci_x86_sfence()  __asm__ __volatile__ ("sfence":::"memory")
++# define ci_x86_lfence()  __asm__ __volatile__ ("lfence":::"memory")
++# define ci_x86_mfence()  __asm__ __volatile__ ("mfence":::"memory")
++#else
++# define ci_x86_sfence()  __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xF8":::"memory")
++# define ci_x86_lfence()  __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xE8":::"memory")
++# define ci_x86_mfence()  __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xF0":::"memory")
++#endif
++
++
++/* x86 processors to P4 Xeon store in-order unless executing streaming
++   extensions or when using writecombining 
++
++   Hence we do not define ci_wmb to use sfence by default. Requirement is that
++   we do not use writecombining to memory and any code which uses SSE
++   extensions must call sfence directly 
++
++   We need to track non intel clones which may support out of order store.
++
++*/
++
++#if CI_CPU_OOS
++# if CI_CPU_HAS_SSE
++#  define ci_wmb()    ci_x86_sfence()
++# else
++#  define ci_wmb()    ci_x86_mb()
++# endif
++#else
++# define ci_wmb()       ci_gcc_fence()
++#endif
++
++#if CI_CPU_HAS_SSE2
++# define ci_rmb()     ci_x86_lfence()
++# define ci_mb()      ci_x86_mfence()
++# define ci_riob()    ci_x86_lfence()
++# define ci_wiob()    ci_x86_sfence()
++# define ci_iob()     ci_x86_mfence()
++#else
++# if CI_CPU_HAS_SSE
++#  define ci_wiob()   ci_x86_sfence()
++# else
++#  define ci_wiob()   ci_x86_mb()
++# endif
++# define ci_rmb()     ci_x86_mb()
++# define ci_mb()      ci_x86_mb()
++# define ci_riob()    ci_x86_mb()
++# define ci_iob()     ci_x86_mb()
++#endif
++
++typedef unsigned long   ci_phys_addr_t;
++#define ci_phys_addr_fmt  "%lx"
++
++#endif  /* __CI_COMPAT_GCC_X86_H__ */
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/compat/primitive.h

index 0000000,0000000..3e58685

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/primitive.h
@@@ -1,0 -1,0 +1,77 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++/*! \cidoxg_include_ci_compat  */
++
++#ifndef __CI_COMPAT_PRIMITIVE_H__
++#define __CI_COMPAT_PRIMITIVE_H__
++
++
++/**********************************************************************
++ * Primitive types.
++ */
++
++typedef unsigned char                   ci_uint8;
++typedef signed char                     ci_int8;   /* plain char may be unsigned */
++
++typedef unsigned short                  ci_uint16;
++typedef short                           ci_int16;
++
++typedef unsigned int                    ci_uint32;
++typedef int                             ci_int32;
++
++/* 64-bit support is platform dependent. */
++
++
++/**********************************************************************
++ * Other fancy types.
++ */
++
++typedef ci_uint8                        ci_octet;
++
++typedef enum {
++  CI_FALSE = 0,
++  CI_TRUE
++} ci_boolean_t;
++
++
++/**********************************************************************
++ * Some nice types you'd always assumed were standards.
++ * (Really, they are SYSV "standards".)
++ */
++
++#ifdef _WIN32
++typedef unsigned long                   ulong;              
++typedef unsigned int                    uint;
++typedef char*                           caddr_t;
++#elif defined(__linux__) && defined(__KERNEL__)
++#include <linux/types.h>
++#elif defined(__linux__)
++#include <sys/types.h>
++#endif
++
++
++#endif  /* __CI_COMPAT_PRIMITIVE_H__ */
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/compat/sysdep.h

index 0000000,0000000..7f7423c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/sysdep.h
@@@ -1,0 -1,0 +1,166 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*! \cidoxg_include_ci_compat  */
++
++#ifndef __CI_COMPAT_SYSDEP_H__
++#define __CI_COMPAT_SYSDEP_H__
++
++
++/**********************************************************************
++ * Platform definition fixups.
++ */
++
++#if defined(__ci_ul_driver__) && !defined(__ci_driver__)
++# define __ci_driver__
++#endif
++
++#if defined(__ci_driver__) && !defined(__ci_ul_driver__) && \
++   !defined(__KERNEL__)
++# define __KERNEL__
++#endif
++
++
++/**********************************************************************
++ * Sanity checks (no cheating!)
++ */
++
++#if defined(__KERNEL__) && !defined(__ci_driver__)
++# error Insane.
++#endif
++
++#if defined(__KERNEL__) && defined(__ci_ul_driver__)
++# error Madness.
++#endif
++
++#if defined(__unix__) && defined(_WIN32)
++# error Strange.
++#endif
++
++#if defined(__GNUC__) && defined(_MSC_VER)
++# error Crazy.
++#endif
++
++
++/**********************************************************************
++ * Compiler and processor dependencies.
++ */
++
++#if defined(__GNUC__)
++
++# include <ci/compat/gcc.h>
++
++# if defined(__i386__)
++#  include <ci/compat/x86.h>
++#  include <ci/compat/gcc_x86.h>
++# elif defined(__x86_64__)
++#  include <ci/compat/x86_64.h>
++#  include <ci/compat/gcc_x86.h>
++# elif defined(__PPC__)
++#  include <ci/compat/ppc.h>
++#  include <ci/compat/gcc_ppc.h>
++# elif defined(__ia64__)
++#  include <ci/compat/ia64.h>
++#  include <ci/compat/gcc_ia64.h>
++# else
++#  error Unknown processor - GNU C
++# endif
++
++#elif defined(_MSC_VER)
++
++# include <ci/compat/msvc.h>
++
++# if defined(__i386__)
++#  include <ci/compat/x86.h>
++#  include <ci/compat/msvc_x86.h>
++# elif defined(__x86_64__)
++#  include <ci/compat/x86_64.h>
++#  include <ci/compat/msvc_x86_64.h>
++# else
++#  error Unknown processor MSC
++# endif
++
++#elif defined(__PGI)
++
++# include <ci/compat/x86.h>
++# include <ci/compat/pg_x86.h>
++
++#elif defined(__INTEL_COMPILER)
++
++/* Intel compilers v7 claim to be very gcc compatible. */
++# if __INTEL_COMPILER >= 700
++#  include <ci/compat/gcc.h>
++#  include <ci/compat/x86.h>
++#  include <ci/compat/gcc_x86.h>
++# else
++#  error Old Intel compiler not supported.  Yet.
++# endif
++
++#else
++# error Unknown compiler.
++#endif
++
++
++/**********************************************************************
++ * Misc stuff (that probably shouldn't be here).
++ */
++
++#ifdef __sun
++# ifdef __KERNEL__
++#  define _KERNEL
++#  define _SYSCALL32
++#  ifdef _LP64
++#   define _SYSCALL32_IMPL
++#  endif
++# else
++#  define _REENTRANT
++# endif
++#endif
++
++
++/**********************************************************************
++ * Defaults for anything left undefined.
++ */
++
++#ifndef  CI_LIKELY
++# define CI_LIKELY(t)    (t)
++# define CI_UNLIKELY(t)  (t)
++#endif
++
++#ifndef  ci_restrict
++# define ci_restrict
++#endif
++
++#ifndef  ci_inline
++# define ci_inline  static inline
++#endif
++
++#ifndef  ci_noinline
++# define ci_noinline  static
++#endif
++
++#endif  /* __CI_COMPAT_SYSDEP_H__ */
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/compat/utils.h

index 0000000,0000000..34d4c99

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/utils.h
@@@ -1,0 -1,0 +1,269 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  djr
++ *  \brief  Handy utility macros.
++ *   \date  2003/01/17
++ */
++
++/*! \cidoxg_include_ci_compat  */
++
++#ifndef __CI_COMPAT_UTILS_H__
++#define __CI_COMPAT_UTILS_H__
++
++
++/**********************************************************************
++ * Alignment -- [align] must be a power of 2.
++ **********************************************************************/
++
++  /*! Align forward onto next boundary. */
++
++#define CI_ALIGN_FWD(p, align)               (((p)+(align)-1u) & ~((align)-1u))
++
++
++  /*! Align back onto prev boundary. */
++
++#define CI_ALIGN_BACK(p, align)              ((p) & ~((align)-1u))
++
++
++  /*! How far to next boundary? */
++
++#define CI_ALIGN_NEEDED(p, align, signed_t)  (-(signed_t)(p) & ((align)-1u))
++
++
++  /*! How far beyond prev boundary? */
++
++#define CI_OFFSET(p, align)                  ((p) & ((align)-1u))
++
++
++  /*! Does object fit in gap before next boundary? */
++
++#define CI_FITS(p, size, align, signed_t)                     \
++  (CI_ALIGN_NEEDED((p) + 1, (align), signed_t) + 1 >= (size))
++
++
++  /*! Align forward onto next boundary. */
++
++#define CI_PTR_ALIGN_FWD(p, align)                                       \
++  ((char*) CI_ALIGN_FWD(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align))))
++
++  /*! Align back onto prev boundary. */
++
++#define CI_PTR_ALIGN_BACK(p, align)                                       \
++  ((char*) CI_ALIGN_BACK(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align))))
++
++  /*! How far to next boundary? */
++
++#define CI_PTR_ALIGN_NEEDED(p, align)                                 \
++  CI_ALIGN_NEEDED(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align)),   \
++                ci_ptr_arith_t)
++
++  /*! How far to next boundary? NZ = not zero i.e. give align if on boundary  */
++
++#define CI_PTR_ALIGN_NEEDED_NZ(p, align)                                      \
++  ((align) - (((char*)(p)) -                                                    \
++  ((char*) CI_ALIGN_BACK(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align))))))
++
++  /*! How far beyond prev boundary? */
++
++#define CI_PTR_OFFSET(p, align)                                       \
++  CI_OFFSET(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align)))
++
++
++  /* Same as CI_ALIGN_FWD and CI_ALIGN_BACK. */
++
++#define CI_ROUND_UP(i, align)      (((i)+(align)-1u) & ~((align)-1u))
++
++#define CI_ROUND_DOWN(i, align)    ((i) & ~((align)-1u))
++
++
++/**********************************************************************
++ * Byte-order
++ **********************************************************************/
++
++/* These are not flags.  They are enumeration values for use with
++ * CI_MY_BYTE_ORDER. */
++#define CI_BIG_ENDIAN          1
++#define CI_LITTLE_ENDIAN       0
++
++/*
++** Note that these byte-swapping primitives may leave junk in bits above
++** the range they operate on.
++**
++** The CI_BSWAP_nn() routines require that bits above [nn] are zero.  Use
++** CI_BSWAPM_nn(x) if this cannot be guaranteed.
++*/
++
++/* ?? May be able to improve on some of these with inline assembler on some
++** platforms.
++*/
++
++#define CI_BSWAP_16(v)    ((((v) & 0xff) << 8) | ((v) >> 8))
++#define CI_BSWAPM_16(v)   ((((v) & 0xff) << 8) | (((v) & 0xff00) >> 8))
++
++#define CI_BSWAP_32(v)    (((v) >> 24)               |        \
++                         (((v) & 0x00ff0000) >> 8) |  \
++                         (((v) & 0x0000ff00) << 8) |  \
++                         ((v) << 24))
++#define CI_BSWAPM_32(v)   ((((v) & 0xff000000) >> 24) |       \
++                         (((v) & 0x00ff0000) >> 8)  | \
++                         (((v) & 0x0000ff00) << 8)  | \
++                         ((v) << 24))
++
++#define CI_BSWAP_64(v)    (((v) >> 56)                        |       \
++                         (((v) & 0x00ff000000000000) >> 40) | \
++                         (((v) & 0x0000ff0000000000) >> 24) | \
++                         (((v) & 0x000000ff00000000) >> 8)  | \
++                         (((v) & 0x00000000ff000000) << 8)  | \
++                         (((v) & 0x0000000000ff0000) << 24) | \
++                         (((v) & 0x000000000000ff00) << 40) | \
++                         ((v) << 56))
++
++# define CI_BSWAPPED_16_IF(c,v)  ((c) ? CI_BSWAP_16(v) : (v))
++# define CI_BSWAPPED_32_IF(c,v)  ((c) ? CI_BSWAP_32(v) : (v))
++# define CI_BSWAPPED_64_IF(c,v)  ((c) ? CI_BSWAP_64(v) : (v))
++# define CI_BSWAP_16_IF(c,v)     do{ if((c)) (v) = CI_BSWAP_16(v); }while(0)
++# define CI_BSWAP_32_IF(c,v)     do{ if((c)) (v) = CI_BSWAP_32(v); }while(0)
++# define CI_BSWAP_64_IF(c,v)     do{ if((c)) (v) = CI_BSWAP_64(v); }while(0)
++
++#if (CI_MY_BYTE_ORDER == CI_LITTLE_ENDIAN)
++# define CI_BSWAP_LE16(v)    (v)
++# define CI_BSWAP_LE32(v)    (v)
++# define CI_BSWAP_LE64(v)    (v)
++# define CI_BSWAP_BE16(v)    CI_BSWAP_16(v)
++# define CI_BSWAP_BE32(v)    CI_BSWAP_32(v)
++# define CI_BSWAP_BE64(v)    CI_BSWAP_64(v)
++# define CI_BSWAPM_LE16(v)   (v)
++# define CI_BSWAPM_LE32(v)   (v)
++# define CI_BSWAPM_LE64(v)   (v)
++# define CI_BSWAPM_BE16(v)   CI_BSWAPM_16(v)
++# define CI_BSWAPM_BE32(v)   CI_BSWAPM_32(v)
++#elif (CI_MY_BYTE_ORDER == CI_BIG_ENDIAN)
++# define CI_BSWAP_BE16(v)    (v)
++# define CI_BSWAP_BE32(v)    (v)
++# define CI_BSWAP_BE64(v)    (v)
++# define CI_BSWAP_LE16(v)    CI_BSWAP_16(v)
++# define CI_BSWAP_LE32(v)    CI_BSWAP_32(v)
++# define CI_BSWAP_LE64(v)    CI_BSWAP_64(v)
++# define CI_BSWAPM_BE16(v)   (v)
++# define CI_BSWAPM_BE32(v)   (v)
++# define CI_BSWAPM_BE64(v)   (v)
++# define CI_BSWAPM_LE16(v)   CI_BSWAPM_16(v)
++# define CI_BSWAPM_LE32(v)   CI_BSWAPM_32(v)
++#else
++# error Bad endian.
++#endif
++
++
++/**********************************************************************
++ * Get pointer to struct from pointer to member
++ **********************************************************************/
++
++#define CI_MEMBER_OFFSET(c_type, mbr_name)  \
++  ((ci_uint32) (ci_uintptr_t)(&((c_type*)0)->mbr_name))
++
++#define CI_MEMBER_SIZE(c_type, mbr_name)        \
++  sizeof(((c_type*)0)->mbr_name)
++
++#define __CI_CONTAINER(c_type, mbr_name, p_mbr)  \
++  ( (c_type*) ((char*)(p_mbr) - CI_MEMBER_OFFSET(c_type, mbr_name)) )
++
++#ifndef CI_CONTAINER
++# define CI_CONTAINER(t,m,p)  __CI_CONTAINER(t,m,p)
++#endif
++
++
++/**********************************************************************
++ * Structure member initialiser.
++ **********************************************************************/
++
++#ifndef CI_STRUCT_MBR
++# define CI_STRUCT_MBR(name, val)     .name = val
++#endif
++
++
++/**********************************************************************
++ * min / max
++ **********************************************************************/ 
++
++#define CI_MIN(x,y) (((x) < (y)) ? (x) : (y))
++#define CI_MAX(x,y) (((x) > (y)) ? (x) : (y))
++
++/**********************************************************************
++ * abs
++ **********************************************************************/ 
++
++#define CI_ABS(x) (((x) < 0) ? -(x) : (x))
++
++/**********************************************************************
++ * Conditional debugging
++ **********************************************************************/ 
++
++#ifdef NDEBUG
++# define CI_DEBUG(x)
++# define CI_NDEBUG(x)      x
++# define CI_IF_DEBUG(y,n)  (n)
++# define CI_DEBUG_ARG(x)
++#else
++# define CI_DEBUG(x)       x
++# define CI_NDEBUG(x)
++# define CI_IF_DEBUG(y,n)  (y)
++# define CI_DEBUG_ARG(x)   ,x
++#endif
++
++#ifdef __KERNEL__
++#define CI_KERNEL_ARG(x)   ,x
++#else
++#define CI_KERNEL_ARG(x)
++#endif
++
++#ifdef _WIN32
++# define CI_KERNEL_ARG_WIN(x) CI_KERNEL_ARG(x)
++# define CI_ARG_WIN(x) ,x
++#else
++# define CI_KERNEL_ARG_WIN(x)
++# define CI_ARG_WIN(x) 
++#endif
++
++#ifdef __unix__
++# define CI_KERNEL_ARG_UNIX(x) CI_KERNEL_ARG(x)
++# define CI_ARG_UNIX(x) ,x
++#else
++# define CI_KERNEL_ARG_UNIX(x)
++# define CI_ARG_UNIX(x) 
++#endif
++
++#ifdef __linux__
++# define CI_KERNEL_ARG_LINUX(x) CI_KERNEL_ARG(x)
++# define CI_ARG_LINUX(x) ,x
++#else
++# define CI_KERNEL_ARG_LINUX(x)
++# define CI_ARG_LINUX(x) 
++#endif
++
++
++#endif  /* __CI_COMPAT_UTILS_H__ */
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/compat/x86.h

index 0000000,0000000..2c1dfb3

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/x86.h
@@@ -1,0 -1,0 +1,48 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*! \cidoxg_include_ci_compat  */
++
++#ifndef __CI_COMPAT_X86_H__
++#define __CI_COMPAT_X86_H__
++
++
++#define CI_MY_BYTE_ORDER   CI_LITTLE_ENDIAN
++
++#define CI_WORD_SIZE       4
++#define CI_PTR_SIZE        4
++
++#define CI_PAGE_SIZE       4096
++#define CI_PAGE_SHIFT      12
++#define CI_PAGE_MASK       (~(CI_PAGE_SIZE - 1))
++
++#define CI_CPU_HAS_SSE           1    /* SSE extensions supported */
++#define CI_CPU_HAS_SSE2          0    /* SSE2 extensions supported */
++#define CI_CPU_OOS       0    /* CPU does out of order stores */
++
++
++#endif  /* __CI_COMPAT_X86_H__ */
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/compat/x86_64.h

index 0000000,0000000..c09f540

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/x86_64.h
@@@ -1,0 -1,0 +1,54 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  djr
++ *  \brief  Arch stuff for AMD x86_64.
++ *   \date  2004/08/17
++ */
++
++/*! \cidoxg_include_ci_compat  */
++#ifndef __CI_COMPAT_X86_64_H__
++#define __CI_COMPAT_X86_64_H__
++
++
++#define CI_MY_BYTE_ORDER      CI_LITTLE_ENDIAN
++
++#define CI_WORD_SIZE          8
++#define CI_PTR_SIZE           8
++
++#define CI_PAGE_SIZE          4096
++#define CI_PAGE_SHIFT         12
++#define CI_PAGE_MASK          (~(CI_PAGE_SIZE - 1))
++
++#define CI_CPU_HAS_SSE                1       /* SSE extensions supported */
++
++/* SSE2 disabled while investigating BUG1060 */
++#define CI_CPU_HAS_SSE2               0       /* SSE2 extensions supported */
++#define CI_CPU_OOS            0       /* CPU does out of order stores */
++
++
++#endif  /* __CI_COMPAT_X86_64_H__ */
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/tools/config.h

index 0000000,0000000..fb802f9

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/config.h
@@@ -1,0 -1,0 +1,49 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*! \cidoxg_include_ci_tools */
++
++#ifndef __CI_TOOLS_CONFIG_H__
++#define __CI_TOOLS_CONFIG_H__
++
++
++/**********************************************************************
++ * Debugging.
++ */
++
++#define CI_INCLUDE_ASSERT_VALID           0
++
++/* Set non-zero to allow info about who has allocated what to appear in
++ * /proc/drivers/level5/mem.
++ * However - Note that doing so can lead to segfault when you unload the
++ * driver, and other weirdness.  i.e. I don't think the code for it is quite
++ * right (written by Oktet, hacked by gel), but it does work well enough to be
++ * useful.
++ */
++#define CI_MEMLEAK_DEBUG_ALLOC_TABLE    0
++
++
++#endif  /* __CI_TOOLS_CONFIG_H__ */
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/tools/debug.h

index 0000000,0000000..a25c2c4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/debug.h
@@@ -1,0 -1,0 +1,336 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*! \cidoxg_include_ci_tools */
++
++#ifndef __CI_TOOLS_DEBUG_H__
++#define __CI_TOOLS_DEBUG_H__
++
++#define CI_LOG_E(x)       x              /* errors      */
++#define CI_LOG_W(x)       x              /* warnings    */
++#define CI_LOG_I(x)       x              /* information */
++#define CI_LOG_V(x)       x              /* verbose     */
++
++/* Build time asserts. We paste the line number into the type name
++ * so that the macro can be used more than once per file even if the
++ * compiler objects to multiple identical typedefs. Collisions
++ * between use in different header files is still possible. */
++#ifndef CI_BUILD_ASSERT
++#define __CI_BUILD_ASSERT_NAME(_x) __CI_BUILD_ASSERT_ILOATHECPP(_x)
++#define __CI_BUILD_ASSERT_ILOATHECPP(_x)  __CI_BUILD_ASSERT__ ##_x
++#define CI_BUILD_ASSERT(e)\
++ typedef char  __CI_BUILD_ASSERT_NAME(__LINE__)[(e)?1:-1]
++#endif
++
++
++#ifdef NDEBUG
++
++# define _ci_check(exp, file, line)
++# define _ci_assert2(e, x, y, file, line)
++# define _ci_assert(exp, file, line)
++# define _ci_assert_equal(exp1, exp2, file, line)
++# define _ci_assert_equiv(exp1, exp2, file, line)
++# define _ci_assert_nequal(exp1, exp2, file, line)
++# define _ci_assert_le(exp1, exp2, file, line)
++# define _ci_assert_lt(exp1, exp2, file, line)
++# define _ci_assert_ge(exp1, exp2, file, line)
++# define _ci_assert_gt(exp1, exp2, file, line)
++# define _ci_assert_impl(exp1, exp2, file, line)
++# define _ci_assert_equal_msg(exp1, exp2, msg, file, line)  /* no-op, like the other NDEBUG stubs */
++# define _ci_verify(exp, file, line) \
++  do { \
++    (void)(exp); \
++  } while (0)
++
++# define CI_DEBUG_TRY(exp) \
++  do { \
++    (void)(exp); \
++  } while (0)
++
++#define CI_TRACE(exp,fmt)
++#define CI_TRACE_INT(integer)
++#define CI_TRACE_INT32(integer)
++#define CI_TRACE_INT64(integer)
++#define CI_TRACE_UINT(integer)
++#define CI_TRACE_UINT32(integer)
++#define CI_TRACE_UINT64(integer)
++#define CI_TRACE_HEX(integer)
++#define CI_TRACE_HEX32(integer)
++#define CI_TRACE_HEX64(integer)
++#define CI_TRACE_PTR(pointer)
++#define CI_TRACE_STRING(string)
++#define CI_TRACE_MAC(mac)
++#define CI_TRACE_IP(ip_be32)
++#define CI_TRACE_ARP(arp_pkt)
++
++#else
++
++# define _CI_ASSERT_FMT   "\nfrom %s:%d"
++
++# define _ci_check(exp, file, line)                             \
++  do {                                                          \
++    if (CI_UNLIKELY(!(exp)))                                    \
++      ci_warn(("ci_check(%s)"_CI_ASSERT_FMT, #exp,              \
++               (file), (line)));                                \
++  } while (0)
++
++/*
++ * NOTE: ci_fail() emits the file and line where the assert is actually
++ *       coded.
++ */
++
++# define _ci_assert(exp, file, line)                            \
++  do {                                                          \
++    if (CI_UNLIKELY(!(exp)))                                    \
++      ci_fail(("ci_assert(%s)"_CI_ASSERT_FMT, #exp,           \
++               (file), (line)));                                \
++  } while (0)
++
++# define _ci_assert2(e, x, y, file, line)  do {                 \
++    if(CI_UNLIKELY( ! (e) ))                                    \
++      ci_fail(("ci_assert(%s)\nwhere [%s=%"CI_PRIx64"] "        \
++               "[%s=%"CI_PRIx64"]\nat %s:%d\nfrom %s:%d", #e    \
++               , #x, (ci_uint64)(ci_uintptr_t)(x)               \
++               , #y, (ci_uint64)(ci_uintptr_t)(y),              \
++               __FILE__, __LINE__, (file), (line)));            \
++  } while (0)
++
++# define _ci_verify(exp, file, line)                            \
++  do {                                                          \
++    if (CI_UNLIKELY(!(exp)))                                    \
++      ci_fail(("ci_verify(%s)"_CI_ASSERT_FMT, #exp,             \
++               (file), (line)));                                \
++  } while (0)
++
++# define _ci_assert_equal(x, y, f, l)  _ci_assert2((x)==(y), x, y, (f), (l))
++# define _ci_assert_nequal(x, y, f, l) _ci_assert2((x)!=(y), x, y, (f), (l))
++# define _ci_assert_le(x, y, f, l)     _ci_assert2((x)<=(y), x, y, (f), (l))
++# define _ci_assert_lt(x, y, f, l)     _ci_assert2((x)< (y), x, y, (f), (l))
++# define _ci_assert_ge(x, y, f, l)     _ci_assert2((x)>=(y), x, y, (f), (l))
++# define _ci_assert_gt(x, y, f, l)     _ci_assert2((x)> (y), x, y, (f), (l))
++# define _ci_assert_or(x, y, f, l)     _ci_assert2((x)||(y), x, y, (f), (l))
++# define _ci_assert_impl(x, y, f, l)   _ci_assert2(!(x) || (y), x, y, (f), (l))
++# define _ci_assert_equiv(x, y, f, l)  _ci_assert2(!(x)== !(y), x, y, (f), (l))
++
++#define _ci_assert_equal_msg(exp1, exp2, msg, file, line)       \
++  do { /* msg: four chars packed in a 32-bit word, MSB first */ \
++    if (CI_UNLIKELY((exp1)!=(exp2)))                            \
++      ci_fail(("ci_assert_equal_msg(%s == %s) were "            \
++               "(%"CI_PRIx64":%"CI_PRIx64") with msg[%c%c%c%c]" \
++               _CI_ASSERT_FMT, #exp1, #exp2,                    \
++               (ci_uint64)(ci_uintptr_t)(exp1),                 \
++               (ci_uint64)(ci_uintptr_t)(exp2),                 \
++               (int)(((ci_uint32)(msg) >> 24) & 0xff),          \
++               (int)(((ci_uint32)(msg) >> 16) & 0xff),          \
++               (int)(((ci_uint32)(msg) >> 8 ) & 0xff),          \
++               (int)(((ci_uint32)(msg)      ) & 0xff),          \
++               (file), (line)));                                \
++  } while (0)
++
++# define CI_DEBUG_TRY(exp)  CI_TRY(exp)
++
++#define CI_TRACE(exp,fmt)                                             \
++  ci_log("%s:%d:%s] " #exp "=" fmt,                                     \
++         __FILE__, __LINE__, __FUNCTION__, (exp))
++
++
++#define CI_TRACE_INT(integer)                                         \
++  ci_log("%s:%d:%s] " #integer "=%d",                                   \
++         __FILE__, __LINE__, __FUNCTION__, (integer))
++
++
++#define CI_TRACE_INT32(integer)                                               \
++  ci_log("%s:%d:%s] " #integer "=%d",                                   \
++         __FILE__, __LINE__, __FUNCTION__, ((ci_int32)integer))
++
++
++#define CI_TRACE_INT64(integer)                                               \
++  ci_log("%s:%d:%s] " #integer "=%lld",                                 \
++         __FILE__, __LINE__, __FUNCTION__, ((ci_int64)integer))
++
++
++#define CI_TRACE_UINT(integer)                                          \
++  ci_log("%s:%d:%s] " #integer "=%u",                                   \
++         __FILE__, __LINE__, __FUNCTION__, (integer))  /* unsigned decimal */
++
++
++#define CI_TRACE_UINT32(integer)                                        \
++  ci_log("%s:%d:%s] " #integer "=%u",                                   \
++         __FILE__, __LINE__, __FUNCTION__, ((ci_uint32)(integer)))  /* unsigned decimal */
++
++
++#define CI_TRACE_UINT64(integer)                                        \
++  ci_log("%s:%d:%s] " #integer "=%llu",                                 \
++         __FILE__, __LINE__, __FUNCTION__, ((ci_uint64)(integer)))  /* unsigned 64-bit decimal */
++
++
++#define CI_TRACE_HEX(integer)                                         \
++  ci_log("%s:%d:%s] " #integer "=0x%x",                                 \
++         __FILE__, __LINE__, __FUNCTION__, (integer))
++
++
++#define CI_TRACE_HEX32(integer)                                               \
++  ci_log("%s:%d:%s] " #integer "=0x%x",                                 \
++         __FILE__, __LINE__, __FUNCTION__, ((ci_uint32)integer))
++
++
++#define CI_TRACE_HEX64(integer)                                               \
++  ci_log("%s:%d:%s] " #integer "=0x%llx",                               \
++         __FILE__, __LINE__, __FUNCTION__, ((ci_uint64)integer))
++
++
++#define CI_TRACE_PTR(pointer)                                         \
++  ci_log("%s:%d:%s] " #pointer "=0x%p",                                 \
++         __FILE__, __LINE__, __FUNCTION__, (pointer))
++
++
++#define CI_TRACE_STRING(string)                                               \
++  ci_log("%s:%d:%s] " #string "=%s",                                    \
++         __FILE__, __LINE__, __FUNCTION__, (string))
++
++
++#define CI_TRACE_MAC(mac)                                             \
++  ci_log("%s:%d:%s] " #mac "=" CI_MAC_PRINTF_FORMAT,                    \
++         __FILE__, __LINE__, __FUNCTION__, CI_MAC_PRINTF_ARGS(mac))
++
++
++#define CI_TRACE_IP(ip_be32)                                          \
++  ci_log("%s:%d:%s] " #ip_be32 "=" CI_IP_PRINTF_FORMAT, __FILE__,       \
++         __LINE__, __FUNCTION__, CI_IP_PRINTF_ARGS(&(ip_be32)))
++
++
++#define CI_TRACE_ARP(arp_pkt)                                           \
++  ci_log("%s:%d:%s]\n"CI_ARP_PRINTF_FORMAT,                             \
++         __FILE__, __LINE__, __FUNCTION__, CI_ARP_PRINTF_ARGS(arp_pkt))
++
++#endif  /* NDEBUG */
++
++#define ci_check(exp) \
++        _ci_check(exp, __FILE__, __LINE__)
++
++#define ci_assert(exp) \
++        _ci_assert(exp, __FILE__, __LINE__)
++
++#define ci_verify(exp) \
++        _ci_verify(exp, __FILE__, __LINE__)
++
++#define ci_assert_equal(exp1, exp2) \
++        _ci_assert_equal(exp1, exp2, __FILE__, __LINE__)
++
++#define ci_assert_equal_msg(exp1, exp2, msg) \
++        _ci_assert_equal_msg(exp1, exp2, msg, __FILE__, __LINE__)
++
++#define ci_assert_nequal(exp1, exp2) \
++        _ci_assert_nequal(exp1, exp2, __FILE__, __LINE__)
++
++#define ci_assert_le(exp1, exp2) \
++        _ci_assert_le(exp1, exp2, __FILE__, __LINE__)
++
++#define ci_assert_lt(exp1, exp2) \
++        _ci_assert_lt(exp1, exp2, __FILE__, __LINE__)
++
++#define ci_assert_ge(exp1, exp2) \
++        _ci_assert_ge(exp1, exp2, __FILE__, __LINE__)
++
++#define ci_assert_gt(exp1, exp2) \
++        _ci_assert_gt(exp1, exp2, __FILE__, __LINE__)
++
++#define ci_assert_impl(exp1, exp2) \
++        _ci_assert_impl(exp1, exp2, __FILE__, __LINE__)
++
++#define ci_assert_equiv(exp1, exp2) \
++        _ci_assert_equiv(exp1, exp2, __FILE__, __LINE__)
++
++
++#define CI_TEST(exp)                            \
++  do{                                           \
++    if( CI_UNLIKELY(!(exp)) )                   \
++      ci_fail(("CI_TEST(%s)", #exp));           \
++  }while(0)
++
++
++#define CI_TRY(exp)                           \
++  do{                                         \
++    int _trc;                                 \
++    _trc=(exp);                                       \
++    if( CI_UNLIKELY(_trc < 0) )                       \
++      ci_sys_fail(#exp, _trc);                        \
++  }while(0)
++
++
++#define CI_TRY_RET(exp)                                                        \
++  do{                                                                  \
++    int _trc;                                                          \
++    _trc=(exp);                                                                \
++    if( CI_UNLIKELY(_trc < 0) ) {                                      \
++      ci_log("%s returned %d at %s:%d", #exp, _trc, __FILE__, __LINE__); \
++      return _trc;                                                     \
++    }                                                                  \
++  }while(0)
++
++#define CI_LOGLEVEL_TRY_RET(logfn, exp)                                    \
++  do{                                                                  \
++    int _trc;                                                          \
++    _trc=(exp);                                                                \
++    if( CI_UNLIKELY(_trc < 0) ) {                                      \
++      logfn (ci_log("%s returned %d at %s:%d", #exp, _trc, __FILE__, __LINE__)); \
++      return _trc;                                                     \
++    }                                                                  \
++  }while(0)
++
++
++#define CI_SOCK_TRY(exp)                      \
++  do{                                         \
++    ci_sock_err_t _trc;                               \
++    _trc=(exp);                                       \
++    if( CI_UNLIKELY(!ci_sock_errok(_trc)) )   \
++      ci_sys_fail(#exp, _trc.val);            \
++  }while(0)
++
++
++#define CI_SOCK_TRY_RET(exp)                                               \
++  do{                                                                      \
++    ci_sock_err_t _trc;                                                            \
++    _trc=(exp);                                                                    \
++    if( CI_UNLIKELY(!ci_sock_errok(_trc)) ) {                              \
++      ci_log("%s returned %d at %s:%d", #exp, _trc.val, __FILE__, __LINE__); \
++      return ci_sock_errcode(_trc);                                        \
++    }                                                                      \
++  }while(0)
++
++
++#define CI_SOCK_TRY_SOCK_RET(exp)                                          \
++  do{                                                                      \
++    ci_sock_err_t _trc;                                                            \
++    _trc=(exp);                                                                    \
++    if( CI_UNLIKELY(!ci_sock_errok(_trc)) ) {                              \
++      ci_log("%s returned %d at %s:%d", #exp, _trc.val, __FILE__, __LINE__); \
++      return _trc;                                                         \
++    }                                                                      \
++  }while(0)
++
++#endif  /* __CI_TOOLS_DEBUG_H__ */
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/tools/log.h

index 0000000,0000000..a9d471d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/log.h
@@@ -1,0 -1,0 +1,269 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  djr
++ *  \brief  Functions for logging and pretty-printing.
++ *   \date  2002/08/07
++ */
++
++/*! \cidoxg_include_ci_tools */
++
++#ifndef __CI_TOOLS_LOG_H__
++#define __CI_TOOLS_LOG_H__
++
++#include <stdarg.h>
++
++
++/**********************************************************************
++ * Logging.
++ */
++
++/* size of internal log buffer */ 
++#define  CI_LOG_MAX_LINE        512
++/* uses of ci_log must ensure that all trace messages are shorter than this */ 
++#define  CI_LOG_MAX_MSG_LENGTH        (CI_LOG_MAX_LINE-50)
++
++extern void ci_vlog(const char* fmt, va_list args)  CI_HF;
++extern void ci_log(const char* fmt, ...) CI_PRINTF_LIKE(1,2) CI_HF;
++
++  /*! Set the prefix for log messages.
++  **
++  ** Uses the storage pointed to by \em prefix.  Therefore \em prefix must
++  ** be allocated on the heap, or statically.
++  */
++extern void ci_set_log_prefix(const char* prefix)  CI_HF;
++
++typedef void (*ci_log_fn_t)(const char* msg);
++extern ci_log_fn_t  ci_log_fn  CI_HV;
++
++/* Log functions. */
++extern void ci_log_null(const char* msg) CI_HF;
++extern void ci_log_stderr(const char* msg) CI_HF;
++extern void ci_log_stdout(const char* msg) CI_HF;
++extern void ci_log_syslog(const char* msg) CI_HF;
++
++/*! Call the following to install special logging behaviours. */
++extern void ci_log_buffer_till_fail(void) CI_HF;
++extern void ci_log_buffer_till_exit(void) CI_HF;
++
++extern void __ci_log_unique(const char* msg) CI_HF;
++extern ci_log_fn_t __ci_log_unique_fn CI_HV;
++ci_inline void ci_log_uniquify(void) {
++  if( ci_log_fn != __ci_log_unique ) {
++    __ci_log_unique_fn = ci_log_fn;
++    ci_log_fn = __ci_log_unique;
++  }
++}
++
++extern void ci_log_file(const char* msg) CI_HF;
++extern int  ci_log_file_fd CI_HV;
++
++extern void __ci_log_nth(const char* msg) CI_HF;
++extern ci_log_fn_t __ci_log_nth_fn CI_HV;
++extern int  ci_log_nth_n CI_HV;  /* default 100 */
++ci_inline void ci_log_nth(void) {
++  if( ci_log_fn != __ci_log_nth ) {
++    __ci_log_nth_fn = ci_log_fn;
++    ci_log_fn = __ci_log_nth;
++  }
++}
++
++extern int  ci_log_level  CI_HV;
++
++extern int  ci_log_options  CI_HV;
++#define CI_LOG_PID            0x1
++#define CI_LOG_TID            0x2
++#define CI_LOG_TIME           0x4
++#define CI_LOG_DELTA          0x8
++
++/**********************************************************************
++ * Used to define which mode we are in
++ */
++#if (defined(_WIN32) && !defined(__KERNEL__))
++typedef enum {
++  ci_log_md_NULL=0,
++    ci_log_md_ioctl,
++    ci_log_md_stderr,
++    ci_log_md_stdout,
++    ci_log_md_file,
++    ci_log_md_serial,
++    ci_log_md_syslog,
++    ci_log_md_pidfile
++} ci_log_mode_t;
++extern ci_log_mode_t ci_log_mode;
++#endif
++
++/**********************************************************************
++ * Pretty-printing.
++ */
++
++extern char ci_printable_char(char c) CI_HF;
++
++extern void (*ci_hex_dump_formatter)(char* buf, const ci_octet* s,
++                                   int i, int off, int len) CI_HV;
++extern void ci_hex_dump_format_octets(char*,const ci_octet*,int,int,int) CI_HF;
++extern void ci_hex_dump_format_dwords(char*,const ci_octet*,int,int,int) CI_HF;
++
++extern void ci_hex_dump_row(char* buf, volatile const void* s, int len,
++                          ci_ptr_arith_t address) CI_HF;
++  /*!< A row contains up to 16 bytes.  Row starts at [address & 15u], so
++  ** therefore [len + (address & 15u)] must be <= 16.
++  */
++
++extern void ci_hex_dump(ci_log_fn_t, volatile const void*,
++                      int len, ci_ptr_arith_t address) CI_HF;
++
++extern int  ci_hex_dump_to_raw(const char* src_hex, void* buf,
++                             unsigned* addr_out_opt, int* skip)  CI_HF;
++  /*!< Recovers raw data from a single line of a hex dump.  [buf] must be at
++  ** least 16 bytes long.  Returns the number of bytes written to [buf] (in
++  ** range 1 -> 16), or -1 if [src_hex] doesn't contain hex data.  Does not
++  ** cope with missing bytes at the start of a line.
++  */
++
++extern int ci_format_eth_addr(char* buf, const void* eth_mac_addr,
++                            char sep)  CI_HF;
++  /*!< This will write 18 characters to <buf> including terminating null.
++  ** Returns number of bytes written excluding null.  If [sep] is zero, ':'
++  ** is used.
++  */
++
++extern int ci_parse_eth_addr(void* eth_mac_addr,
++                           const char* str, char sep) CI_HF;
++  /*!< If [sep] is zero, absolutely any separator is accepted (even
++  ** inconsistent separators).  Returns 0 on success, -1 on error.
++  */
++
++extern int ci_format_ip4_addr(char* buf, unsigned addr_be32) CI_HF;
++  /*!< Formats the IP address (in network endian) in dotted-quad.  Returns
++  ** the number of bytes written (up to 15), excluding the null.  [buf]
++  ** must be at least 16 bytes long.
++  */
++
++#if defined(__unix__) && ! defined(__KERNEL__)
++extern int ci_format_select_set(char* s, int len_s, int nfds, const fd_set*);
++extern int ci_format_select(char* s, int len_s,
++                          int nfds, const fd_set* rds, const fd_set* wrs,
++                          const fd_set* exs, struct timeval* timeout);
++#endif
++
++
++/**********************************************************************
++ * Error checking.
++ */
++
++extern void (*ci_fail_stop_fn)(void) CI_HV;
++
++extern void ci_fail_stop(void) CI_HF;
++extern void ci_fail_hang(void) CI_HF;
++extern void ci_fail_bomb(void) CI_HF;
++extern void ci_backtrace(void) CI_HF;
++
++#if defined __linux__ && !defined __KERNEL__
++extern void ci_fail_abort (void) CI_HF;
++#endif
++
++#ifdef __GNUC__
++extern void
++__ci_fail(const char*, ...) CI_PRINTF_LIKE(1,2) CI_HF;
++#else
++# if _PREFAST_
++  extern void _declspec(noreturn) __ci_fail(const char* fmt, ...);
++# else 
++  extern void __ci_fail(const char* fmt, ...);
++# endif
++
++#endif
++
++#define ci_warn(x)                                                       \
++  do{ ci_log("WARN at %s:%d", __FILE__, __LINE__); }while(0)
++
++#define ci_fail(x)                                                       \
++  do{ ci_log("FAIL at %s:%d", __FILE__, __LINE__);  __ci_fail x; }while(0)
++
++extern void __ci_sys_fail(const char* fn, int rc,
++                        const char* file, int line) CI_HF;
++#define ci_sys_fail(fn, rc)  __ci_sys_fail(fn, rc, __FILE__, __LINE__)
++
++/**********************************************************************
++ * Logging to buffer (src/citools/log_buffer.c)
++ */
++
++/*! Divert ci_log() messages to the log buffer
++ *  normally they go to the  system console */
++extern void ci_log_buffer_till_fail(void) CI_HF;
++
++/*! Dump the contents of the log buffer to the system console */
++extern void ci_log_buffer_dump(void) CI_HF;
++
++
++/**********************************************************************
++ * Some useful pretty-printing.
++ */
++
++#ifdef  __linux__
++# define CI_SOCKCALL_FLAGS_FMT        "%s%s%s%s%s%s%s%s%s%s%s"
++
++# define CI_SOCKCALL_FLAGS_PRI_ARG(x)         \
++  (((x) & MSG_OOB         ) ? "OOB "         :""),    \
++  (((x) & MSG_PEEK        ) ? "PEEK "        :""),    \
++  (((x) & MSG_DONTROUTE   ) ? "DONTROUTE "   :""),    \
++  (((x) & MSG_EOR         ) ? "EOR "         :""),    \
++  (((x) & MSG_CTRUNC      ) ? "CTRUNC "      :""),    \
++  (((x) & MSG_TRUNC       ) ? "TRUNC "       :""),    \
++  (((x) & MSG_WAITALL     ) ? "WAITALL "     :""),    \
++  (((x) & MSG_DONTWAIT    ) ? "DONTWAIT "    :""),    \
++  (((x) & MSG_NOSIGNAL    ) ? "NOSIGNAL "    :""),    \
++  (((x) & MSG_ERRQUEUE    ) ? "ERRQUEUE "    :""),    \
++  (((x) & MSG_CONFIRM     ) ? "CONFIRM "     :"")
++#endif
++
++#ifdef  _WIN32
++# define CI_SOCKCALL_FLAGS_FMT        "%s%s%s"
++
++# define CI_SOCKCALL_FLAGS_PRI_ARG(x)         \
++  (((x) & MSG_OOB         ) ? "OOB "         :""),    \
++  (((x) & MSG_PEEK        ) ? "PEEK "        :""),    \
++  (((x) & MSG_DONTROUTE   ) ? "DONTROUTE "   :"")
++#endif
++
++#ifdef  __sun__
++# define CI_SOCKCALL_FLAGS_FMT        "%s%s%s%s%s%s%s%s%s"
++
++# define CI_SOCKCALL_FLAGS_PRI_ARG(x)         \
++  (((x) & MSG_OOB         ) ? "OOB "         :""),    \
++  (((x) & MSG_PEEK        ) ? "PEEK "        :""),    \
++  (((x) & MSG_DONTROUTE   ) ? "DONTROUTE "   :""),    \
++  (((x) & MSG_EOR         ) ? "EOR "         :""),    \
++  (((x) & MSG_CTRUNC      ) ? "CTRUNC "      :""),    \
++  (((x) & MSG_TRUNC       ) ? "TRUNC "       :""),    \
++  (((x) & MSG_WAITALL     ) ? "WAITALL "     :""),    \
++  (((x) & MSG_DONTWAIT    ) ? "DONTWAIT "    :""),    \
++  (((x) & MSG_NOTIFICATION) ? "NOTIFICATION" :"")
++#endif
++
++#endif  /* __CI_TOOLS_LOG_H__ */
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/tools/platform/gcc_x86.h

index 0000000,0000000..33af3f1

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/platform/gcc_x86.h
@@@ -1,0 -1,0 +1,370 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*! \cidoxg_include_ci_tools_platform  */
++
++#ifndef __CI_TOOLS_GCC_X86_H__
++#define __CI_TOOLS_GCC_X86_H__
++
++
++/**********************************************************************
++ * Free-running cycle counters.
++ */
++
++#define CI_HAVE_FRC64
++#define CI_HAVE_FRC32
++
++/* Store the low 32 bits of the time-stamp counter in *pval.  RDTSC also
++ * writes EDX, which is therefore listed as clobbered. */
++#define ci_frc32(pval)  \
++  __asm__ __volatile__("rdtsc" : "=a" (*(pval)) : : "edx")
++
++#if defined(__x86_64__)
++/* Store the 64-bit time-stamp counter in *pval.  The two halves are read
++ * into separate registers ("a" and "d") and combined in C. */
++ci_inline void ci_frc64(ci_uint64* pval) {
++  /* temp fix until we figure how to get this out in one bite */
++  ci_uint64 low, high;
++  __asm__ __volatile__("rdtsc" : "=a" (low) , "=d" (high));
++  *pval = (high << 32) | low;
++}
++
++#else
++/* 32-bit x86: the "A" constraint lets the asm write the 64-bit result
++ * directly through the EDX:EAX register pair. */
++#define ci_frc64(pval)  __asm__ __volatile__("rdtsc" : "=A" (*(pval)))
++#endif
++
++#define ci_frc_flush()  /* ?? Need a pipeline barrier. */
++
++
++/**********************************************************************
++ * Atomic integer.
++ */
++
++/*
++** int  ci_atomic_read(a)         { return a->n;        }
++** void ci_atomic_set(a, v)       { a->n = v;           }
++** void ci_atomic_inc(a)          { ++a->n;             }
++** void ci_atomic_dec(a)          { --a->n;             }
++** int  ci_atomic_inc_and_test(a) { return ++a->n == 0; }
++** int  ci_atomic_dec_and_test(a) { return --a->n == 0; }
++** void ci_atomic_and(a, v)       { a->n &= v;          }
++** void ci_atomic_or(a, v)        { a->n |= v;          }
++*/
++
++/* Atomic counter: a volatile 32-bit integer wrapped in a struct. */
++typedef struct {
++  volatile ci_int32 n;
++} ci_atomic_t;
++
++#define CI_ATOMIC_INITIALISER(i)  {(i)}
++
++/* Return the current value of the counter (a plain volatile load). */
++static inline ci_int32 ci_atomic_read(const ci_atomic_t* a)
++{
++  return a->n;
++}
++
++/* Store v in the counter, then issue a write barrier via ci_wmb(). */
++static inline void ci_atomic_set(ci_atomic_t* a, int v)
++{
++  a->n = v;
++  ci_wmb();
++}
++
++/* Atomically increment a->n (LOCK-prefixed INCL on the counter in memory). */
++static inline void ci_atomic_inc(ci_atomic_t* a)
++{ __asm__ __volatile__("lock; incl %0" : "+m" (a->n)); }
++
++ 
++/* Atomically decrement a->n (LOCK-prefixed DECL on the counter in memory). */
++static inline void ci_atomic_dec(ci_atomic_t* a)
++{ __asm__ __volatile__("lock; decl %0" : "+m" (a->n)); }
++
++/* Atomically increment a->n.  Returns non-zero iff the result is zero:
++ * SETE captures the zero flag left behind by the INCL. */
++static inline int ci_atomic_inc_and_test(ci_atomic_t* a) {
++  char r;
++  __asm__ __volatile__("lock; incl %0; sete %1"
++                     : "+m" (a->n), "=qm" (r));
++  return r;
++}
++
++/* Atomically decrement a->n.  Returns non-zero iff the result is zero:
++ * SETE captures the zero flag left behind by the DECL. */
++static inline int ci_atomic_dec_and_test(ci_atomic_t* a) {
++  char r;
++  __asm__ __volatile__("lock; decl %0; sete %1"
++                     : "+m" (a->n), "=qm" (r));
++  return r;
++}
++
++/* Atomically add v to a->n and return the value a->n held before the
++ * addition (XADD leaves the old memory value in the register operand).
++ * The asm is volatile, like the other atomic operations in this file, so
++ * the read-modify-write cannot be discarded when the caller ignores the
++ * returned value. */
++ci_inline int
++ci_atomic_xadd (ci_atomic_t *a, int v) {
++  __asm__ __volatile__("lock xadd %0, %1"
++                       : "=r" (v), "+m" (a->n) : "0" (v));
++  return v;
++}
++
++/* Atomically store v in a->n and return the value a->n held before the
++ * exchange.  Volatile for the same reason as ci_atomic_xadd(). */
++ci_inline int
++ci_atomic_xchg (ci_atomic_t *a, int v) {
++  __asm__ __volatile__("lock xchg %0, %1"
++                       : "=r" (v), "+m" (a->n) : "0" (v));
++  return v;
++}
++
++/* Atomically OR mask into the 32-bit word at p. */
++ci_inline void ci_atomic32_or(volatile ci_uint32* p, ci_uint32 mask)
++{ __asm__ __volatile__("lock; orl %1, %0" : "+m" (*p) : "ir" (mask)); }
++
++/* Atomically AND mask into the 32-bit word at p. */
++ci_inline void ci_atomic32_and(volatile ci_uint32* p, ci_uint32 mask)
++{ __asm__ __volatile__("lock; andl %1, %0" : "+m" (*p) : "ir" (mask)); }
++
++/* Atomically add v to the 32-bit word at p. */
++ci_inline void ci_atomic32_add(volatile ci_uint32* p, ci_uint32 v)
++{ __asm__ __volatile__("lock; addl %1, %0" : "+m" (*p) : "ir" (v)); }
++
++/* Atomically increment the 32-bit word at p. */
++ci_inline void ci_atomic32_inc(volatile ci_uint32* p)
++{ __asm__ __volatile__("lock; incl %0" : "+m" (*p)); }
++
++/* Atomically decrement the 32-bit word at p; returns non-zero iff the
++ * result is zero (SETE captures the zero flag left by the DECL). */
++ci_inline int ci_atomic32_dec_and_test(volatile ci_uint32* p) {
++  char r;
++  __asm__ __volatile__("lock; decl %0; sete %1" : "+m" (*p), "=qm" (r));
++  return r;
++}
++
++/* ci_atomic_t flavours of the 32-bit primitives above.  The counter field
++ * is declared volatile and the primitives take a volatile pointer, so the
++ * cast keeps that qualifier instead of silently dropping it. */
++#define ci_atomic_or(a, v)   ci_atomic32_or ((volatile ci_uint32*) &(a)->n, (v))
++#define ci_atomic_and(a, v)  ci_atomic32_and((volatile ci_uint32*) &(a)->n, (v))
++#define ci_atomic_add(a, v)  ci_atomic32_add((volatile ci_uint32*) &(a)->n, (v))
++
++/* glibc/NPTL probing helpers, defined elsewhere.
++ * ci_glibc_gs_is_multihreaded_offset is consumed by ci_is_multithreaded()
++ * below: a value >= 0 is used as a %gs-relative offset, -2 means the libc
++ * cannot be probed this way, and any other negative value triggers a call
++ * to ci_glibc_gs_get_is_multihreaded_offset() to compute it. */
++extern int ci_glibc_uses_nptl (void) CI_HF;
++extern int ci_glibc_nptl_broken(void) CI_HF;
++extern int ci_glibc_gs_get_is_multihreaded_offset (void) CI_HF;
++extern int ci_glibc_gs_is_multihreaded_offset CI_HV;
++
++#if !defined(__x86_64__)
++#ifdef __GLIBC__
++/* Returns non-zero if the calling process might be multithreaded, returns 0 if
++ * it definitely isn't (i.e. if reimplementing this function for other
++ * architectures and platforms, you can safely just return 1).
++ */
++static inline int ci_is_multithreaded (void) {
++
++  while (1) {
++    if (ci_glibc_gs_is_multihreaded_offset >= 0) {
++      /* NPTL keeps a variable that tells us this hanging off gs (i.e. in thread-
++       * local storage); just return this
++       */
++      int r;
++      __asm__ __volatile__ ("movl %%gs:(%1), %0"
++                            : "=r" (r)
++                            : "r" (ci_glibc_gs_is_multihreaded_offset));
++      return r;
++    }
++
++    if (ci_glibc_gs_is_multihreaded_offset == -2) {
++      /* This means we've already determined that the libc version is NOT good
++       * for our funky "is multithreaded" hack
++       */
++      return 1;
++    }
++
++    /* If we get here, it means this is the first time the function has been
++     * called -- detect the libc version and go around again.
++     */
++    ci_glibc_gs_is_multihreaded_offset = ci_glibc_gs_get_is_multihreaded_offset ();
++
++    /* Go around again.  We do the test here rather than at the top so that we go
++     * quicker in the common case
++     */
++  }
++}
++
++#else    /* def __GLIBC__ */
++
++#define ci_is_multithreaded() 1 /* ?? Is there a POSIX way of finding out */
++                                /*    whether the application is single */
++                                /*    threaded? */
++
++#endif   /* def __GLIBC__ */
++
++#else    /* defined __x86_64__ */
++
++static inline int ci_is_multithreaded (void) {
++  /* No easy way to tell on x86_64; so assume we're multithreaded */
++  return 1;
++}
++
++#endif    /* defined __x86_64__ */
++
++
++/**********************************************************************
++ * Compare and swap.
++ */
++
++#define CI_HAVE_COMPARE_AND_SWAP
++
++/* Atomic compare-and-swap on a signed 32-bit word: if *p equals oldval,
++ * store newval.  Returns non-zero iff the store happened (SETE on the
++ * zero flag left by CMPXCHG).  prevval only exists to tell the compiler
++ * that the accumulator register is overwritten. */
++ci_inline int ci_cas32_succeed(volatile ci_int32* p, ci_int32 oldval,
++                               ci_int32 newval) {
++  char ret;
++  ci_int32 prevval;
++  __asm__ __volatile__("lock; cmpxchgl %3, %1; sete %0"
++                     : "=q"(ret), "+m"(*p), "=a"(prevval)
++                     : "r"(newval), "a"(oldval));
++  return ret;
++}
++
++/* Same operation as ci_cas32_succeed() with the result inverted: returns
++ * non-zero iff *p did NOT equal oldval, i.e. no store happened (SETNE). */
++ci_inline int ci_cas32_fail(volatile ci_int32* p, ci_int32 oldval,
++                            ci_int32 newval) {
++  char ret;
++  ci_int32 prevval;
++  __asm__ __volatile__("lock; cmpxchgl %3, %1; setne %0"
++                     : "=q"(ret), "+m"(*p), "=a"(prevval)
++                     : "r"(newval), "a"(oldval));
++  return ret;
++}
++
++#ifdef __x86_64__
++/* 64-bit signed compare-and-swap (CMPXCHGQ), only built for x86_64.
++ * Returns non-zero iff *p equalled oldval and newval was stored. */
++ci_inline int ci_cas64_succeed(volatile ci_int64* p, ci_int64 oldval,
++                             ci_int64 newval) {
++  char ret;
++  ci_int64 prevval;
++  __asm__ __volatile__("lock; cmpxchgq %3, %1; sete %0"
++                     : "=q"(ret), "+m"(*p), "=a"(prevval)
++                     : "r"(newval), "a"(oldval));
++  return ret;
++}
++
++/* Inverse of ci_cas64_succeed(): returns non-zero iff no store happened. */
++ci_inline int ci_cas64_fail(volatile ci_int64* p, ci_int64 oldval,
++                          ci_int64 newval) {
++  char ret;
++  ci_int64 prevval;
++  __asm__ __volatile__("lock; cmpxchgq %3, %1; setne %0"
++                     : "=q"(ret), "+m"(*p), "=a"(prevval)
++                     : "r"(newval), "a"(oldval));
++  return ret;
++}
++#endif
++
++/* Unsigned 32-bit compare-and-swap: returns non-zero iff *p equalled
++ * oldval and newval was stored. */
++ci_inline int ci_cas32u_succeed(volatile ci_uint32* p, ci_uint32 oldval, ci_uint32 newval) {
++  char ret;
++  ci_uint32 prevval;
++  __asm__ __volatile__("lock; cmpxchgl %3, %1; sete %0"
++                     : "=q"(ret), "+m"(*p), "=a"(prevval)
++                     : "r"(newval), "a"(oldval));
++  return ret;
++}
++
++/* Inverse of ci_cas32u_succeed(): returns non-zero iff no store happened. */
++ci_inline int ci_cas32u_fail(volatile ci_uint32* p, ci_uint32 oldval, ci_uint32 newval) {
++  char ret;
++  ci_uint32 prevval;
++  __asm__ __volatile__("lock; cmpxchgl %3, %1; setne %0"
++                     : "=q"(ret), "+m"(*p), "=a"(prevval)
++                     : "r"(newval), "a"(oldval));
++  return ret;
++}
++
++/* Unsigned 64-bit compare-and-swap: returns non-zero iff *p equalled
++ * oldval and newval was stored.
++ * NOTE(review): this uses CMPXCHGQ but, unlike ci_cas64_succeed/fail above,
++ * is not wrapped in #ifdef __x86_64__ -- confirm that no 32-bit build ever
++ * instantiates it. */
++ci_inline int ci_cas64u_succeed(volatile ci_uint64* p, ci_uint64 oldval,
++                             ci_uint64 newval) {
++  char ret;
++  ci_uint64 prevval;
++  __asm__ __volatile__("lock; cmpxchgq %3, %1; sete %0"
++                     : "=q"(ret), "+m"(*p), "=a"(prevval)
++                     : "r"(newval), "a"(oldval));
++  return ret;
++}
++
++/* Inverse of ci_cas64u_succeed(): returns non-zero iff no store happened.
++ * NOTE(review): same missing __x86_64__ guard as ci_cas64u_succeed(). */
++ci_inline int ci_cas64u_fail(volatile ci_uint64* p, ci_uint64 oldval,
++                          ci_uint64 newval) {
++  char ret;
++  ci_uint64 prevval;
++  __asm__ __volatile__("lock; cmpxchgq %3, %1; setne %0"
++                     : "=q"(ret), "+m"(*p), "=a"(prevval)
++                     : "r"(newval), "a"(oldval));
++  return ret;
++}
++
++/* Pointer-sized compare-and-swap: maps onto the 64-bit unsigned CAS on
++ * x86_64 and onto the 32-bit unsigned CAS otherwise.  p is cast to the
++ * matching volatile word pointer. */
++#ifdef __x86_64__
++
++# define ci_cas_uintptr_succeed(p,o,n)                                \
++    ci_cas64u_succeed((volatile ci_uint64*) (p), (o), (n))
++# define ci_cas_uintptr_fail(p,o,n)                           \
++    ci_cas64u_fail((volatile ci_uint64*) (p), (o), (n))
++
++#else
++
++# define ci_cas_uintptr_succeed(p,o,n)                                \
++    ci_cas32u_succeed((volatile ci_uint32*) (p), (o), (n))
++# define ci_cas_uintptr_fail(p,o,n)                           \
++    ci_cas32u_fail((volatile ci_uint32*) (p), (o), (n))
++
++#endif
++
++
++/**********************************************************************
++ * Atomic bit field.
++ */
++
++/* A bit array is stored as an array of 32-bit words. */
++typedef ci_uint32  ci_bits;
++#define CI_BITS_N                     32u
++
++/* Declare an array called `name` with enough words to hold n bits. */
++#define CI_BITS_DECLARE(name, n)                      \
++  ci_bits name[((n) + CI_BITS_N - 1u) / CI_BITS_N]
++
++/* Zero every word needed to hold n_bits bits, starting at b. */
++ci_inline void ci_bits_clear_all(volatile ci_bits* b, int n_bits)
++{
++  unsigned n_words = (n_bits + CI_BITS_N - 1u) / CI_BITS_N;
++
++  memset((void*) b, 0, n_words * sizeof(ci_bits));
++}
++
++/* Atomically set bit i of the bit array that starts at b (LOCK BTS).
++ * BTS reads and rewrites the word, so the memory operand is "+m", not a
++ * write-only "=m".  With a register bit index the instruction can address
++ * words beyond *b, hence the "memory" clobber. */
++ci_inline void ci_bit_set(volatile ci_bits* b, int i) {
++  __asm__ __volatile__("lock; btsl %1, %0"
++                       : "+m" (*b)
++                       : "Ir" (i)
++                       : "memory");
++}
++
++/* Atomically clear bit i of the bit array that starts at b (LOCK BTR).
++ * Constraints chosen for the same reasons as in ci_bit_set(). */
++ci_inline void ci_bit_clear(volatile ci_bits* b, int i) {
++  __asm__ __volatile__("lock; btrl %1, %0"
++                       : "+m" (*b)
++                       : "Ir" (i)
++                       : "memory");
++}
++
++/* Return non-zero iff bit i of the bit array starting at b is set.
++ * SETC writes a byte register, so the result uses the "q" constraint: on
++ * 32-bit x86 a plain "r" may select a register that has no 8-bit form.
++ * The asm is volatile with a "memory" clobber because a register bit index
++ * can read words beyond *b, which the "m" operand alone does not describe;
++ * this also stops repeated tests being merged or hoisted out of a loop. */
++ci_inline int  ci_bit_test(volatile ci_bits* b, int i) {
++  char rc;
++  __asm__ __volatile__("btl %2, %1; setc %0"
++                       : "=q" (rc)
++                       : "m" (*b), "Ir" (i)
++                       : "memory");
++  return rc;
++}
++
++/* Atomically set bit i of the bit array starting at b and return non-zero
++ * iff the bit was already set (SETC captures the carry flag left by BTS).
++ * The result uses "q" because SETC needs a byte register, which "r" does
++ * not guarantee on 32-bit x86; "memory" covers bit indices that land
++ * beyond the first word. */
++ci_inline int ci_bit_test_and_set(volatile ci_bits* b, int i) {
++  char rc;
++  __asm__ __volatile__("lock; btsl %2, %1; setc %0"
++                       : "=q" (rc), "+m" (*b)
++                       : "Ir" (i)
++                       : "memory");
++  return rc;
++}
++
++/* Atomically clear bit i of the bit array starting at b and return
++ * non-zero iff the bit was previously set.  Constraints as in
++ * ci_bit_test_and_set(). */
++ci_inline int ci_bit_test_and_clear(volatile ci_bits* b, int i) {
++  char rc;
++  __asm__ __volatile__("lock; btrl %2, %1; setc %0"
++                       : "=q" (rc), "+m" (*b)
++                       : "Ir" (i)
++                       : "memory");
++  return rc;
++}
++
++/* These mask ops only work within a single ci_bits word: b points at that
++ * word, and m is OR-ed in (set) or its complement AND-ed in (clear). */
++#define ci_bit_mask_set(b,m)  ci_atomic32_or((b), (m))
++#define ci_bit_mask_clear(b,m)        ci_atomic32_and((b), ~(m))
++
++
++/**********************************************************************
++ * Misc.
++ */
++
++/* Spin-wait hint for busy loops.  Older compilers get the instruction as
++ * raw bytes -- presumably because their assembler lacks the "pause"
++ * mnemonic (TODO confirm). */
++#if __GNUC__ >= 3
++# define ci_spinloop_pause()  __asm__("pause") 
++#else
++# define ci_spinloop_pause()  __asm__(".byte 0xf3, 0x90")
++#endif
++
++
++#define CI_HAVE_ADDC32
++/* sum += v with end-around carry: ADDL adds v, then ADCL $0 folds the
++ * carry out of that addition back into sum.  sum must be an lvalue; v is
++ * converted to ci_uint32 and is parenthesised so that an expression
++ * argument is cast as a whole.  The flags are modified, so "cc" is listed
++ * as clobbered. */
++#define ci_add_carry32(sum, v)  __asm__("addl %1, %0 ;"                         \
++                                      "adcl $0, %0 ;"                   \
++                                      : "=r" (sum)                      \
++                                      : "g" ((ci_uint32) (v)), "0" (sum) \
++                                      : "cc")
++
++
++#endif  /* __CI_TOOLS_GCC_X86_H__ */
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h

index 0000000,0000000..e0870b6

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h
@@@ -1,0 -1,0 +1,361 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++
++/*! \cidoxg_include_ci_tools_platform  */
++
++#ifndef __CI_TOOLS_LINUX_KERNEL_H__
++#define __CI_TOOLS_LINUX_KERNEL_H__
++
++/**********************************************************************
++ * Need to know the kernel version.
++ */
++
++#ifndef LINUX_VERSION_CODE
++# include <linux/version.h>
++# ifndef UTS_RELEASE
++   /* 2.6.18 onwards defines UTS_RELEASE in a separate header */
++#  include <linux/utsrelease.h>
++# endif
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) || \
++    LINUX_VERSION_CODE >= KERNEL_VERSION(2,7,0)
++# error "Linux 2.6 required"
++#endif
++
++
++#include <linux/slab.h>     /* kmalloc / kfree */
++#include <linux/vmalloc.h>  /* vmalloc / vfree */
++#include <linux/interrupt.h>/* in_interrupt()  */
++#include <linux/in.h>
++#include <linux/in6.h>
++#include <linux/spinlock.h>
++#include <linux/highmem.h>
++#include <linux/ctype.h>
++#include <linux/uio.h>
++#include <asm/current.h>
++#include <asm/errno.h>
++#include <asm/kmap_types.h>
++#include <asm/semaphore.h>
++
++#include <ci/tools/config.h>
++
++/* Context predicates: thin aliases for the kernel's in_irq(),
++ * in_interrupt() and in_atomic(). */
++#define ci_in_irq        in_irq
++#define ci_in_interrupt  in_interrupt
++#define ci_in_atomic     in_atomic
++
++
++/**********************************************************************
++ * Misc stuff.
++ */
++
++/* When the kernel provides BUG, use it as CI_BOMB. */
++#ifdef BUG
++# define  CI_BOMB     BUG
++#endif
++
++/* Allocate n bytes with kmalloc(): GFP_ATOMIC when in_interrupt() reports
++ * interrupt context, GFP_KERNEL otherwise. */
++ci_inline void* __ci_alloc(size_t n)
++{
++  if( in_interrupt() )
++    return kmalloc(n, GFP_ATOMIC);
++  return kmalloc(n, GFP_KERNEL);
++}
++
++/* Allocate n bytes with kmalloc(), always using GFP_ATOMIC. */
++ci_inline void* __ci_atomic_alloc(size_t n)
++{
++  return kmalloc(n, GFP_ATOMIC);
++}
++
++/* Thin wrappers over kfree(), vmalloc() and vfree().  The void functions
++ * call the kernel routine as a statement: ISO C does not allow "return"
++ * with an expression in a function returning void. */
++ci_inline void  __ci_free(void* p)     { kfree(p);          }
++ci_inline void* __ci_vmalloc(size_t n) { return vmalloc(n); }
++ci_inline void  __ci_vfree(void* p)    { vfree(p);          }
++
++
++/* Select the allocator entry points.  With CI_MEMLEAK_DEBUG_ALLOC_TABLE
++ * set, the ci_* names map onto the *_memleak_debug variants, and the
++ * allocating macros pass the caller's __FILE__ and __LINE__. */
++#if CI_MEMLEAK_DEBUG_ALLOC_TABLE
++  #define ci_alloc(s)     ci_alloc_memleak_debug (s, __FILE__, __LINE__)
++  #define ci_atomic_alloc(s)  ci_atomic_alloc_memleak_debug(s, __FILE__, __LINE__)
++  #define ci_free         ci_free_memleak_debug
++  #define ci_vmalloc(s)   ci_vmalloc_memleak_debug (s, __FILE__,__LINE__)
++  #define ci_vfree        ci_vfree_memleak_debug
++  #define ci_alloc_fn     ci_alloc_fn_memleak_debug
++  #define ci_vmalloc_fn   ci_vmalloc_fn_memleak_debug
++#else /* !CI_MEMLEAK_DEBUG_ALLOC_TABLE */
++  #define ci_alloc_fn     __ci_alloc
++  #define ci_vmalloc_fn   __ci_vmalloc
++#endif 
++
++/* Defaults, used whenever ci_alloc has not been defined above: map the
++ * ci_* names straight onto the __ci_* wrappers.  ci_vmalloc_fn is defined
++ * again here with the same replacement text as in the #else branch. */
++#ifndef ci_alloc
++  #define ci_atomic_alloc __ci_atomic_alloc
++  #define ci_alloc        __ci_alloc
++  #define ci_free         __ci_free
++  #define ci_vmalloc      __ci_vmalloc
++  #define ci_vmalloc_fn   __ci_vmalloc
++  #define ci_vfree        __ci_vfree
++#endif
++
++/* Formatting helpers: direct aliases for the kernel's own routines. */
++#define ci_sprintf        sprintf
++#define ci_vsprintf       vsprintf
++#define ci_snprintf       snprintf
++#define ci_vsnprintf      vsnprintf
++#define ci_sscanf         sscanf
++
++
++/* Default log function for this platform. */
++#define CI_LOG_FN_DEFAULT  ci_log_syslog
++
++
++/*--------------------------------------------------------------------
++ *
++ * irqs_disabled - needed for kmap helpers on some kernels 
++ *
++ *--------------------------------------------------------------------*/
++#ifdef irqs_disabled
++# define ci_irqs_disabled irqs_disabled
++#else
++# if defined(__i386__) | defined(__x86_64__)
++#   define ci_irqs_disabled(x)                  \
++  ({                                            \
++    unsigned long flags;                        \
++    local_save_flags(flags);                    \
++    !(flags & (1<<9));                          \
++  })
++# else
++#  error "Need to implement irqs_disabled() for your architecture"
++# endif
++#endif
++
++
++/**********************************************************************
++ * kmap helpers. 
++ *
++ * Use ci_k(un)map for code paths which are not in an atomic context.
++ * For atomic code you need to use ci_k(un)map_in_atomic. This will grab
++ * one of the per-CPU kmap slots.
++ *
++ * NB in_interrupt != in_irq. If you don't know the difference then
++ * don't use kmap_in_atomic
++ *
++ * 2.4 allocates kmap slots by function. We are going to re-use the
++ * skb module's slot - we also use the same interlock
++ * 
++ * 2.6 allocates kmap slots by type as well as by function. We are
++ * going to use the currently (2.6.10) unused SOFTIRQ slot 
++ *
++ */
++
++/* Map a page with kmap().  In debug builds, BUG() if called from atomic,
++ * interrupt or hard-irq context. */
++ci_inline void* ci_kmap(struct page *page)
++{
++  CI_DEBUG(if( ci_in_atomic() || ci_in_interrupt() || ci_in_irq() )  BUG());
++
++  return kmap(page);
++}
++
++/* Undo a ci_kmap() with kunmap(). */
++ci_inline void ci_kunmap(struct page *page)
++{
++  kunmap(page);
++}
++
++/* kmap_atomic() slot used by ci_kmap_in_atomic()/ci_kunmap_in_atomic(). */
++#define CI_KM_SLOT KM_SOFTIRQ0
++
++
++/* Semaphore abstraction: direct wrappers around the kernel semaphore. */
++typedef struct semaphore ci_semaphore_t;
++
++/* Initialise sem with the count val. */
++ci_inline void ci_sem_init (ci_semaphore_t *sem, int val)
++{ sema_init (sem, val); }
++
++/* Acquire sem via down(). */
++ci_inline void ci_sem_down (ci_semaphore_t *sem)
++{ down (sem); }
++
++/* Try to acquire sem; returns whatever down_trylock() returns. */
++ci_inline int ci_sem_trydown (ci_semaphore_t *sem)
++{ return down_trylock (sem); }
++
++/* Release sem via up(). */
++ci_inline void ci_sem_up (ci_semaphore_t *sem)
++{ up (sem); }
++
++/* Return the semaphore's raw counter field. */
++ci_inline int ci_sem_get_count(ci_semaphore_t *sem)
++{ return sem->count.counter; }
++
++/* Map a page from a context that may be atomic.  Outside interrupt
++ * context and with interrupts enabled an ordinary kmap() is used;
++ * otherwise the page is mapped with kmap_atomic() in CI_KM_SLOT.
++ * In debug builds, BUG() if called from hard-irq context. */
++ci_inline void* ci_kmap_in_atomic(struct page *page)
++{
++  CI_DEBUG(if( ci_in_irq() )  BUG());
++
++  /* iSCSI can call us with irqs disabled although in_interrupt() is
++     false, in a context that can't sleep -- so check both. */
++  if( !(ci_in_interrupt() || ci_irqs_disabled()) )
++    return kmap(page);
++
++  return kmap_atomic(page, CI_KM_SLOT);
++}
++
++/* Undo a ci_kmap_in_atomic().  The context test is repeated here to pick
++ * between kunmap() and kunmap_atomic().
++ * In debug builds, BUG() if called from hard-irq context. */
++ci_inline void ci_kunmap_in_atomic(struct page *page, void* kaddr)
++{
++  CI_DEBUG(if( ci_in_irq() )  BUG());
++
++  /* iSCSI can call us with irqs disabled although in_interrupt() is
++     false, in a context that can't sleep -- so check both. */
++  if( !(ci_in_interrupt() || ci_irqs_disabled()) ) {
++    kunmap(page);
++    return;
++  }
++
++  kunmap_atomic(kaddr, CI_KM_SLOT);
++}
++
++/**********************************************************************
++ * spinlock implementation: used by <ci/tools/spinlock.h>
++ */
++
++#define CI_HAVE_SPINLOCKS
++
++typedef ci_uintptr_t                          ci_lock_holder_t;
++#define ci_lock_thisthread            (ci_lock_holder_t)current                       
++#define ci_lock_no_holder     (ci_lock_holder_t)NULL
++
++typedef spinlock_t                    ci_lock_i;
++typedef spinlock_t                    ci_irqlock_i;
++typedef unsigned long                 ci_irqlock_state_t;
++
++#define IRQLOCK_CYCLES  500000
++
++#define ci_lock_ctor_i(l)             spin_lock_init(l)
++#define ci_lock_dtor_i(l)             do{}while(0)
++#define ci_lock_lock_i(l)             spin_lock(l)
++#define ci_lock_trylock_i(l)          spin_trylock(l)
++#define ci_lock_unlock_i(l)           spin_unlock(l)
++
++#define ci_irqlock_ctor_i(l)          spin_lock_init(l)
++#define ci_irqlock_dtor_i(l)          do{}while(0)
++#define ci_irqlock_lock_i(l,s)                spin_lock_irqsave(l,*(s))
++#define ci_irqlock_unlock_i(l,s)      spin_unlock_irqrestore(l, *(s))
++
++
++/**********************************************************************
++ * register access
++ */
++
++#include <asm/io.h>
++
++/* Type used for I/O (register) addresses: an __iomem pointer from 2.6.9
++ * onwards, a plain unsigned long on older kernels. */
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
++typedef volatile void __iomem*        ioaddr_t;
++#else
++typedef unsigned long ioaddr_t;
++#endif
++
++
++
++/**********************************************************************
++ * thread implementation -- kernel dependencies probably should be
++ * moved to driver/linux_kernel.h
++ */
++
++/* Alias for the kernel's daemonize(). */
++#define ci_linux_daemonize(name) daemonize(name)
++
++#include <linux/workqueue.h>
++
++
++/* Bookkeeping for one kernel thread.  The users of these fields live
++ * outside this header, so the notes below are inferred from the field
++ * names and types -- TODO confirm. */
++typedef struct {
++  void*                       (*fn)(void* arg);  /* thread entry point      */
++  void*                       arg;               /* argument passed to fn   */
++  const char*         name;                      /* thread name             */
++  int                 thrd_id;                   /* thread identifier       */
++  struct completion   exit_event;                /* presumably signals exit */
++  struct work_struct  keventd_witem;             /* work item for keventd   */
++} ci_kernel_thread_t;
++
++
++typedef ci_kernel_thread_t* cithread_t;
++
++
++/* Thread API, implemented elsewhere. */
++extern int cithread_create(cithread_t* tid, void* (*fn)(void*), void* arg,
++                         const char* name);
++extern int cithread_detach(cithread_t kt);
++extern int cithread_join(cithread_t kt);
++
++
++/* Kernel sysctl variables.  sysctl_wmem_max / sysctl_rmem_max are only
++ * declared for kernels older than 2.6.24, in which case
++ * LINUX_HAS_SYSCTL_MEM_MAX is also defined. */
++extern int sysctl_tcp_wmem[3];
++extern int sysctl_tcp_rmem[3];
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
++#define LINUX_HAS_SYSCTL_MEM_MAX
++extern ci_uint32 sysctl_wmem_max;
++extern ci_uint32 sysctl_rmem_max;
++#endif
++
++
++/*--------------------------------------------------------------------
++ *
++ * ci_bigbuf_t: An abstraction of a large buffer.  Needed because in the
++ * Linux kernel, large buffers need to be allocated with vmalloc(), whereas
++ * smaller buffers should use kmalloc().  This abstraction chooses the
++ * appropriate mechanism.
++ *
++ *--------------------------------------------------------------------*/
++
++/* A buffer plus a record of which allocator produced it. */
++typedef struct {
++  char*               p;           /* start of the buffer                  */
++  int         is_vmalloc;  /* non-zero: from vmalloc(); zero: kmalloc() */
++} ci_bigbuf_t;
++
++
++/* Allocate `bytes` bytes into *bb.  Requests of at least a page are tried
++ * with vmalloc() first, unless the caller is in atomic context; anything
++ * else, or a failed vmalloc(), falls back to kmalloc().
++ * Returns 0 on success or -ENOMEM if no memory could be obtained.
++ *
++ * The kmalloc() flags are chosen from the same atomic-context test that
++ * gates vmalloc() (plus the interrupt test), so a caller that is atomic
++ * but not in interrupt context is never handed a GFP_KERNEL allocation
++ * that may sleep. */
++ci_inline int ci_bigbuf_alloc(ci_bigbuf_t* bb, size_t bytes) {
++  int atomic = ci_in_atomic();
++
++  if( bytes >= CI_PAGE_SIZE && ! atomic ) {
++    bb->is_vmalloc = 1;
++    if( (bb->p = vmalloc(bytes)) )  return 0;
++  }
++  bb->is_vmalloc = 0;
++  bb->p = kmalloc(bytes,
++                  (atomic || ci_in_interrupt()) ? GFP_ATOMIC : GFP_KERNEL);
++  return bb->p ? 0 : -ENOMEM;
++}
++
++/* Release the buffer with the deallocator matching how it was obtained. */
++ci_inline void ci_bigbuf_free(ci_bigbuf_t* bb)
++{
++  if( ! bb->is_vmalloc ) {
++    kfree(bb->p);
++    return;
++  }
++  vfree(bb->p);
++}
++
++/* Return the start of the buffer. */
++ci_inline char* ci_bigbuf_ptr(ci_bigbuf_t* bb)
++{
++  return bb->p;
++}
++
++/**********************************************************************
++ * struct iovec abstraction (for Windows port)
++ */
++
++/* On Linux the abstraction is simply the kernel's struct iovec. */
++typedef struct iovec ci_iovec;
++
++/* Accessors for buffer/length; i is a pointer to a ci_iovec. */
++#define CI_IOVEC_BASE(i) ((i)->iov_base)
++#define CI_IOVEC_LEN(i)  ((i)->iov_len)
++
++/**********************************************************************
++ * Signals
++ */
++
++/* Send signal signum to the current task (send_sig() with priv == 0). */
++ci_inline void ci_send_sig(int signum)
++{ send_sig(signum, current, 0); }
++
++#endif  /* __CI_TOOLS_LINUX_KERNEL_H__ */
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netback/ci/tools/sysdep.h

index 0000000,0000000..9be16dd

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/sysdep.h
@@@ -1,0 -1,0 +1,132 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*! \cidoxg_include_ci_tools */
++
++#ifndef __CI_TOOLS_SYSDEP_H__
++#define __CI_TOOLS_SYSDEP_H__
++
++/* Make this header self-sufficient */
++#include <ci/compat.h>
++#include <ci/tools/log.h>
++#include <ci/tools/debug.h>
++
++
++/**********************************************************************
++ * Platform dependencies.
++ */
++
++#if defined(__KERNEL__)
++
++# if defined(__linux__)
++#  include <ci/tools/platform/linux_kernel.h>
++# elif defined(_WIN32)
++#  include <ci/tools/platform/win32_kernel.h>
++# elif defined(__sun__)
++#  include <ci/tools/platform/sunos_kernel.h>
++# else
++#  error Unknown platform.
++# endif
++
++#elif defined(_WIN32)
++
++# include <ci/tools/platform/win32.h>
++
++#elif defined(__unix__)
++
++# include <ci/tools/platform/unix.h>
++
++#else
++
++# error Unknown platform.
++
++#endif
++
++/* Per-platform feature switches. */
++#if defined(__linux__)
++/*! Linux sendfile() support enable/disable. */
++# define CI_HAVE_SENDFILE            /* provide sendfile i/f */
++
++# define CI_HAVE_OS_NOPAGE
++#endif
++
++#if defined(__sun__)
++# define CI_HAVE_SENDFILE          /* provide sendfile i/f */
++# define CI_HAVE_SENDFILEV           /* provide sendfilev i/f */
++
++# define CI_IOCTL_SENDFILE           /*  use efrm CI_SENDFILEV ioctl */
++#endif
++
++/* OS error-code types: unsigned 32-bit on Windows, signed 32-bit on unix.
++ * Neither typedef is provided when neither _WIN32 nor __unix__ is defined. */
++#if defined(_WIN32)
++typedef ci_uint32 ci_uerr_t; /* range of OS user-mode return codes */
++typedef ci_uint32 ci_kerr_t; /* range of OS kernel-mode return codes */
++#elif defined(__unix__)
++typedef ci_int32 ci_uerr_t; /* range of OS user-mode return codes */
++typedef ci_int32 ci_kerr_t; /* range of OS kernel-mode return codes */
++#endif
++
++
++/**********************************************************************
++ * Compiler and processor dependencies.
++ */
++
++#if defined(__GNUC__)
++
++#if defined(__i386__) || defined(__x86_64__)
++# include <ci/tools/platform/gcc_x86.h>
++#elif defined(__PPC__)
++#  include <ci/tools/platform/gcc_ppc.h>
++#elif defined(__ia64__)
++#  include <ci/tools/platform/gcc_ia64.h>
++#else
++# error Unknown processor.
++#endif
++
++#elif defined(_MSC_VER)
++
++#if defined(__i386__)
++# include <ci/tools/platform/msvc_x86.h>
++# elif defined(__x86_64__)
++# include <ci/tools/platform/msvc_x86_64.h>
++#else
++# error Unknown processor.
++#endif
++
++#elif defined(__PGI)
++
++# include <ci/tools/platform/pg_x86.h>
++
++#elif defined(__INTEL_COMPILER)
++
++/* Intel compilers v7 claim to be very gcc compatible. */
++# include <ci/tools/platform/gcc_x86.h>
++
++#else
++# error Unknown compiler.
++#endif
++
++
++#endif  /* __CI_TOOLS_SYSDEP_H__ */
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netfront/Makefile

index 0000000,0000000..0e4a54b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/Makefile
@@@ -1,0 -1,0 +1,11 @@@
++# Build rules for the sfc_netfront module.
++# Include paths cover this directory, the shared sfc_netutil code and the
++# Xen netfront driver; warnings are treated as errors.
++EXTRA_CFLAGS += -Idrivers/xen/sfc_netfront -Idrivers/xen/sfc_netutil -Idrivers/xen/netfront
++EXTRA_CFLAGS += -D__ci_driver__
++EXTRA_CFLAGS += -Werror
++
++# Optional coverage instrumentation, enabled by defining GCOV.
++ifdef GCOV
++EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV
++endif
++
++obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND)   := sfc_netfront.o
++
++# Objects linked into sfc_netfront.o.
++sfc_netfront-objs := accel_msg.o accel_bufs.o accel_netfront.o accel_vi.o accel_xenbus.o accel_tso.o accel_ssr.o accel_debugfs.o falcon_event.o falcon_vi.o pt_tx.o vi_init.o
diff --cc drivers/xen/sfc_netfront/accel.h

index 0000000,0000000..6e5add1

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel.h
@@@ -1,0 -1,0 +1,495 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#ifndef NETFRONT_ACCEL_H
++#define NETFRONT_ACCEL_H
++
++#include "accel_msg_iface.h"
++#include "accel_cuckoo_hash.h"
++#include "accel_bufs.h"
++
++#include "etherfabric/ef_vi.h"
++
++#include <xen/xenbus.h>
++#include <xen/evtchn.h>
++
++#include <linux/kernel.h>
++#include <linux/slab.h>
++#include <linux/version.h>
++#include <linux/list.h>
++
++/*
++ * Result of an attempt to post a packet to the TX DMA queue; this is
++ * the return type of netfront_accel_vi_tx_post().
++ * NOTE(review): the exact meaning of BUSY vs CANT is defined by the
++ * callers/implementation, which are not in this header - confirm there.
++ */
++enum netfront_accel_post_status {
++      NETFRONT_ACCEL_STATUS_GOOD,
++      NETFRONT_ACCEL_STATUS_BUSY,
++      NETFRONT_ACCEL_STATUS_CANT
++};
++
++/*
++ * Compile-time switch for the optional internal statistics.  When
++ * NETFRONT_ACCEL_STATS is non-zero, NETFRONT_ACCEL_STATS_OP(x) expands
++ * to x; otherwise it expands to nothing, so statistics updates wrapped
++ * in it disappear from the build.
++ */
++#define NETFRONT_ACCEL_STATS 1
++#if NETFRONT_ACCEL_STATS
++#define NETFRONT_ACCEL_STATS_OP(x) x
++#else
++#define NETFRONT_ACCEL_STATS_OP(x)
++#endif
++
++
++/*
++ * Message-exchange state kept per vnic (see the msg_state field of
++ * netfront_accel_vnic).
++ * NOTE(review): presumably records which backend message (HELLO, HW)
++ * was handled last - confirm against the message handling code.
++ */
++enum netfront_accel_msg_state {
++      NETFRONT_ACCEL_MSG_NONE = 0,
++      NETFRONT_ACCEL_MSG_HELLO = 1,
++      NETFRONT_ACCEL_MSG_HW = 2
++};
++
++
++/*
++ * Per-vnic bookkeeping for jumbo packets whose fragments arrive as
++ * separate events (embedded in netfront_accel_vnic as jumbo_state).
++ * NOTE(review): field meanings below are inferred from their names -
++ * confirm against the RX event handling code.
++ */
++typedef struct {
++      u32 in_progress;        /* presumably non-zero while a packet is being assembled */
++      u32 total_len;          /* presumably bytes accumulated so far */
++      struct sk_buff *skb;    /* presumably the skb being assembled */
++} netfront_accel_jumbo_state;
++
++
++/*
++ * Per-vnic SSR state: two lists of connection objects, one for
++ * connections currently tracked and one holding free instances.
++ */
++struct netfront_accel_ssr_state {
++      /** List of tracked connections. */
++      struct list_head conns;
++
++      /** Free efx_ssr_conn instances. */
++      struct list_head free_conns;
++};
++
++
++/*
++ * Fast path packet/byte/error counters.  The vnic keeps two copies
++ * (netdev_stats and stats_last_read); these are the statistics that
++ * get added to the netdev stats.
++ */
++struct netfront_accel_netdev_stats {
++      /* Fastpath stats. */
++      u32 fastpath_rx_pkts;
++      u32 fastpath_rx_bytes;
++      u32 fastpath_rx_errors;
++      u32 fastpath_tx_pkts; 
++      u32 fastpath_tx_bytes;
++      u32 fastpath_tx_errors;
++};
++
++
++/*
++ * One dentry pointer per counter in struct netfront_accel_netdev_stats
++ * (field names match).  The vnic only embeds this structure when
++ * CONFIG_DEBUG_FS is defined.
++ */
++struct netfront_accel_netdev_dbfs {
++      struct dentry *fastpath_rx_pkts;
++      struct dentry *fastpath_rx_bytes;
++      struct dentry *fastpath_rx_errors;
++      struct dentry *fastpath_tx_pkts; 
++      struct dentry *fastpath_tx_bytes;
++      struct dentry *fastpath_tx_errors;
++};
++
++
++/*
++ * Internal, optional driver statistics.  The vnic only embeds this
++ * structure when NETFRONT_ACCEL_STATS is non-zero.
++ */
++struct netfront_accel_stats {
++      /** Fast path events */
++      u64 fastpath_tx_busy;
++
++      /** TX DMA queue status */
++      u64 fastpath_tx_completions;
++
++      /** The number of events processed. */
++      u64 event_count;
++
++      /** Number of frame trunc events seen on fastpath */
++      u64 fastpath_frm_trunc;
++
++      /** Number of rx discard (bad crc) events seen on fastpath */
++      u64 fastpath_crc_bad;
++
++      /** Number of rx discard (bad csum) events seen on fastpath */
++      u64 fastpath_csum_bad;
++
++      /** Number of rx discard (bad rights) events seen on fastpath */
++      u64 fastpath_rights_bad;
++
++      /** Number of rx discard ("other") events seen on fastpath */
++      u64 fastpath_discard_other;
++
++      /** Number of no rx descriptor trunc events seen on fastpath */
++      u64 rx_no_desc_trunc;
++
++      /** The number of misc bad events processed. */
++      u64 bad_event_count;
++
++      /** Number of events dealt with in poll loop */
++      u32 events_per_poll_max;
++      u32 events_per_poll_tx_max;
++      u32 events_per_poll_rx_max;
++
++      /** Largest number of concurrently outstanding tx descriptors */
++      u32 fastpath_tx_pending_max;
++
++      /** The number of events since the last interrupts. */
++      u32 event_count_since_irq;
++
++      /** The max number of events between interrupts. */
++      u32 events_per_irq_max;
++
++      /** The number of interrupts. */
++      u64 irq_count;
++
++      /** The number of useless interrupts. */
++      u64 useless_irq_count;
++
++      /** The number of polls scheduled. */
++      u64 poll_schedule_count;
++
++      /** The number of polls called. */
++      u64 poll_call_count;
++
++      /** The number of rechecks. */
++      u64 poll_reschedule_count;
++
++      /** Number of times we've called netif_stop_queue/netif_wake_queue */
++      u64 queue_stops;
++      u64 queue_wakes;
++
++      /** SSR stats */
++      u64 ssr_bursts;
++      u64 ssr_drop_stream;
++      u64 ssr_misorder;
++      u64 ssr_slow_start;
++      u64 ssr_merges;
++      u64 ssr_too_many;
++      u64 ssr_new_stream;
++};
++
++
++/*
++ * One dentry pointer per counter in struct netfront_accel_stats (field
++ * names match).  The vnic only embeds this structure when both
++ * NETFRONT_ACCEL_STATS is non-zero and CONFIG_DEBUG_FS is defined.
++ */
++struct netfront_accel_dbfs {
++      struct dentry *fastpath_tx_busy;
++      struct dentry *fastpath_tx_completions;
++      struct dentry *fastpath_tx_pending_max;
++      struct dentry *fastpath_frm_trunc;
++      struct dentry *fastpath_crc_bad;
++      struct dentry *fastpath_csum_bad;
++      struct dentry *fastpath_rights_bad;
++      struct dentry *fastpath_discard_other;
++      struct dentry *rx_no_desc_trunc;
++      struct dentry *event_count;
++      struct dentry *bad_event_count;
++      struct dentry *events_per_poll_max;
++      struct dentry *events_per_poll_rx_max;
++      struct dentry *events_per_poll_tx_max;
++      struct dentry *event_count_since_irq;
++      struct dentry *events_per_irq_max;
++      struct dentry *irq_count;
++      struct dentry *useless_irq_count;
++      struct dentry *poll_schedule_count;
++      struct dentry *poll_call_count;
++      struct dentry *poll_reschedule_count;
++      struct dentry *queue_stops;
++      struct dentry *queue_wakes;
++      struct dentry *ssr_bursts;
++      struct dentry *ssr_drop_stream;
++      struct dentry *ssr_misorder;
++      struct dentry *ssr_slow_start;
++      struct dentry *ssr_merges;
++      struct dentry *ssr_too_many;
++      struct dentry *ssr_new_stream;
++};
++
++
++/*
++ * Per-interface state of the accelerated netfront plugin.  It gathers
++ * the buffer pools, the virtual interface (ef_vi) and its event queue,
++ * the event channels and shared pages used to talk to the backend, the
++ * xenbus watches and state, and the statistics for one accelerated
++ * network device.
++ */
++typedef struct netfront_accel_vnic {
++      /* NOTE(review): presumably links vnics into a singly linked list - confirm */
++      struct netfront_accel_vnic *next;
++      
++      struct mutex vnic_mutex;
++
++      spinlock_t tx_lock;
++
++      struct netfront_accel_bufpages bufpages;
++      struct netfront_accel_bufinfo *rx_bufs;
++      struct netfront_accel_bufinfo *tx_bufs;
++      
++      /** Hardware & VI state */
++      ef_vi vi;
++
++      ef_vi_state *vi_state;
++
++      ef_eventq_state evq_state;
++
++      void *evq_mapping;
++
++      /** Hardware dependent state */
++      union {
++              struct {
++                      /** Falcon A or B */
++                      enum net_accel_hw_type type; 
++                      u32 *evq_rptr;
++                      u32 *doorbell;
++                      void *evq_rptr_mapping;
++                      void *doorbell_mapping;
++                      void *txdmaq_mapping;
++                      void *rxdmaq_mapping;
++              } falcon;
++      } hw;
++  
++      /** RX DMA queue status */
++      u32 rx_dma_level;
++
++      /** Number of RX descriptors waiting to be pushed to the card. */
++      u32 rx_dma_batched;
++#define NETFRONT_ACCEL_RX_DESC_BATCH 16
++
++      /**
++       * Hash table of remote mac addresses to decide whether to try
++       * fast path
++       */
++      cuckoo_hash_table fastpath_table;
++      spinlock_t table_lock;
++
++      /** the local mac address of virtual interface we're accelerating */
++      u8 mac[ETH_ALEN];
++
++      int rx_pkt_stride;
++      int rx_skb_stride;
++
++      /**
++       * Keep track of fragments of jumbo packets as events are
++       * delivered by NIC 
++       */
++      netfront_accel_jumbo_state jumbo_state;
++
++      struct net_device *net_dev;
++
++      /** These two gate the enabling of fast path operations */
++      int frontend_ready;
++      int backend_netdev_up;
++
++      int irq_enabled;
++      spinlock_t irq_enabled_lock;
++
++      int tx_enabled;
++
++      int poll_enabled;
++
++      /** A spare slot for a TX packet.  This is treated as an
++       * extension of the DMA queue.  Reads require either
++       * netfront's tx_lock or the vnic tx_lock; writes require both
++       * locks */
++      struct sk_buff *tx_skb;
++
++      /** Keep track of fragments of SSR packets */
++      struct netfront_accel_ssr_state ssr_state;
++
++      struct xenbus_device *dev;
++
++      /** Event channel for messages */
++      int msg_channel;
++      int msg_channel_irq;
++
++      /** Event channel for network interrupts. */
++      int net_channel;
++      int net_channel_irq;
++
++      struct net_accel_shared_page *shared_page;
++
++      grant_ref_t ctrl_page_gnt;
++      grant_ref_t msg_page_gnt;
++
++      /** Message Qs, 1 each way. */
++      sh_msg_fifo2 to_dom0;
++      sh_msg_fifo2 from_dom0;
++
++      enum netfront_accel_msg_state msg_state;
++
++      /** Watch on accelstate */
++      struct xenbus_watch backend_accel_watch;
++      /** Watch on frontend's MAC address */
++      struct xenbus_watch mac_address_watch;
++
++      /** Work to process received irq/msg */
++      struct work_struct msg_from_bend;
++
++      /** Wait queue for changes in accelstate. */
++      wait_queue_head_t state_wait_queue;
++
++      /** The current accelstate of this driver. */
++      XenbusState frontend_state;
++
++      /** The most recent accelstate seen by the xenbus watch. */
++      XenbusState backend_state;
++
++      /** Non-zero if we should reject requests to connect. */
++      int removing;
++
++      /** Non-zero if the domU shared state has been initialised. */
++      int domU_state_is_setup;
++
++      /** Non-zero if the dom0 shared state has been initialised. */
++      int dom0_state_is_setup;
++
++      /* Those statistics that are added to the netdev stats */
++      struct netfront_accel_netdev_stats netdev_stats;
++      struct netfront_accel_netdev_stats stats_last_read;
++#ifdef CONFIG_DEBUG_FS
++      struct netfront_accel_netdev_dbfs netdev_dbfs;
++#endif
++
++      /* These statistics are internal and optional */
++#if NETFRONT_ACCEL_STATS
++      struct netfront_accel_stats stats;
++#ifdef CONFIG_DEBUG_FS
++      struct netfront_accel_dbfs dbfs;
++#endif
++#endif
++
++      /** Debugfs directory for this interface */
++      struct dentry *dbfs_dir;
++} netfront_accel_vnic;
++
++
++/* Module parameters */
++extern unsigned sfc_netfront_max_pages;
++extern unsigned sfc_netfront_buffer_split;
++
++extern const char *frontend_name;
++extern struct netfront_accel_hooks accel_hooks;
++extern struct workqueue_struct *netfront_accel_workqueue;
++
++
++extern
++void netfront_accel_vi_ctor(netfront_accel_vnic *vnic);
++
++extern
++int netfront_accel_vi_init(netfront_accel_vnic *vnic, 
++                         struct net_accel_msg_hw *hw_msg);
++
++extern
++void netfront_accel_vi_dtor(netfront_accel_vnic *vnic);
++
++
++/**
++ * Add new buffers which have been registered with the NIC.
++ *
++ * @v   vnic     The vnic instance to process the response.
++ *
++ * The buffers contained in the message are added to the buffer pool.
++ */
++extern
++void netfront_accel_vi_add_bufs(netfront_accel_vnic *vnic, int is_rx);
++
++/**
++ * Put a packet on the tx DMA queue.
++ *
++ * @v  vnic    The vnic instance to accept the packet.
++ * @v  skb     A sk_buff to send.
++ *
++ * Attempt to send a packet.  On success, the skb is owned by the DMA
++ * queue and will be released when the completion event arrives.
++ */
++extern enum netfront_accel_post_status
++netfront_accel_vi_tx_post(netfront_accel_vnic *vnic,
++                        struct sk_buff *skb);
++
++
++/**
++ * Process events in response to an interrupt.
++ *
++ * @v   vnic       The vnic instance to poll.
++ * @v   rx_packets The maximum number of rx packets to process.
++ * @ret rx_done    The number of rx packets processed.
++ *
++ * The vnic will process events until there are no more events
++ * remaining or the specified number of rx packets has been processed.
++ * The split from the interrupt call is to allow Linux NAPI
++ * polling.
++ */
++extern
++int netfront_accel_vi_poll(netfront_accel_vnic *vnic, int rx_packets);
++
++
++/**
++ * Iterate over the fragments of a packet buffer.
++ *
++ * @v   skb      The packet buffer to examine.
++ * @v   idx      A variable name for the fragment index.
++ * @v   data     A variable name for the address of the fragment data.
++ * @v   length   A variable name for the fragment length.
++ * @v   code     A section of code to execute for each fragment.
++ *
++ * This macro iterates over the fragments in a packet buffer and
++ * executes the code for each of them.
++ */
++#define NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT(skb, frag_idx,               \
++                                               frag_data, frag_len,   \
++                                               code)                  \
++      do {                                                            \
++              int frag_idx;                                           \
++              void *frag_data;                                        \
++              unsigned int      frag_len;                             \
++                                                                      \
++              frag_data = skb->data;                                  \
++              frag_len = skb_headlen(skb);                            \
++              frag_idx = 0;                                           \
++              while (1) { /* For each fragment */                     \
++                      code;                                           \
++                      if (frag_idx >= skb_shinfo(skb)->nr_frags) {    \
++                              break;                                  \
++                      } else {                                        \
++                              skb_frag_t *fragment;                   \
++                              fragment = &skb_shinfo(skb)->frags[frag_idx]; \
++                              frag_len = fragment->size;              \
++                              frag_data = ((void*)page_address(fragment->page) \
++                                           + fragment->page_offset);  \
++                      };                                              \
++                      frag_idx++;                                     \
++              }                                                       \
++      } while(0)
++
++/* Mask the event channel that delivers network interrupts for @vnic. */
++static inline void
++netfront_accel_disable_net_interrupts(netfront_accel_vnic *vnic)
++{
++      int port = vnic->net_channel;
++
++      mask_evtchn(port);
++}
++
++/* Unmask the event channel that delivers network interrupts for @vnic. */
++static inline void
++netfront_accel_enable_net_interrupts(netfront_accel_vnic *vnic)
++{
++      int port = vnic->net_channel;
++
++      unmask_evtchn(port);
++}
++
++void netfront_accel_msg_tx_fastpath(netfront_accel_vnic *vnic, const void *mac,
++                                  u32 ip, u16 port, u8 protocol);
++
++/* Process an IRQ received from back end driver */
++irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context);
++irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context);
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++extern void netfront_accel_msg_from_bend(struct work_struct *context);
++#else
++extern void netfront_accel_msg_from_bend(void *context);
++#endif
++
++extern void vnic_stop_fastpath(netfront_accel_vnic *vnic);
++
++extern int netfront_accel_probe(struct net_device *net_dev, 
++                              struct xenbus_device *dev);
++extern int netfront_accel_remove(struct xenbus_device *dev);
++extern void netfront_accel_set_closing(netfront_accel_vnic *vnic);
++
++extern int netfront_accel_vi_enable_interrupts(netfront_accel_vnic *vnic);
++
++extern void netfront_accel_debugfs_init(void);
++extern void netfront_accel_debugfs_fini(void);
++extern int netfront_accel_debugfs_create(netfront_accel_vnic *vnic);
++extern int netfront_accel_debugfs_remove(netfront_accel_vnic *vnic);
++
++#endif /* NETFRONT_ACCEL_H */
diff --cc drivers/xen/sfc_netfront/accel_bufs.c

index 0000000,0000000..f96f73c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_bufs.c
@@@ -1,0 -1,0 +1,393 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <xen/gnttab.h>
++
++#include "accel_bufs.h"
++#include "accel_util.h"
++
++#include "accel.h"
++
++
++/*
++ * Allocate the zero-filled table of descriptor-block pointers for a
++ * buffer manager that will handle up to @pages pages.
++ *
++ * Returns 0 on success, -ENOMEM if the table could not be allocated.
++ */
++static int 
++netfront_accel_alloc_buf_desc_blocks(struct netfront_accel_bufinfo *manager,
++                                   int pages)
++{
++      size_t table_bytes = sizeof(struct netfront_accel_pkt_desc *) *
++              NETFRONT_ACCEL_BUF_NUM_BLOCKS(pages);
++
++      manager->desc_blocks = kzalloc(table_bytes, GFP_KERNEL);
++
++      return manager->desc_blocks != NULL ? 0 : -ENOMEM;
++}
++
++/*
++ * Allocate the per-page bookkeeping arrays of @bufpages: page_list
++ * (left uninitialised) and grant_list (zero-filled), each with @pages
++ * entries.
++ *
++ * Returns 0 on success.  On failure returns -ENOMEM with nothing left
++ * allocated and page_list reset to NULL.
++ */
++static int 
++netfront_accel_alloc_buf_lists(struct netfront_accel_bufpages *bufpages,
++                             int pages)
++{
++      bufpages->page_list = kmalloc(pages * sizeof(void *), GFP_KERNEL);
++      if (!bufpages->page_list)
++              return -ENOMEM;
++
++      bufpages->grant_list = kzalloc(pages * sizeof(grant_ref_t), GFP_KERNEL);
++      if (bufpages->grant_list)
++              return 0;
++
++      /* Second allocation failed: undo the first one. */
++      kfree(bufpages->page_list);
++      bufpages->page_list = NULL;
++      return -ENOMEM;
++}
++
++
++/*
++ * Allocate all memory backing the packet buffers of one vnic:
++ *  - the descriptor-block tables of the RX and TX buffer managers
++ *    (TX gets pages / sfc_netfront_buffer_split pages, RX the rest),
++ *  - the page and grant lists in @bufpages,
++ *  - @pages free pages, stored in bufpages->page_list.
++ *
++ * Returns 0 on success.  On failure returns a negative errno; everything
++ * allocated by this call has been released and the corresponding
++ * pointers reset to NULL.
++ */
++int netfront_accel_alloc_buffer_mem(struct netfront_accel_bufpages *bufpages,
++                                  struct netfront_accel_bufinfo *rx_manager,
++                                  struct netfront_accel_bufinfo *tx_manager,
++                                  int pages)
++{
++      int n, rc;
++
++      if ((rc = netfront_accel_alloc_buf_desc_blocks
++           (rx_manager, pages - (pages / sfc_netfront_buffer_split))) < 0) {
++              goto rx_fail;
++      }
++
++      if ((rc = netfront_accel_alloc_buf_desc_blocks
++           (tx_manager, pages / sfc_netfront_buffer_split)) < 0) {
++              goto tx_fail;
++      }
++
++      if ((rc = netfront_accel_alloc_buf_lists(bufpages, pages)) < 0) {
++              goto lists_fail;
++      }
++
++      for (n = 0; n < pages; n++) {
++              void *tmp = (void*)__get_free_page(GFP_KERNEL);
++              if (tmp == NULL)
++                      break;
++
++              bufpages->page_list[n] = tmp;
++      }
++
++      if (n != pages) {
++              EPRINTK("%s: not enough pages: %d != %d\n", __FUNCTION__, n, 
++                      pages);
++              /*
++               * Only slots [0, n) were filled; slot n is where the
++               * allocation failed and page_list is not zero-initialised,
++               * so it must not be passed to free_page().
++               */
++              while (--n >= 0)
++                      free_page((unsigned long)(bufpages->page_list[n]));
++              rc = -ENOMEM;
++              goto pages_fail;
++      }
++
++      bufpages->max_pages = pages;
++      bufpages->page_reqs = 0;
++
++      return 0;
++
++ pages_fail:
++      kfree(bufpages->page_list);
++      kfree(bufpages->grant_list);
++
++      bufpages->page_list = NULL;
++      bufpages->grant_list = NULL;
++ lists_fail:
++      kfree(tx_manager->desc_blocks);
++      tx_manager->desc_blocks = NULL;
++
++ tx_fail:
++      kfree(rx_manager->desc_blocks);
++      rx_manager->desc_blocks = NULL;
++ rx_fail:
++      return rc;
++}
++
++
++/*
++ * Release the buffer memory of one vnic: ungrant every page that still
++ * has a non-zero grant reference, free the pages themselves, then free
++ * the page/grant lists and both managers' descriptor-block tables.
++ * Does nothing when bufpages->max_pages is zero.
++ */
++void netfront_accel_free_buffer_mem(struct netfront_accel_bufpages *bufpages,
++                                  struct netfront_accel_bufinfo *rx_manager,
++                                  struct netfront_accel_bufinfo *tx_manager)
++{
++      int pg;
++
++      if (!bufpages->max_pages)
++              return;
++
++      for (pg = 0; pg < bufpages->max_pages; pg++) {
++              if (bufpages->grant_list[pg] != 0)
++                      net_accel_ungrant_page(bufpages->grant_list[pg]);
++              free_page((unsigned long)(bufpages->page_list[pg]));
++      }
++
++      kfree(bufpages->page_list);
++      kfree(bufpages->grant_list);
++      kfree(rx_manager->desc_blocks);
++      kfree(tx_manager->desc_blocks);
++}
++
++
++/*
++ * Create an empty buffer manager: no pages, no buffers in use, empty
++ * free list (first_free == -1) and no descriptor blocks.
++ *
++ * If @lock is non-NULL the manager uses it and never takes it itself
++ * (internally_locked == 0).  If @lock is NULL a private spinlock is
++ * allocated and initialised, and the manager locks around its own
++ * operations (internally_locked == 1).
++ *
++ * Returns the new manager, or NULL if memory could not be allocated.
++ */
++struct netfront_accel_bufinfo *netfront_accel_init_bufs(spinlock_t *lock)
++{
++      struct netfront_accel_bufinfo *mgr;
++
++      mgr = kmalloc(sizeof(*mgr), GFP_KERNEL);
++      if (mgr == NULL)
++              return NULL;
++
++      mgr->nused = 0;
++      mgr->npages = 0;
++      mgr->first_free = -1;
++      mgr->desc_blocks = NULL;
++
++      if (lock != NULL) {
++              /* Caller-provided lock: the caller does the locking. */
++              mgr->lock = lock;
++              mgr->internally_locked = 0;
++              return mgr;
++      }
++
++      mgr->lock = kmalloc(sizeof(*mgr->lock), GFP_KERNEL);
++      if (mgr->lock == NULL) {
++              kfree(mgr);
++              return NULL;
++      }
++      spin_lock_init(mgr->lock);
++      mgr->internally_locked = 1;
++
++      return mgr;
++}
++
++
++/*
++ * Free a buffer manager.  The lock is freed too, but only when the
++ * manager owns it (internally_locked); a caller-supplied lock is left
++ * alone.
++ */
++void netfront_accel_fini_bufs(struct netfront_accel_bufinfo *bufs)
++{
++      spinlock_t *owned_lock = bufs->internally_locked ? bufs->lock : NULL;
++
++      /* kfree(NULL) is a no-op, so this is safe for borrowed locks. */
++      kfree(owned_lock);
++      kfree(bufs);
++}
++
++
++/*
++ * Fill in @msg as a NET_ACCEL_MSG_MAPBUF request for @pages pages of
++ * @bufpages starting at index @offset, granting each page to the other
++ * end of @dev as needed.
++ *
++ * Pages that already carry a grant reference (from an earlier attempt
++ * whose message could not be queued) are reused rather than granted
++ * again.  msg->u.mapbufs.reqid is set to @offset.
++ *
++ * Returns 0 on success.  If granting a page fails, returns -EIO after
++ * ungranting the pages at indices below the failing one and clearing
++ * their grant_list entries; the failing entry itself is left at 0.
++ */
++int netfront_accel_buf_map_request(struct xenbus_device *dev,
++                                 struct netfront_accel_bufpages *bufpages,
++                                 struct net_accel_msg *msg, 
++                                 int pages, int offset)
++{
++      int i;
++      int err;
++
++      net_accel_msg_init(msg, NET_ACCEL_MSG_MAPBUF);
++
++      BUG_ON(pages > NET_ACCEL_MSG_MAX_PAGE_REQ);
++
++      msg->u.mapbufs.pages = pages;
++
++      for (i = 0; i < msg->u.mapbufs.pages; i++) {
++              unsigned long mfn;
++              int gnt;
++
++              /* 
++               * This can happen if we tried to send this message
++               * earlier but the queue was full.
++               */
++              if (bufpages->grant_list[offset+i] != 0) {
++                      msg->u.mapbufs.grants[i] = 
++                              bufpages->grant_list[offset+i];
++                      continue;
++              }
++
++              mfn = virt_to_mfn(bufpages->page_list[offset+i]);
++              VPRINTK("%s: Granting page %d, mfn %08lx\n",
++                      __FUNCTION__, i, mfn);
++
++              /*
++               * Test the raw return value before storing it: the
++               * stored copies may be unsigned, and a failed grant must
++               * not be left in grant_list where teardown would later
++               * try to ungrant it.
++               */
++              gnt = net_accel_grant_page(dev, mfn, 0);
++              if (gnt < 0) {
++                      EPRINTK("%s: Failed to grant buffer: %d\n",
++                              __FUNCTION__, gnt);
++                      err = -EIO;
++                      goto error;
++              }
++
++              bufpages->grant_list[offset+i] = gnt;
++              msg->u.mapbufs.grants[i] = gnt;
++      }
++
++      /* This is interpreted on return as the offset in the page_list */
++      msg->u.mapbufs.reqid = offset;
++
++      return 0;
++
++error:
++      /* Ungrant all the pages we've successfully granted. */
++      for (i--; i >= 0; i--) {
++              net_accel_ungrant_page(bufpages->grant_list[offset+i]);
++              bufpages->grant_list[offset+i] = 0;
++      }
++      return err;
++}
++
++
++/*
++ * Process a response to a buffer request.
++ *
++ * @msg must be a NET_ACCEL_MSG_MAPBUF reply (BUG otherwise).  It carries
++ * the number of pages mapped, the buffer address assigned to the first
++ * of them (u.mapbufs.buf) and, in reqid, the index into
++ * bufpages->page_list of the first page the request covered.
++ *
++ * Under manager->lock the function grows @manager by that many pages:
++ * it allocates any descriptor blocks that are now needed, initialises
++ * NETFRONT_ACCEL_BUFS_PER_PAGE descriptors per new page and pushes each
++ * one onto the head of the manager's free list.
++ *
++ * Returns 0 on success, or -ENOMEM if a descriptor block could not be
++ * allocated; in that case npages is unchanged, and blocks allocated
++ * earlier in the same call stay in desc_blocks, where a later call
++ * finds them already present.
++ */
++int netfront_accel_add_bufs(struct netfront_accel_bufpages *bufpages,
++                          struct netfront_accel_bufinfo *manager, 
++                          struct net_accel_msg *msg)
++{
++      int msg_pages, page_offset, i, newtot;
++      int old_block_count, new_block_count;
++      u32 msg_buf;
++      unsigned long flags;
++
++      VPRINTK("%s: manager %p msg %p\n", __FUNCTION__, manager, msg);
++
++      BUG_ON(msg->id != (NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY));
++
++      msg_pages = msg->u.mapbufs.pages;
++      msg_buf = msg->u.mapbufs.buf;
++      page_offset = msg->u.mapbufs.reqid;
++
++      spin_lock_irqsave(manager->lock, flags);
++      newtot = manager->npages + msg_pages;
++      /* Block counts before and after, rounding up to whole blocks. */
++      old_block_count = 
++              (manager->npages + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK - 1) >>
++              NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT;
++      new_block_count = 
++              (newtot + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK - 1) >>
++              NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT;
++
++      /* Allocate the descriptor blocks that the new pages spill into. */
++      for (i = old_block_count; i < new_block_count; i++) {
++              struct netfront_accel_pkt_desc *block;
++              if (manager->desc_blocks[i] != NULL) {
++                      VPRINTK("Not needed\n");
++                      continue;
++              }
++              /* GFP_ATOMIC: the manager spinlock is held with IRQs off. */
++              block = kzalloc(NETFRONT_ACCEL_BUFS_PER_BLOCK * 
++                              sizeof(netfront_accel_pkt_desc), GFP_ATOMIC);
++              if (block == NULL) {
++                      spin_unlock_irqrestore(manager->lock, flags);
++                      return -ENOMEM;
++              }
++              manager->desc_blocks[i] = block;
++      }
++      /* i is the manager-wide page number, j the page's index in this reply. */
++      for (i = manager->npages; i < newtot; i++) {
++              int k, j = i - manager->npages;
++              int block_num;
++              int block_idx;
++              struct netfront_accel_pkt_desc *pkt;
++
++              block_num = i >> NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT;
++              block_idx = (NETFRONT_ACCEL_BUFS_PER_PAGE*i)
++                      & (NETFRONT_ACCEL_BUFS_PER_BLOCK-1);
++
++              /* First descriptor of page i within its block. */
++              pkt = manager->desc_blocks[block_num] + block_idx;
++              
++              for (k = 0; k < NETFRONT_ACCEL_BUFS_PER_PAGE; k++) {
++                      BUG_ON(page_offset + j >= bufpages->max_pages);
++
++                      pkt[k].buf_id = NETFRONT_ACCEL_BUFS_PER_PAGE * i + k;
++                      /* Each page is split evenly between its buffers. */
++                      pkt[k].pkt_kva = bufpages->page_list[page_offset + j] +
++                              (PAGE_SIZE/NETFRONT_ACCEL_BUFS_PER_PAGE) * k;
++                      pkt[k].pkt_buff_addr = msg_buf +
++                              (PAGE_SIZE/NETFRONT_ACCEL_BUFS_PER_PAGE) * 
++                              (NETFRONT_ACCEL_BUFS_PER_PAGE * j + k);
++                      /* Push the descriptor onto the head of the free list. */
++                      pkt[k].next_free = manager->first_free;
++                      manager->first_free = pkt[k].buf_id;
++                      /* Stamp the buffer id into the start of the buffer memory. */
++                      *(int*)(pkt[k].pkt_kva) = pkt[k].buf_id;
++
++                      VPRINTK("buf %d desc %p kva %p buffaddr %x\n",
++                              pkt[k].buf_id, &(pkt[k]), pkt[k].pkt_kva, 
++                              pkt[k].pkt_buff_addr);
++              }
++      }
++      manager->npages = newtot;
++      spin_unlock_irqrestore(manager->lock, flags);
++      VPRINTK("Added %d pages. Total is now %d\n", msg_pages,
++              manager->npages);
++      return 0;
++}
++
++
++/*
++ * Translate a buffer id into its descriptor.  The id is split into a
++ * descriptor-block number and a slot within that block.  BUGs if the
++ * id is beyond the buffers currently managed.
++ */
++netfront_accel_pkt_desc *
++netfront_accel_buf_find(struct netfront_accel_bufinfo *manager, u16 id)
++{
++      int slot = id & (NETFRONT_ACCEL_BUFS_PER_BLOCK - 1);
++      int blk = id >> NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT;
++
++      BUG_ON(id >= manager->npages * NETFRONT_ACCEL_BUFS_PER_PAGE);
++      BUG_ON(slot >= NETFRONT_ACCEL_BUFS_PER_BLOCK);
++
++      return &manager->desc_blocks[blk][slot];
++}
++
++
++/*
++ * Allocate a buffer from the buffer manager.
++ *
++ * Pops the descriptor at the head of the free list and counts it as in
++ * use.  The manager's lock is taken here only when the manager owns it
++ * (internally_locked).
++ *
++ * Returns the descriptor, or NULL when no buffer is free.
++ */
++netfront_accel_pkt_desc *
++netfront_accel_buf_get(struct netfront_accel_bufinfo *manager)
++{
++      int bufno = -1;
++      netfront_accel_pkt_desc *buf = NULL;
++      unsigned long flags = 0;
++
++      /* Any spare?  (Cheap check made without the lock.) */
++      if (manager->first_free == -1)
++              return NULL;
++      /* Take lock */
++      if (manager->internally_locked)
++              spin_lock_irqsave(manager->lock, flags);
++      /* Re-read under the lock: the list may have drained meanwhile. */
++      bufno = manager->first_free;
++      if (bufno != -1) {
++              buf = netfront_accel_buf_find(manager, bufno);
++              manager->first_free = buf->next_free;
++              manager->nused++;
++      }
++      /* Release lock */
++      if (manager->internally_locked)
++              spin_unlock_irqrestore(manager->lock, flags);
++
++      /*
++       * Nothing was left once we held the lock; bail out before the
++       * debug print below dereferences buf.
++       */
++      if (buf == NULL)
++              return NULL;
++
++      /* Tell the world */
++      VPRINTK("Allocated buffer %i, buffaddr %x\n", bufno,
++              buf->pkt_buff_addr);
++
++      return buf;
++}
++
++
++/*
++ * Release a buffer back to the buffer manager pool.
++ *
++ * The descriptor for @id is pushed onto the head of the free list and
++ * the in-use count is decremented.  The manager's lock is taken here
++ * only when the manager owns it (internally_locked).
++ *
++ * Returns 1 if the free list was empty before this call, 0 otherwise.
++ */
++int netfront_accel_buf_put(struct netfront_accel_bufinfo *manager, u16 id)
++{
++      netfront_accel_pkt_desc *desc = netfront_accel_buf_find(manager, id);
++      unsigned long irq_flags = 0;
++      unsigned was_empty;
++
++      VPRINTK("Freeing buffer %i\n", id);
++      BUG_ON(id == (u16)-1);
++
++      if (manager->internally_locked)
++              spin_lock_irqsave(manager->lock, irq_flags);
++
++      was_empty = (manager->first_free == -1) ? 1 : 0;
++
++      desc->next_free = manager->first_free;
++      manager->first_free = id;
++      manager->nused--;
++
++      if (manager->internally_locked)
++              spin_unlock_irqrestore(manager->lock, irq_flags);
++
++      return was_empty;
++}
diff --cc drivers/xen/sfc_netfront/accel_bufs.h

index 0000000,0000000..4ff3eaa

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_bufs.h
@@@ -1,0 -1,0 +1,181 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#ifndef NETFRONT_ACCEL_BUFS_H
++#define NETFRONT_ACCEL_BUFS_H
++
++#include <linux/skbuff.h>
++#include <linux/spinlock.h>
++#include <xen/xenbus.h>
++
++#include "accel_msg_iface.h"
++
++
++/*! Buffer descriptor structure */
++typedef struct netfront_accel_pkt_desc {
++      int buf_id;
++      /* NOTE(review): presumably the buffer address as seen by the NIC - confirm */
++      u32 pkt_buff_addr;
++      /* NOTE(review): presumably the kernel virtual address of the buffer - confirm */
++      void *pkt_kva;
++      /* This is the socket buffer currently married to this buffer */
++      struct sk_buff *skb;
++      /*
++       * Free-list link: id of the next free buffer, written by
++       * netfront_accel_buf_put() and consumed by netfront_accel_buf_get().
++       * -1 when there is no further free buffer.
++       */
++      int next_free;
++} netfront_accel_pkt_desc;
++
++
++/* Default buffer page count */
++#define NETFRONT_ACCEL_DEFAULT_BUF_PAGES (384)
++/* Pages are grouped into blocks of 2^4 = 16 pages */
++#define NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT (4)
++#define NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK            \
++      (1 << (NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT))
++/* Each page holds 2^1 = 2 buffers */
++#define NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT (1)
++#define NETFRONT_ACCEL_BUFS_PER_PAGE                  \
++      (1 << (NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT))
++/* Hence each block holds 2^(4+1) = 32 buffers */
++#define NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT           \
++      (NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT +     \
++       NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT)
++#define NETFRONT_ACCEL_BUFS_PER_BLOCK                 \
++      (1 << NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT)
++/* Number of blocks needed to cover max_pages pages, rounded up */
++#define NETFRONT_ACCEL_BUF_NUM_BLOCKS(max_pages)                      \
++      (((max_pages)+NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK-1) /           \
++       NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK)
++
++/*! Buffer management structure. */
++struct netfront_accel_bufinfo {
++      /* number added to this manager */
++      unsigned npages;
++      /* number currently used from this manager */
++      unsigned nused;
++
++      /* id of the buffer at the head of the free list, -1 if none is free */
++      int first_free;
++
++      /*
++       * When non-zero, netfront_accel_buf_get()/netfront_accel_buf_put()
++       * take *lock themselves around free-list updates.
++       */
++      int internally_locked;
++      spinlock_t *lock;
++
++      /*
++       * array of pointers (length NETFRONT_ACCEL_BUF_NUM_BLOCKS) to
++       * pkt descs
++       */
++      struct netfront_accel_pkt_desc **desc_blocks; 
++};
++
++
++/*! Bookkeeping for the pages that back the packet buffers. */
++struct netfront_accel_bufpages {
++      /* length of lists of pages/grants */
++      int max_pages;
++      /* list of pages allocated for network buffers */
++      void **page_list;
++      /* list of grants for the above pages */
++      grant_ref_t *grant_list;
++      
++      /* number of page requests that have been made */
++      /* (vnic_send_buffer_requests() advances this by the page count of
++       * each request it sends, and uses it as the next request's offset) */
++      unsigned page_reqs;
++};
++
++
++/*! Allocate memory for the buffer manager, set up locks etc.
++ * Optionally takes a lock to use, if not supplied it makes its own.
++ *
++ * \return pointer to netfront_accel_bufinfo structure that represents the
++ * buffer manager
++ */
++extern struct netfront_accel_bufinfo *
++netfront_accel_init_bufs(spinlock_t *lock);
++
++/*! Allocate memory for the buffers
++ */
++extern int
++netfront_accel_alloc_buffer_mem(struct netfront_accel_bufpages *bufpages,
++                              struct netfront_accel_bufinfo *rx_res,
++                              struct netfront_accel_bufinfo *tx_res,
++                              int pages);
++extern void
++netfront_accel_free_buffer_mem(struct netfront_accel_bufpages *bufpages,
++                             struct netfront_accel_bufinfo *rx_res,
++                             struct netfront_accel_bufinfo *tx_res);
++
++/*! Release memory for the buffer manager, buffers, etc.
++ *
++ * \param manager pointer to netfront_accel_bufinfo structure that
++ * represents the buffer manager
++ */
++extern void netfront_accel_fini_bufs(struct netfront_accel_bufinfo *manager);
++
++/*! Release a buffer.
++ *
++ * \param manager  The buffer manager which owns the buffer.
++ * \param id   The buffer identifier.
++ */
++extern int netfront_accel_buf_put(struct netfront_accel_bufinfo *manager, 
++                                u16 id);
++
++/*! Get the packet descriptor associated with a buffer id.
++ *
++ * \param manager  The buffer manager which owns the buffer.
++ * \param id       The buffer identifier.
++ *
++ * The returned value is the packet descriptor for this buffer.
++ */
++extern netfront_accel_pkt_desc *
++netfront_accel_buf_find(struct netfront_accel_bufinfo *manager, u16 id);
++
++
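++/*! Fill out a message request for some buffers to be mapped by the
++ * back end driver
++ * 
++ * \param dev The xenbus device
++ * \param bufpages The page/grant bookkeeping for the buffers
++ * \param msg Pointer to a net_accel_msg to complete.
++ * \param pages Number of pages covered by this request
++ * \param offset Index of the first page covered by this request
++ * \return 0 on success
++ */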
++extern int 
++netfront_accel_buf_map_request(struct xenbus_device *dev,
++                             struct netfront_accel_bufpages *bufpages,
++                             struct net_accel_msg *msg, 
++                             int pages, int offset);
++
++/*! Process a response to a buffer request. 
++ * 
++ * Deal with a received message from the back end in response to our
++ * request for buffers
++ * 
++ * \param manager The buffer manager
++ * \param msg The received message from the back end describing new
++ * buffers
++ * \return 0 on success
++ */
++extern int 
++netfront_accel_add_bufs(struct netfront_accel_bufpages *bufpages,
++                      struct netfront_accel_bufinfo *manager,
++                      struct net_accel_msg *msg);
++
++
++/*! Allocate a buffer from the buffer manager 
++ *
++ * \param manager The buffer manager data structure
++ * \return Pointer to buffer descriptor, or NULL if no buffer is free.
++ */
++struct netfront_accel_pkt_desc *
++netfront_accel_buf_get(struct netfront_accel_bufinfo *manager);
++
++#endif /* NETFRONT_ACCEL_BUFS_H */
++
diff --cc drivers/xen/sfc_netfront/accel_debugfs.c

index 0000000,0000000..cd2d2c5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_debugfs.c
@@@ -1,0 -1,0 +1,227 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <linux/fs.h>
++#include <linux/debugfs.h>
++
++#include "accel.h"
++
++#if defined(CONFIG_DEBUG_FS)
++/* Driver-wide debugfs directory; parent of every per-vnic directory */
++static struct dentry *sfc_debugfs_root = NULL;
++#endif
++
++/*
++ * Create the driver's debugfs directory, named frontend_name, with no
++ * parent directory given.  The result is not checked here;
++ * netfront_accel_debugfs_create() refuses to proceed while it is NULL.
++ * Does nothing when CONFIG_DEBUG_FS is not set.
++ */
++void netfront_accel_debugfs_init(void) 
++{
++#if defined(CONFIG_DEBUG_FS)
++      sfc_debugfs_root = debugfs_create_dir(frontend_name, NULL);
++#endif
++}
++
++
++/*
++ * Remove the driver's debugfs directory if it was created.
++ * Does nothing when CONFIG_DEBUG_FS is not set.
++ */
++void netfront_accel_debugfs_fini(void)
++{
++#if defined(CONFIG_DEBUG_FS)
++      if (sfc_debugfs_root == NULL)
++              return;
++
++      debugfs_remove(sfc_debugfs_root);
++#endif
++}
++
++
++int netfront_accel_debugfs_create(netfront_accel_vnic *vnic)
++{
++#if defined(CONFIG_DEBUG_FS)
++      if (sfc_debugfs_root == NULL)
++              return -ENOENT;
++
++      vnic->dbfs_dir = debugfs_create_dir(vnic->net_dev->name, 
++                                          sfc_debugfs_root);
++      if (vnic->dbfs_dir == NULL)
++              return -ENOMEM;
++
++      vnic->netdev_dbfs.fastpath_rx_pkts = debugfs_create_u32
++              ("fastpath_rx_pkts", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->netdev_stats.fastpath_rx_pkts);
++      vnic->netdev_dbfs.fastpath_rx_bytes = debugfs_create_u32
++              ("fastpath_rx_bytes", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->netdev_stats.fastpath_rx_bytes);
++      vnic->netdev_dbfs.fastpath_rx_errors = debugfs_create_u32
++              ("fastpath_rx_errors", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->netdev_stats.fastpath_rx_errors);
++      vnic->netdev_dbfs.fastpath_tx_pkts = debugfs_create_u32
++              ("fastpath_tx_pkts", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->netdev_stats.fastpath_tx_pkts);
++      vnic->netdev_dbfs.fastpath_tx_bytes = debugfs_create_u32
++              ("fastpath_tx_bytes", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->netdev_stats.fastpath_tx_bytes);
++      vnic->netdev_dbfs.fastpath_tx_errors = debugfs_create_u32
++              ("fastpath_tx_errors", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->netdev_stats.fastpath_tx_errors);
++
++#if NETFRONT_ACCEL_STATS
++      vnic->dbfs.irq_count = debugfs_create_u64
++              ("irq_count", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.irq_count);
++      vnic->dbfs.useless_irq_count = debugfs_create_u64
++              ("useless_irq_count", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.useless_irq_count);
++      vnic->dbfs.poll_schedule_count = debugfs_create_u64
++              ("poll_schedule_count", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.poll_schedule_count);
++      vnic->dbfs.poll_call_count = debugfs_create_u64
++              ("poll_call_count", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.poll_call_count);
++      vnic->dbfs.poll_reschedule_count = debugfs_create_u64
++              ("poll_reschedule_count", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.poll_reschedule_count);
++      vnic->dbfs.queue_stops = debugfs_create_u64
++              ("queue_stops", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.queue_stops);
++      vnic->dbfs.queue_wakes = debugfs_create_u64
++              ("queue_wakes", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.queue_wakes);
++      vnic->dbfs.ssr_bursts = debugfs_create_u64
++              ("ssr_bursts", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.ssr_bursts);
++      vnic->dbfs.ssr_drop_stream = debugfs_create_u64
++              ("ssr_drop_stream", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.ssr_drop_stream);
++      vnic->dbfs.ssr_misorder = debugfs_create_u64
++              ("ssr_misorder", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.ssr_misorder);
++      vnic->dbfs.ssr_slow_start = debugfs_create_u64
++              ("ssr_slow_start", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.ssr_slow_start);
++      vnic->dbfs.ssr_merges = debugfs_create_u64
++              ("ssr_merges", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.ssr_merges);
++      vnic->dbfs.ssr_too_many = debugfs_create_u64
++              ("ssr_too_many", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.ssr_too_many);
++      vnic->dbfs.ssr_new_stream = debugfs_create_u64
++              ("ssr_new_stream", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.ssr_new_stream);
++
++      vnic->dbfs.fastpath_tx_busy = debugfs_create_u64
++              ("fastpath_tx_busy", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.fastpath_tx_busy);
++      vnic->dbfs.fastpath_tx_completions = debugfs_create_u64
++              ("fastpath_tx_completions", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.fastpath_tx_completions);
++      vnic->dbfs.fastpath_tx_pending_max = debugfs_create_u32
++              ("fastpath_tx_pending_max", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.fastpath_tx_pending_max);
++      vnic->dbfs.event_count = debugfs_create_u64
++              ("event_count", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.event_count);
++      vnic->dbfs.bad_event_count = debugfs_create_u64
++              ("bad_event_count", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.bad_event_count);
++      vnic->dbfs.event_count_since_irq = debugfs_create_u32
++              ("event_count_since_irq", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.event_count_since_irq);
++      vnic->dbfs.events_per_irq_max = debugfs_create_u32
++              ("events_per_irq_max", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.events_per_irq_max);
++      vnic->dbfs.fastpath_frm_trunc = debugfs_create_u64
++              ("fastpath_frm_trunc", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.fastpath_frm_trunc);
++      vnic->dbfs.fastpath_crc_bad = debugfs_create_u64
++              ("fastpath_crc_bad", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.fastpath_crc_bad);
++      vnic->dbfs.fastpath_csum_bad = debugfs_create_u64
++              ("fastpath_csum_bad", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.fastpath_csum_bad);
++      vnic->dbfs.fastpath_rights_bad = debugfs_create_u64
++              ("fastpath_rights_bad", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.fastpath_rights_bad);
++      vnic->dbfs.fastpath_discard_other = debugfs_create_u64
++              ("fastpath_discard_other", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.fastpath_discard_other);
++      vnic->dbfs.rx_no_desc_trunc = debugfs_create_u64
++              ("rx_no_desc_trunc", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.rx_no_desc_trunc);
++      vnic->dbfs.events_per_poll_max = debugfs_create_u32
++              ("events_per_poll_max", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.events_per_poll_max);
++      vnic->dbfs.events_per_poll_rx_max = debugfs_create_u32
++              ("events_per_poll_rx_max", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.events_per_poll_rx_max);
++      vnic->dbfs.events_per_poll_tx_max = debugfs_create_u32
++              ("events_per_poll_tx_max", S_IRUSR | S_IRGRP | S_IROTH,
++               vnic->dbfs_dir, &vnic->stats.events_per_poll_tx_max);
++#endif
++#endif
++      return 0;
++}
++
++
++/*
++ * Remove every debugfs file created by netfront_accel_debugfs_create() for
++ * this vnic, then the vnic's directory itself.  Nothing is removed when the
++ * directory was never created.  Always returns 0.
++ */
++int netfront_accel_debugfs_remove(netfront_accel_vnic *vnic)
++{
++#if defined(CONFIG_DEBUG_FS)
++      if (vnic->dbfs_dir == NULL)
++              return 0;
++
++      /* Counters backed by netdev_stats */
++      debugfs_remove(vnic->netdev_dbfs.fastpath_rx_pkts);
++      debugfs_remove(vnic->netdev_dbfs.fastpath_rx_bytes);
++      debugfs_remove(vnic->netdev_dbfs.fastpath_rx_errors);
++      debugfs_remove(vnic->netdev_dbfs.fastpath_tx_pkts);
++      debugfs_remove(vnic->netdev_dbfs.fastpath_tx_bytes);
++      debugfs_remove(vnic->netdev_dbfs.fastpath_tx_errors);
++
++#if NETFRONT_ACCEL_STATS
++      /* Counters backed by stats */
++      debugfs_remove(vnic->dbfs.irq_count);
++      debugfs_remove(vnic->dbfs.useless_irq_count);
++      debugfs_remove(vnic->dbfs.poll_schedule_count);
++      debugfs_remove(vnic->dbfs.poll_call_count);
++      debugfs_remove(vnic->dbfs.poll_reschedule_count);
++      debugfs_remove(vnic->dbfs.queue_stops);
++      debugfs_remove(vnic->dbfs.queue_wakes);
++      debugfs_remove(vnic->dbfs.ssr_bursts);
++      debugfs_remove(vnic->dbfs.ssr_drop_stream);
++      debugfs_remove(vnic->dbfs.ssr_misorder);
++      debugfs_remove(vnic->dbfs.ssr_slow_start);
++      debugfs_remove(vnic->dbfs.ssr_merges);
++      debugfs_remove(vnic->dbfs.ssr_too_many);
++      debugfs_remove(vnic->dbfs.ssr_new_stream);
++
++      debugfs_remove(vnic->dbfs.fastpath_tx_busy);
++      debugfs_remove(vnic->dbfs.fastpath_tx_completions);
++      debugfs_remove(vnic->dbfs.fastpath_tx_pending_max);
++      debugfs_remove(vnic->dbfs.event_count);
++      debugfs_remove(vnic->dbfs.bad_event_count);
++      debugfs_remove(vnic->dbfs.event_count_since_irq);
++      debugfs_remove(vnic->dbfs.events_per_irq_max);
++      debugfs_remove(vnic->dbfs.fastpath_frm_trunc);
++      debugfs_remove(vnic->dbfs.fastpath_crc_bad);
++      debugfs_remove(vnic->dbfs.fastpath_csum_bad);
++      debugfs_remove(vnic->dbfs.fastpath_rights_bad);
++      debugfs_remove(vnic->dbfs.fastpath_discard_other);
++      debugfs_remove(vnic->dbfs.rx_no_desc_trunc);
++      debugfs_remove(vnic->dbfs.events_per_poll_max);
++      debugfs_remove(vnic->dbfs.events_per_poll_rx_max);
++      debugfs_remove(vnic->dbfs.events_per_poll_tx_max);
++#endif
++      /* The directory goes last, after its files */
++      debugfs_remove(vnic->dbfs_dir);
++#endif
++      return 0;
++}
diff --cc drivers/xen/sfc_netfront/accel_msg.c

index 0000000,0000000..045af8b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_msg.c
@@@ -1,0 -1,0 +1,567 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <linux/stddef.h>
++#include <linux/errno.h>
++
++#include <xen/xenbus.h>
++
++#include "accel.h"
++#include "accel_msg_iface.h"
++#include "accel_util.h"
++#include "accel_bufs.h"
++
++#include "netfront.h" /* drivers/xen/netfront/netfront.h */
++
++/*
++ * Prime the vnic's interrupt, under irq_enabled_lock.
++ *
++ * If netfront_accel_vi_enable_interrupts() returns non-zero, interrupts
++ * are marked enabled and net interrupts are switched on.  If it returns
++ * zero, net interrupts are disabled, irq_enabled is cleared and a NAPI
++ * poll is scheduled instead.
++ */
++static void vnic_start_interrupts(netfront_accel_vnic *vnic)
++{
++      unsigned long irq_flags;
++
++      spin_lock_irqsave(&vnic->irq_enabled_lock, irq_flags);
++
++      /* Prime our interrupt */
++      if (netfront_accel_vi_enable_interrupts(vnic)) {
++              /*
++               * Nothing yet, make sure we get interrupts through
++               * back end
++               */
++              vnic->irq_enabled = 1;
++              netfront_accel_enable_net_interrupts(vnic);
++      } else {
++              struct netfront_info *np = netdev_priv(vnic->net_dev);
++
++              /* Cripes, that was quick, better pass it up */
++              netfront_accel_disable_net_interrupts(vnic);
++              vnic->irq_enabled = 0;
++              NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++);
++              napi_schedule(&np->napi);
++      }
++
++      spin_unlock_irqrestore(&vnic->irq_enabled_lock, irq_flags);
++}
++
++
++/*
++ * Disable net interrupts for the vnic and record that they are off,
++ * both under irq_enabled_lock.
++ */
++static void vnic_stop_interrupts(netfront_accel_vnic *vnic)
++{
++      unsigned long irq_flags;
++
++      spin_lock_irqsave(&vnic->irq_enabled_lock, irq_flags);
++
++      netfront_accel_disable_net_interrupts(vnic);
++      vnic->irq_enabled = 0;
++
++      spin_unlock_irqrestore(&vnic->irq_enabled_lock, irq_flags);
++}
++
++
++/*
++ * Switch the accelerated path on: enable transmit (under tx_lock), set
++ * poll_enabled while NAPI is disabled, then start interrupts.
++ */
++static void vnic_start_fastpath(netfront_accel_vnic *vnic)
++{
++      struct netfront_info *np = netdev_priv(vnic->net_dev);
++      unsigned long tx_flags;
++
++      DPRINTK("%s\n", __FUNCTION__);
++
++      spin_lock_irqsave(&vnic->tx_lock, tx_flags);
++      vnic->tx_enabled = 1;
++      spin_unlock_irqrestore(&vnic->tx_lock, tx_flags);
++
++      /* poll_enabled is only changed while NAPI polling is disabled */
++      napi_disable(&np->napi);
++      vnic->poll_enabled = 1;
++      napi_enable(&np->napi);
++
++      vnic_start_interrupts(vnic);
++}
++
++
++/*
++ * Switch the accelerated path off: stop interrupts, disable transmit and
++ * drop any pending tx_skb, then clear poll_enabled.
++ *
++ * Lock order used here: vnic->tx_lock first, then the netfront device's
++ * np->tx_lock nested inside it.
++ */
++void vnic_stop_fastpath(netfront_accel_vnic *vnic)
++{
++      struct net_device *net_dev = vnic->net_dev;
++      struct netfront_info *np = (struct netfront_info *)netdev_priv(net_dev);
++      unsigned long flags1, flags2;
++
++      DPRINTK("%s\n", __FUNCTION__);
++
++      vnic_stop_interrupts(vnic);
++      
++      spin_lock_irqsave(&vnic->tx_lock, flags1);
++      vnic->tx_enabled = 0;
++      spin_lock_irqsave(&np->tx_lock, flags2);
++      /* Free the skb held in vnic->tx_skb, if any, and wake the queue if
++       * netfront reports it ready */
++      if (vnic->tx_skb != NULL) {
++              dev_kfree_skb_any(vnic->tx_skb);
++              vnic->tx_skb = NULL;
++              if (netfront_check_queue_ready(net_dev)) {
++                      netif_wake_queue(net_dev);
++                      NETFRONT_ACCEL_STATS_OP
++                              (vnic->stats.queue_wakes++);
++              }
++      }
++      spin_unlock_irqrestore(&np->tx_lock, flags2);
++      spin_unlock_irqrestore(&vnic->tx_lock, flags1);
++      
++      /* Must prevent polls and hold lock to modify poll_enabled */
++      napi_disable(&np->napi);
++      spin_lock_irqsave(&vnic->irq_enabled_lock, flags1);
++      vnic->poll_enabled = 0;
++      spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags1);
++      napi_enable(&np->napi);
++}
++
++
++/*
++ * Record that the back end's net device has come up.  On the down->up
++ * transition, start the fast path if the front end is ready.
++ */
++static void netfront_accel_interface_up(netfront_accel_vnic *vnic)
++{
++      /* Already up: nothing changes */
++      if (vnic->backend_netdev_up)
++              return;
++
++      vnic->backend_netdev_up = 1;
++
++      if (vnic->frontend_ready)
++              vnic_start_fastpath(vnic);
++}
++
++
++/*
++ * Record that the back end's net device has gone down.  On the up->down
++ * transition, stop the fast path if the front end is ready.
++ */
++static void netfront_accel_interface_down(netfront_accel_vnic *vnic)
++{
++      /* Already down: nothing changes */
++      if (!vnic->backend_netdev_up)
++              return;
++
++      vnic->backend_netdev_up = 0;
++
++      if (vnic->frontend_ready)
++              vnic_stop_fastpath(vnic);
++}
++
++
++/*
++ * Handle a successful buffer-mapping reply from the back end.
++ *
++ * The reply's reqid is the page offset of the request.  Offsets below
++ * max_pages - max_pages / sfc_netfront_buffer_split go to the rx buffer
++ * manager, the rest to the tx buffer manager.  When the reply covers the
++ * final pages, the front end is marked ready and the fast path is started
++ * if the back end net device is up.
++ *
++ * Returns the result of netfront_accel_add_bufs().
++ */
++static int vnic_add_bufs(netfront_accel_vnic *vnic, 
++                       struct net_accel_msg *msg)
++{
++      struct netfront_accel_bufinfo *bufinfo;
++      int first_page, rc;
++
++      BUG_ON(msg->u.mapbufs.pages > NET_ACCEL_MSG_MAX_PAGE_REQ);
++
++      first_page = msg->u.mapbufs.reqid;
++
++      /* Pick the manager that owns this range of pages */
++      bufinfo = vnic->tx_bufs;
++      if (first_page < vnic->bufpages.max_pages - 
++          (vnic->bufpages.max_pages / sfc_netfront_buffer_split))
++              bufinfo = vnic->rx_bufs;
++
++      /* Queue up some Rx buffers to start things off. */
++      rc = netfront_accel_add_bufs(&vnic->bufpages, bufinfo, msg);
++      if (rc != 0)
++              return rc;
++
++      netfront_accel_vi_add_bufs(vnic, bufinfo == vnic->rx_bufs);
++
++      if (first_page + msg->u.mapbufs.pages != vnic->bufpages.max_pages) {
++              VPRINTK("%s: got buffers back %d %d\n", __FUNCTION__, 
++                      first_page, msg->u.mapbufs.pages);
++              return rc;
++      }
++
++      VPRINTK("%s: got all buffers back\n", __FUNCTION__);
++      vnic->frontend_ready = 1;
++      if (vnic->backend_netdev_up)
++              vnic_start_fastpath(vnic);
++
++      return rc;
++}
++
++
++/*
++ * Floor of log base 2: returns the largest o with (1ul << o) <= n.
++ * The caller must pass n > 0.
++ */
++
++inline unsigned log2_le(unsigned long n) {
++      unsigned result = 0;
++
++      /* Step up while the next power of two still fits within n */
++      while ((1ul << (result + 1)) <= n)
++              result++;
++
++      return result;
++}
++
++/*
++ * Send buffer-mapping requests to dom0 until all max_pages pages have been
++ * requested, or until a request cannot be built or sent.
++ *
++ * Each request covers the largest power-of-two number of pages that fits
++ * in what remains, capped at NET_ACCEL_MSG_MAX_PAGE_REQ.  page_reqs only
++ * advances for requests that were sent, so a later call resumes where this
++ * one stopped.  dom0 is notified once if at least one request was sent.
++ *
++ * Returns the result of the last map-request/send call made (0 if none
++ * was needed).
++ */
++static int vnic_send_buffer_requests(netfront_accel_vnic *vnic,
++                                   struct netfront_accel_bufpages *bufpages)
++{
++      struct net_accel_msg msg;
++      int rc = 0, sent = 0;
++
++      while (bufpages->page_reqs < bufpages->max_pages) {
++              int offset = bufpages->page_reqs;
++              int pages = pow2(log2_le(bufpages->max_pages - 
++                                       bufpages->page_reqs));
++
++              if (!(pages < NET_ACCEL_MSG_MAX_PAGE_REQ))
++                      pages = NET_ACCEL_MSG_MAX_PAGE_REQ;
++
++              BUG_ON(offset < 0);
++              BUG_ON(pages <= 0);
++
++              rc = netfront_accel_buf_map_request(vnic->dev, bufpages,
++                                                  &msg, pages, offset);
++              if (rc != 0) {
++                      EPRINTK("%s: problem with grant, stopping for now\n",
++                              __FUNCTION__);
++                      break;
++              }
++
++              rc = net_accel_msg_send(vnic->shared_page, 
++                                      &vnic->to_dom0, &msg);
++              if (rc < 0) {
++                      VPRINTK("%s: queue full, stopping for now\n",
++                              __FUNCTION__);
++                      break;
++              }
++
++              sent++;
++              bufpages->page_reqs += pages;
++      }
++
++      if (sent)
++              net_accel_msg_notify(vnic->msg_channel_irq);
++
++      return rc;
++}
++
++
++/*
++ * In response to dom0 saying "my queue is full", we reply with this
++ * when it is no longer full
++ *
++ * Sets the QUEUE0NOTFULL flag in the shared page and notifies dom0 over
++ * the message channel IRQ, but only when this call is the one that set
++ * the flag.  If the flag was already set, no notification is sent, which
++ * matches vnic_set_queue_full().
++ */
++inline void vnic_set_queue_not_full(netfront_accel_vnic *vnic)
++{
++
++      /* test_and_set_bit() returns the bit's previous value */
++      if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B,
++                           (unsigned long *)&vnic->shared_page->aflags))
++              notify_remote_via_irq(vnic->msg_channel_irq);
++      else
++              VPRINTK("queue not full bit already set, not signalling\n");
++}
++
++/*
++ * Tell dom0 that the queue we want to use is full; it is expected to
++ * answer by setting MSG_AFLAGS_QUEUEUNOTFULL in due course.
++ *
++ * The QUEUEUFULL flag is set in the shared page and dom0 is notified over
++ * the message channel IRQ only when this call is the one that set it.
++ */
++inline void vnic_set_queue_full(netfront_accel_vnic *vnic)
++{
++      /* Previous value of the flag: non-zero means it was already set */
++      int was_set = test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B,
++                           (unsigned long *)&vnic->shared_page->aflags);
++
++      if (was_set) {
++              VPRINTK("queue full bit already set, not signalling\n");
++      } else {
++              notify_remote_via_irq(vnic->msg_channel_irq);
++      }
++}
++
++
++/*
++ * Check the protocol version offered in a HELLO message.
++ *
++ * Returns 0 only when it equals NET_ACCEL_MSG_VERSION, -EPROTO otherwise.
++ */
++static int vnic_check_hello_version(unsigned version) 
++{
++      /*
++       * A newer protocol must be refused.  For an older one we are the
++       * newer side and have discretion to accept if we wish; for now
++       * however, just reject that too.
++       */
++      if (version != NET_ACCEL_MSG_VERSION)
++              return -EPROTO;
++
++      return 0;
++}
++
++
++/*
++ * Handle a HELLO from the back end: turn 'msg' into the reply in place
++ * and send it.
++ *
++ * With an acceptable version, the back end's net_dev_up state is copied
++ * from the shared page, the page count is settled (the smaller of
++ * sfc_netfront_max_pages and a non-zero max_pages from the back end) and
++ * buffer memory is allocated for the rx and tx managers; an allocation
++ * failure marks the reply as an error.  With an unacceptable version the
++ * reply is an error reply carrying our version.
++ *
++ * Returns the result of netfront_accel_alloc_buffer_mem(), or 0 when the
++ * version was rejected.
++ * NOTE(review): the rejected-version path returns 0, which the caller
++ * treats the same as success - confirm this is intended.
++ */
++static int vnic_process_hello_msg(netfront_accel_vnic *vnic,
++                                struct net_accel_msg *msg)
++{
++      unsigned pages = sfc_netfront_max_pages;
++      int err = 0;
++
++      if (vnic_check_hello_version(msg->u.hello.version) >= 0) {
++              vnic->backend_netdev_up
++                      = vnic->shared_page->net_dev_up;
++
++              msg->id = NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_REPLY;
++              msg->u.hello.version = NET_ACCEL_MSG_VERSION;
++
++              /* A non-zero, smaller limit from the back end wins */
++              if (msg->u.hello.max_pages &&
++                  msg->u.hello.max_pages < pages)
++                      pages = msg->u.hello.max_pages;
++              msg->u.hello.max_pages = pages;
++
++              /* Backing memory for both the rx and the tx manager */
++              err = netfront_accel_alloc_buffer_mem(&vnic->bufpages,
++                                                    vnic->rx_bufs, 
++                                                    vnic->tx_bufs,
++                                                    pages);
++              if (err)
++                      msg->id |= NET_ACCEL_MSG_ERROR;
++      } else {
++              msg->id = NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_REPLY 
++                      | NET_ACCEL_MSG_ERROR;
++              msg->u.hello.version = NET_ACCEL_MSG_VERSION;
++      }
++
++      /* Send reply */
++      net_accel_msg_reply_notify(vnic->shared_page, vnic->msg_channel_irq,
++                                 &vnic->to_dom0, msg);
++      return err;
++}
++
++
++/*
++ * Handle a LOCALMAC message.  For an "add", the MAC's entry is removed
++ * from the fastpath table under table_lock; anything else is ignored.
++ * Always returns 0.
++ */
++static int vnic_process_localmac_msg(netfront_accel_vnic *vnic,
++                                   struct net_accel_msg *msg)
++{
++      cuckoo_hash_mac_key key;
++      unsigned long lock_flags;
++
++      if (!(msg->u.localmac.flags & NET_ACCEL_MSG_ADD))
++              return 0;
++
++      DPRINTK("MAC has moved, could be local: %pM\n",
++              msg->u.localmac.mac);
++      key = cuckoo_mac_to_key(msg->u.localmac.mac);
++
++      spin_lock_irqsave(&vnic->table_lock, lock_flags);
++      /* Try to remove it, not a big deal if not there */
++      cuckoo_hash_remove(&vnic->fastpath_table, 
++                         (cuckoo_hash_key *)&key);
++      spin_unlock_irqrestore(&vnic->table_lock, lock_flags);
++
++      return 0;
++}
++
++
++/*
++ * Dispatch one message received from the back end.
++ *
++ * msg_state advances NONE -> HELLO -> HW as the HELLO and SETHW messages
++ * are processed successfully; each message type asserts (BUG_ON) the
++ * state it requires.
++ *
++ * Returns the handler's result, -EIO for a buffer-mapping error reply and
++ * -EPROTO for an unrecognised message id.
++ */
++static 
++int vnic_process_rx_msg(netfront_accel_vnic *vnic,
++                      struct net_accel_msg *msg)
++{
++      int err;
++
++      switch (msg->id) {
++      case NET_ACCEL_MSG_HELLO:
++              /* Hello, reply with Reply */
++              DPRINTK("got Hello, with version %.8x\n",
++                      msg->u.hello.version);
++              BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_NONE);
++              err = vnic_process_hello_msg(vnic, msg);
++              if (err == 0)
++                      vnic->msg_state = NETFRONT_ACCEL_MSG_HELLO;
++              return err;
++
++      case NET_ACCEL_MSG_SETHW:
++              /* Hardware info message */
++              DPRINTK("got H/W info\n");
++              BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HELLO);
++              err = netfront_accel_vi_init(vnic, &msg->u.hw);
++              if (err == 0)
++                      vnic->msg_state = NETFRONT_ACCEL_MSG_HW;
++              return err;
++
++      case NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY:
++              VPRINTK("Got mapped buffers back\n");
++              BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW);
++              return vnic_add_bufs(vnic, msg);
++
++      case NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_ERROR:
++              /* No buffers.  Can't use the fast path. */
++              EPRINTK("Got mapped buffers error.  Cannot accelerate.\n");
++              BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW);
++              return -EIO;
++
++      case NET_ACCEL_MSG_LOCALMAC:
++              /* Should be add, remove not currently used */
++              EPRINTK_ON(!(msg->u.localmac.flags & NET_ACCEL_MSG_ADD));
++              BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW);
++              return vnic_process_localmac_msg(vnic, msg);
++
++      default:
++              EPRINTK("Huh? Message code is 0x%x\n", msg->id);
++              return -EPROTO;
++      }
++}
++
++
++/*
++ * Process an IRQ received from back end driver.
++ *
++ * Work item handler: it is queued on netfront_accel_workqueue by
++ * netfront_accel_msg_channel_irq_from_bend() and runs with
++ * vnic->vnic_mutex held throughout.  It
++ *   1. acknowledges the dom0->domU flag bits in shared_page->aflags,
++ *      bringing the interface up or down on NETUPDOWN;
++ *   2. drains the dom0->domU message queue through vnic_process_rx_msg();
++ *   3. sends any outstanding buffer map requests once the handshake has
++ *      reached NETFRONT_ACCEL_MSG_HW;
++ *   4. on any error other than an empty queue (-ENOENT), logs it and
++ *      calls netfront_accel_set_closing().
++ *
++ * The prototype depends on the kernel version because the work handler
++ * argument changed type; on >= 2.6.20 the vnic is recovered from the
++ * embedded msg_from_bend member with container_of().
++ */
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++void netfront_accel_msg_from_bend(struct work_struct *context)
++#else
++void netfront_accel_msg_from_bend(void *context)
++#endif
++{
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++      netfront_accel_vnic *vnic = 
++              container_of(context, netfront_accel_vnic, msg_from_bend);
++#else
++      netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
++#endif
++      struct net_accel_msg msg;
++      int err, queue_was_full = 0;
++      
++      mutex_lock(&vnic->vnic_mutex);
++
++      /*
++       * This happens when the shared pages have been unmapped but
++       * the workqueue has yet to be flushed 
++       */
++      if (!vnic->dom0_state_is_setup) 
++              goto unlock_out;
++
++      while ((vnic->shared_page->aflags & NET_ACCEL_MSG_AFLAGS_TO_DOMU_MASK)
++             != 0) {
++              if (vnic->shared_page->aflags &
++                  NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL) {
++                      /* We've been told there may now be space. */
++                      clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B,
++                                (unsigned long *)&vnic->shared_page->aflags);
++              }
++
++              if (vnic->shared_page->aflags &
++                  NET_ACCEL_MSG_AFLAGS_QUEUE0FULL) {
++                      /*
++                       * There will be space at the end of this
++                       * function if we can make any.
++                       */
++                      clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B,
++                                (unsigned long *)&vnic->shared_page->aflags);
++                      queue_was_full = 1;
++              }
++
++              if (vnic->shared_page->aflags &
++                  NET_ACCEL_MSG_AFLAGS_NETUPDOWN) {
++                      DPRINTK("%s: net interface change\n", __FUNCTION__);
++                      clear_bit(NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B,
++                                (unsigned long *)&vnic->shared_page->aflags);
++                      if (vnic->shared_page->net_dev_up)
++                              netfront_accel_interface_up(vnic);
++                      else
++                              netfront_accel_interface_down(vnic);
++              }
++      }
++
++      /*
++       * Pull msg out of shared memory.  The loop ends with err != 0:
++       * either the receive itself failed (falls through below) or a
++       * handler failed (jumps straight to done).
++       */
++      while ((err = net_accel_msg_recv(vnic->shared_page, &vnic->from_dom0,
++                                       &msg)) == 0) {
++              err = vnic_process_rx_msg(vnic, &msg);
++              
++              if (err != 0)
++                      goto done;
++      }
++
++      /*
++       * Send any pending buffer map request messages that we can,
++       * and mark domU->dom0 as full if necessary.  
++       */
++      if (vnic->msg_state == NETFRONT_ACCEL_MSG_HW &&
++          vnic->bufpages.page_reqs < vnic->bufpages.max_pages) {
++              if (vnic_send_buffer_requests(vnic, &vnic->bufpages) == -ENOSPC)
++                      vnic_set_queue_full(vnic);
++      }
++
++      /* 
++       * If there are no messages then this is not an error.  It
++       * just means that we've finished processing the queue.
++       */
++      if (err == -ENOENT)
++              err = 0;
++ done:
++      /* We will now have made space in the dom0->domU queue if we can */
++      if (queue_was_full)
++              vnic_set_queue_not_full(vnic);
++
++      if (err != 0) {
++              EPRINTK("%s returned %d\n", __FUNCTION__, err);
++              netfront_accel_set_closing(vnic);
++      }
++
++ unlock_out:
++      mutex_unlock(&vnic->vnic_mutex);
++
++      return;
++}
++
++/*
++ * Interrupt handler for the message channel.  It does no processing
++ * itself: the vnic's msg_from_bend work item is queued on
++ * netfront_accel_workqueue and the interrupt is reported as handled.
++ * 'context' is the netfront_accel_vnic the IRQ was registered for.
++ */
++irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context)
++{
++      netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
++      VPRINTK("irq %d from device %s\n", irq, vnic->dev->nodename);
++
++      queue_work(netfront_accel_workqueue, &vnic->msg_from_bend);
++
++      return IRQ_HANDLED;
++}
++
++/*
++ * Process an interrupt received from the NIC via backend.
++ *
++ * Under irq_enabled_lock: if vnic->irq_enabled is set, net interrupts
++ * are disabled, the flag is cleared and netfront's NAPI context is
++ * scheduled so that polling takes over.  If the flag is already clear
++ * the interrupt is only counted as useless.  Always returns IRQ_HANDLED.
++ */
++irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context)
++{
++      netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
++      struct net_device *net_dev = vnic->net_dev;
++      unsigned long flags;
++
++      VPRINTK("net irq %d from device %s\n", irq, vnic->dev->nodename);
++      
++      NETFRONT_ACCEL_STATS_OP(vnic->stats.irq_count++);
++
++      BUG_ON(net_dev==NULL);
++
++      spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
++      if (vnic->irq_enabled) {
++              struct netfront_info *np = netdev_priv(net_dev);
++
++              netfront_accel_disable_net_interrupts(vnic);
++              vnic->irq_enabled = 0;
++              spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
++
++#if NETFRONT_ACCEL_STATS
++              vnic->stats.poll_schedule_count++;
++              if (vnic->stats.event_count_since_irq >
++                  vnic->stats.events_per_irq_max)
++                      vnic->stats.events_per_irq_max = 
++                              vnic->stats.event_count_since_irq;
++              vnic->stats.event_count_since_irq = 0;
++#endif
++              napi_schedule(&np->napi);
++      }
++      else {
++              spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
++              NETFRONT_ACCEL_STATS_OP(vnic->stats.useless_irq_count++);
++              DPRINTK("%s: irq when disabled\n", __FUNCTION__);
++      }
++      
++      return IRQ_HANDLED;
++}
++
++/*
++ * Send a NET_ACCEL_MSG_FASTPATH message to dom0 describing the flow
++ * (mac, ip, port, protocol).  The flags field is always set to
++ * NET_ACCEL_MSG_REMOVE.  ETH_ALEN bytes are copied from 'mac'.
++ *
++ * If no message slot can be obtained from net_accel_msg_start_send()
++ * the request is dropped silently; the caller gets no indication.
++ * NOTE(review): ip/port are stored as passed in; byte order is not
++ * visible from here -- confirm against callers.
++ */
++void netfront_accel_msg_tx_fastpath(netfront_accel_vnic *vnic, const void *mac,
++                                  u32 ip, u16 port, u8 protocol)
++{
++      unsigned long lock_state;
++      struct net_accel_msg *msg;
++
++      msg = net_accel_msg_start_send(vnic->shared_page, &vnic->to_dom0,
++                                     &lock_state);
++
++      if (msg == NULL)
++              return;
++
++      net_accel_msg_init(msg, NET_ACCEL_MSG_FASTPATH);
++      msg->u.fastpath.flags = NET_ACCEL_MSG_REMOVE;
++      memcpy(msg->u.fastpath.mac, mac, ETH_ALEN);
++
++      msg->u.fastpath.port = port;
++      msg->u.fastpath.ip = ip;
++      msg->u.fastpath.proto = protocol;
++
++      net_accel_msg_complete_send_notify(vnic->shared_page, &vnic->to_dom0, 
++                                         &lock_state, vnic->msg_channel_irq);
++}
diff --cc drivers/xen/sfc_netfront/accel_netfront.c

index 0000000,0000000..79c69e5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_netfront.c
@@@ -1,0 -1,0 +1,328 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <linux/skbuff.h>
++#include <linux/netdevice.h>
++
++/* drivers/xen/netfront/netfront.h */
++#include "netfront.h"
++
++#include "accel.h"
++#include "accel_bufs.h"
++#include "accel_util.h"
++#include "accel_msg_iface.h"
++#include "accel_ssr.h"
++ 
++#ifdef EFX_GCOV
++#include "gcov.h"
++#endif
++
++/*
++ * Fetch the accelerator's vnic from a net_device: the device's private
++ * area is a struct netfront_info whose accel_priv member holds the vnic.
++ * The expansion uses the macro argument, so it no longer depends on the
++ * caller happening to have a variable named net_dev in scope.
++ */
++#define NETFRONT_ACCEL_VNIC_FROM_NETDEV(_nd)                          \
++      ((netfront_accel_vnic *)((struct netfront_info *)netdev_priv(_nd))->accel_priv)
++/*
++ * start_xmit hook: try to send 'skb' through the accelerated path.
++ *
++ * Runs with vnic->tx_lock held.  Returns 0 when transmit is disabled on
++ * the vnic or netfront_accel_vi_tx_post() reports
++ * NETFRONT_ACCEL_STATUS_CANT, and 1 for every other status.
++ * NOTE(review): presumably 0 means "not taken, use the normal netfront
++ * path" -- confirm against the hook's caller in netfront.
++ *
++ * On NETFRONT_ACCEL_STATUS_BUSY the skb is parked in vnic->tx_skb and
++ * the net device queue is stopped; netfront's own tx_lock is taken
++ * inside vnic->tx_lock for that update.
++ */
++static int netfront_accel_netdev_start_xmit(struct sk_buff *skb,
++                                          struct net_device *net_dev)
++{
++      netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
++      struct netfront_info *np = 
++              (struct netfront_info *)netdev_priv(net_dev);
++      int handled, rc;
++      unsigned long flags1, flags2;
++
++      BUG_ON(vnic == NULL);
++
++      /* Take our tx lock and hold for the duration */
++      spin_lock_irqsave(&vnic->tx_lock, flags1);
++
++      if (!vnic->tx_enabled) {
++              rc = 0;
++              goto unlock_out;
++      }
++
++      handled = netfront_accel_vi_tx_post(vnic, skb);
++      if (handled == NETFRONT_ACCEL_STATUS_BUSY) {
++              BUG_ON(vnic->net_dev != net_dev);
++              DPRINTK("%s stopping queue\n", __FUNCTION__);
++
++              /* Need netfront's tx_lock and vnic tx_lock to write tx_skb */
++              spin_lock_irqsave(&np->tx_lock, flags2);
++              BUG_ON(vnic->tx_skb != NULL);
++              vnic->tx_skb = skb;
++              netif_stop_queue(net_dev);
++              spin_unlock_irqrestore(&np->tx_lock, flags2);
++
++              NETFRONT_ACCEL_STATS_OP(vnic->stats.queue_stops++);
++      }
++
++      if (handled == NETFRONT_ACCEL_STATUS_CANT)
++              rc = 0;
++      else
++              rc = 1;
++
++unlock_out:
++      spin_unlock_irqrestore(&vnic->tx_lock, flags1);
++
++      return rc;
++}
++
++/*
++ * netdev_poll hook: poll the accelerated interface for received packets.
++ *
++ * At most *budget packets are requested from netfront_accel_vi_poll();
++ * the number actually handled is subtracted from *budget and any skbs
++ * held by soft segment reassembly are flushed afterwards.
++ *
++ * Returns 0 when polling is disabled or fewer packets than the budget
++ * were handled (nothing more to do), and 1 when the whole budget was
++ * used and there may be more work.
++ */
++static int netfront_accel_netdev_poll(struct net_device *net_dev, int *budget)
++{
++      netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
++      int rx_allowed = *budget, rx_done;
++      
++      BUG_ON(vnic == NULL);
++
++      /* Can check this without lock as modifier excludes polls */ 
++      if (!vnic->poll_enabled)
++              return 0;
++
++      rx_done = netfront_accel_vi_poll(vnic, rx_allowed);
++      *budget -= rx_done;
++      
++      NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_call_count++);
++
++      VPRINTK("%s: done %d allowed %d\n",
++              __FUNCTION__, rx_done, rx_allowed);
++
++      netfront_accel_ssr_end_of_burst(vnic, &vnic->ssr_state);
++
++      if (rx_done < rx_allowed) {
++               return 0; /* Done */
++      }
++      
++      NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_reschedule_count++);
++
++      return 1; /* More to do. */
++}
++
++
++/*
++ * Process request from netfront to start napi interrupt
++ * mode. (i.e. enable interrupts as it's finished polling)
++ *
++ * Returns 1 if netfront_accel_vi_enable_interrupts() returned zero,
++ * i.e. there was still something to process and interrupts were not
++ * switched on.  Returns 0 if polling is disabled on this vnic, or once
++ * irq_enabled has been set and net interrupts re-enabled under
++ * irq_enabled_lock.
++ */
++static int netfront_accel_start_napi_interrupts(struct net_device *net_dev) 
++{
++      netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
++      unsigned long flags;
++
++      BUG_ON(vnic == NULL);
++      
++      /*
++       * Can check this without lock as writer excludes poll before
++       * modifying
++       */
++      if (!vnic->poll_enabled)
++              return 0;
++
++      if (!netfront_accel_vi_enable_interrupts(vnic)) {
++              /* 
++               * There was something there, tell caller we had
++               * something to do.
++               */
++              return 1;
++      }
++
++      spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
++      vnic->irq_enabled = 1;
++      netfront_accel_enable_net_interrupts(vnic);
++      spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
++
++      return 0;
++}
++
++
++/*
++ * Process request from netfront to stop napi interrupt
++ * mode (i.e. disable interrupts as it's starting to poll).
++ *
++ * Does nothing if polling is disabled on this vnic; otherwise net
++ * interrupts are disabled and irq_enabled is cleared, all under
++ * irq_enabled_lock.
++ */
++static void netfront_accel_stop_napi_interrupts(struct net_device *net_dev) 
++{
++      netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
++      unsigned long flags;
++
++      BUG_ON(vnic == NULL);
++
++      spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
++
++      if (!vnic->poll_enabled) {
++              spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
++              return;
++      }
++
++      netfront_accel_disable_net_interrupts(vnic);
++      vnic->irq_enabled = 0;
++      spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
++}
++
++/*
++ * check_ready hook: returns non-zero when no skb is parked in
++ * vnic->tx_skb, i.e. the accelerated transmit path is not holding a
++ * packet back.  No lock is taken here; per the comment below the caller
++ * is expected to hold netfront's tx_lock.
++ */
++static int netfront_accel_check_ready(struct net_device *net_dev)
++{
++      netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
++
++      BUG_ON(vnic == NULL);
++
++      /* Read of tx_skb is protected by netfront's tx_lock */ 
++      return vnic->tx_skb == NULL;
++}
++
++/*
++ * get_stats hook: fold the fast path counters into 'stats'.
++ *
++ * The current fastpath_* counters are snapshotted, the difference from
++ * the snapshot taken on the previous call (vnic->stats_last_read) is
++ * added to the matching rx/tx packet, byte and error fields of 'stats',
++ * and the new snapshot is stored for next time.  Always returns 0.
++ * NOTE(review): no lock is taken around the counter reads here --
++ * confirm whether the caller serialises this.
++ */
++static int netfront_accel_get_stats(struct net_device *net_dev,
++                                  struct net_device_stats *stats)
++{
++      netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
++      struct netfront_accel_netdev_stats now;
++
++      BUG_ON(vnic == NULL);
++
++      now.fastpath_rx_pkts   = vnic->netdev_stats.fastpath_rx_pkts;
++      now.fastpath_rx_bytes  = vnic->netdev_stats.fastpath_rx_bytes;
++      now.fastpath_rx_errors = vnic->netdev_stats.fastpath_rx_errors;
++      now.fastpath_tx_pkts   = vnic->netdev_stats.fastpath_tx_pkts;
++      now.fastpath_tx_bytes  = vnic->netdev_stats.fastpath_tx_bytes;
++      now.fastpath_tx_errors = vnic->netdev_stats.fastpath_tx_errors;
++      
++      stats->rx_packets += (now.fastpath_rx_pkts - 
++                            vnic->stats_last_read.fastpath_rx_pkts);
++      stats->rx_bytes   += (now.fastpath_rx_bytes -
++                            vnic->stats_last_read.fastpath_rx_bytes);
++      stats->rx_errors  += (now.fastpath_rx_errors - 
++                            vnic->stats_last_read.fastpath_rx_errors);
++      stats->tx_packets += (now.fastpath_tx_pkts - 
++                            vnic->stats_last_read.fastpath_tx_pkts);
++      stats->tx_bytes   += (now.fastpath_tx_bytes - 
++                            vnic->stats_last_read.fastpath_tx_bytes);
++      stats->tx_errors  += (now.fastpath_tx_errors - 
++                            vnic->stats_last_read.fastpath_tx_errors);
++      
++      vnic->stats_last_read = now;
++
++      return 0;
++}
++
++/*
++ * Table of entry points handed to netfront through
++ * netfront_accelerator_loaded() in netfront_accel_init().
++ */
++struct netfront_accel_hooks accel_hooks = {
++      .new_device         = &netfront_accel_probe,
++      .remove         = &netfront_accel_remove,
++      .netdev_poll       = &netfront_accel_netdev_poll,
++      .start_xmit         = &netfront_accel_netdev_start_xmit,
++      .start_napi_irq = &netfront_accel_start_napi_interrupts,
++      .stop_napi_irq   = &netfront_accel_stop_napi_interrupts,
++      .check_ready       = &netfront_accel_check_ready,
++      .get_stats           = &netfront_accel_get_stats
++};
++
++/* Module parameter "max_pages" (mode 0644): buffer pages to request. */
++unsigned sfc_netfront_max_pages = NETFRONT_ACCEL_DEFAULT_BUF_PAGES;
++module_param_named (max_pages, sfc_netfront_max_pages, uint, 0644);
++MODULE_PARM_DESC(max_pages, "Number of buffer pages to request");
++/* Module parameter "buffer_split" (mode 0644), default 2. */
++unsigned sfc_netfront_buffer_split = 2;
++module_param_named (buffer_split, sfc_netfront_buffer_split, uint, 0644);
++MODULE_PARM_DESC(buffer_split, 
++               "Fraction of buffers to use for TX, rest for RX");
++
++/* Name used for the workqueue and when (un)registering with netfront. */
++const char *frontend_name = "sfc_netfront";
++/* Created in netfront_accel_init(), destroyed on exit or init failure. */
++struct workqueue_struct *netfront_accel_workqueue;
++
++/*
++ * Module entry point.
++ *
++ * Creates the driver workqueue, sets up debugfs and registers the hook
++ * table with netfront.  In the initial Xen domain nothing is set up and
++ * 0 is returned.
++ *
++ * Returns 0 on success, -ENOMEM if the workqueue cannot be created, or
++ * -EINVAL if netfront rejects the accelerator (any non-zero result from
++ * netfront_accelerator_loaded()).
++ */
++static int __init netfront_accel_init(void)
++{
++      int rc;
++#ifdef EFX_GCOV       
++      gcov_provider_init(THIS_MODULE);
++#endif
++
++      /*
++       * If we're running on dom0, netfront hasn't initialised
++       * itself, so we need to keep away
++       */
++      if (is_initial_xendomain())
++              return 0;
++
++      if (!is_pow2(sizeof(struct net_accel_msg)))
++              EPRINTK("%s: bad structure size\n", __FUNCTION__);
++
++      /*
++       * The IRQ handler queues work on this queue and the teardown
++       * paths flush it, so a failed allocation must stop the load
++       * here rather than leave a NULL queue behind.
++       */
++      netfront_accel_workqueue = create_workqueue(frontend_name);
++      if (netfront_accel_workqueue == NULL) {
++              EPRINTK("%s: failed to create workqueue\n", __FUNCTION__);
++              rc = -ENOMEM;
++              goto fail_wq;
++      }
++
++      netfront_accel_debugfs_init();
++
++      rc = netfront_accelerator_loaded(NETFRONT_ACCEL_VERSION,
++                                       frontend_name, &accel_hooks);
++
++      /*
++       * Both a negative and a positive result are treated as a
++       * version mismatch.  In future may want to add backwards
++       * compatibility and accept certain subsets of previous
++       * versions (rc > 0).
++       */
++      if (rc != 0) {
++              EPRINTK("Xen netfront accelerator version mismatch\n");
++              rc = -EINVAL;
++              goto fail;
++      }
++
++      return 0;
++
++ fail:
++      netfront_accel_debugfs_fini();
++      flush_workqueue(netfront_accel_workqueue);
++      destroy_workqueue(netfront_accel_workqueue);
++ fail_wq:
++#ifdef EFX_GCOV
++      gcov_provider_fini(THIS_MODULE);
++#endif
++      return rc;
++}
++module_init(netfront_accel_init);
++
++/*
++ * Module unload: detach from netfront and release the module-wide
++ * resources.  Nothing is done when running in the initial Xen domain.
++ */
++static void __exit netfront_accel_exit(void)
++{
++      if (!is_initial_xendomain()) {
++              DPRINTK("%s: unhooking\n", __FUNCTION__);
++
++              /* Detach our hooks from the regular netfront driver */
++              netfront_accelerator_stop(frontend_name);
++
++              DPRINTK("%s: done\n", __FUNCTION__);
++
++              netfront_accel_debugfs_fini();
++
++              /* Drain outstanding work before destroying the queue */
++              flush_workqueue(netfront_accel_workqueue);
++              destroy_workqueue(netfront_accel_workqueue);
++
++#ifdef EFX_GCOV
++              gcov_provider_fini(THIS_MODULE);
++#endif
++      }
++}
++module_exit(netfront_accel_exit);
++
++MODULE_LICENSE("GPL");
++
diff --cc drivers/xen/sfc_netfront/accel_ssr.c

index 0000000,0000000..9c44144

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_ssr.c
@@@ -1,0 -1,0 +1,308 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <linux/socket.h>
++#include <linux/in.h>
++#include <linux/ip.h>
++#include <linux/tcp.h>
++#include <linux/list.h>
++#include <net/ip.h>
++#include <net/checksum.h>
++
++#include "accel.h"
++#include "accel_util.h"
++#include "accel_bufs.h"
++
++#include "accel_ssr.h"
++
++/*
++ * Non-zero if the list head has a non-NULL next pointer, which is how
++ * this file tells an initialised list from one that never was.
++ */
++static inline int list_valid(struct list_head *lh)
++{
++      int has_next = (lh->next != NULL);
++
++      return has_next;
++}
++
++static void netfront_accel_ssr_deliver (struct netfront_accel_vnic *vnic,
++                                      struct netfront_accel_ssr_state *st,
++                                      struct netfront_accel_ssr_conn *c);
++
++/** Construct a netfront_accel_ssr_state.
++ *
++ * @v st     The SSR state to initialise
++ *
++ * Both lists are initialised empty and up to 8 connection trackers are
++ * allocated onto the free list.  Allocation stops silently at the first
++ * kmalloc() failure, so the free list may hold fewer than 8 entries, or
++ * none at all.
++ */
++void netfront_accel_ssr_init(struct netfront_accel_ssr_state *st) {
++      unsigned i;
++
++      INIT_LIST_HEAD(&st->conns);
++      INIT_LIST_HEAD(&st->free_conns);
++      for (i = 0; i < 8; ++i) {
++              struct netfront_accel_ssr_conn *c = 
++                      kmalloc(sizeof(*c), GFP_KERNEL);
++              if (c == NULL)  break;
++              c->n_in_order_pkts = 0;
++              c->skb = NULL;
++              list_add(&c->link, &st->free_conns);
++      }
++
++}
++
++
++/** Destructor for a netfront_accel_ssr_state.
++ *
++ * @v vnic   The vnic, used when delivering skbs that are still held
++ * @v st     The SSR state to tear down
++ *
++ * Frees every tracker on both lists.  A tracker on the active list that
++ * still holds an skb has it delivered to the stack first; a tracker on
++ * the free list must not hold one.
++ */
++void netfront_accel_ssr_fini(netfront_accel_vnic *vnic, 
++                           struct netfront_accel_ssr_state *st) {
++      struct netfront_accel_ssr_conn *c;
++
++      /*
++       * Return cleanly if netfront_accel_ssr_init() not previously
++       * called.  NOTE(review): this relies on an uninitialised state
++       * having NULL list pointers, i.e. zeroed memory -- confirm.
++       */
++      BUG_ON(list_valid(&st->conns) != list_valid(&st->free_conns));
++      if (! list_valid(&st->conns))
++              return;
++
++      while ( ! list_empty(&st->free_conns)) {
++              c = list_entry(st->free_conns.prev, 
++                             struct netfront_accel_ssr_conn, link);
++              list_del(&c->link);
++              BUG_ON(c->skb != NULL);
++              kfree(c);
++      }
++      while ( ! list_empty(&st->conns)) {
++              c = list_entry(st->conns.prev, 
++                             struct netfront_accel_ssr_conn, link);
++              list_del(&c->link);
++              if (c->skb)
++                      netfront_accel_ssr_deliver(vnic, st, c);
++              kfree(c);
++      }
++}
++
++
++/** Calc IP checksum and deliver to the OS
++ *
++ * @v vnic   The vnic (only used for statistics)
++ * @v st     The SSR state (per channel per port)
++ * @v c            The SSR connection state; c->skb must be non-NULL
++ *
++ * On return the skb has been passed to netif_receive_skb() and c->skb
++ * is NULL again.
++ */
++static void netfront_accel_ssr_deliver(netfront_accel_vnic *vnic,
++                                     struct netfront_accel_ssr_state *st,
++                                     struct netfront_accel_ssr_conn *c) {
++      BUG_ON(c->skb == NULL);
++
++      /*
++       * If we've chained packets together, recalculate the IP
++       * checksum.
++       */
++      if (skb_shinfo(c->skb)->frag_list) {
++              NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_bursts);
++              c->iph->check = 0;
++              c->iph->check = ip_fast_csum((unsigned char *) c->iph, 
++                                           c->iph->ihl);
++      }
++
++      VPRINTK("%s: %d\n", __FUNCTION__, c->skb->len);
++
++      netif_receive_skb(c->skb); 
++      c->skb = NULL;
++}
++
++
++/** Push held skbs down into network stack.
++ *
++ * @v vnic     The vnic (used for delivery and statistics)
++ * @v st       SSR state
++ *
++ * Only called if we are tracking one or more connections.
++ *
++ * After delivering every held skb, only the entry at the tail of the
++ * active list is checked for idleness, so at most one connection is
++ * retired per call.
++ */
++void __netfront_accel_ssr_end_of_burst(netfront_accel_vnic *vnic, 
++                                     struct netfront_accel_ssr_state *st) {
++      struct netfront_accel_ssr_conn *c;
++
++      BUG_ON(list_empty(&st->conns));
++
++      list_for_each_entry(c, &st->conns, link)
++              if (c->skb)
++                      netfront_accel_ssr_deliver(vnic, st, c);
++
++      /*
++       * Time-out connections that have received no traffic for 20ms
++       * (HZ / 50 jiffies, plus one).  The unsigned subtraction keeps
++       * the comparison valid across a jiffies wrap.
++       */
++      c = list_entry(st->conns.prev, struct netfront_accel_ssr_conn,
++                     link);
++      if (jiffies - c->last_pkt_jiffies > (HZ / 50 + 1)) {
++              NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_drop_stream);
++              list_del(&c->link);
++              list_add(&c->link, &st->free_conns);
++      }
++}
++
++
++/** Process SKB and decide whether to dispatch it to the stack now or
++ * later.
++ *
++ * @v vnic     The vnic (used for delivery and statistics)
++ * @v st       SSR state
++ * @v skb      SKB to examine; skb->data must point at the IP header
++ * @ret rc     0 => deliver SKB to kernel now, otherwise the SKB belongs
++ *             to us.
++ *
++ * Only unfragmented TCP/IPv4 segments with CHECKSUM_UNNECESSARY are
++ * considered.  A segment on a tracked connection is merged onto the
++ * held skb's frag_list once 300 in-order segments have been seen on
++ * that connection; everything else is handed back to the caller.
++ */
++int netfront_accel_ssr_skb(struct netfront_accel_vnic *vnic,
++                         struct netfront_accel_ssr_state *st,
++                         struct sk_buff *skb) {
++      int data_length, dont_merge;
++      struct netfront_accel_ssr_conn *c;
++      struct iphdr *iph;
++      struct tcphdr *th;
++      unsigned th_seq;
++
++      BUG_ON(skb_shinfo(skb)->frag_list != NULL);
++      BUG_ON(skb->next != NULL);
++
++      /* We're not interested if it isn't TCP over IPv4. */
++      iph = (struct iphdr *) skb->data;
++      if (skb->protocol != htons(ETH_P_IP) ||
++          iph->protocol != IPPROTO_TCP) {
++              return 0;
++      }
++
++      /* Ignore segments that fail csum or are fragmented. */
++      if (unlikely((skb->ip_summed - CHECKSUM_UNNECESSARY) |
++                   (iph->frag_off & htons(IP_MF | IP_OFFSET)))) {
++              return 0;
++      }
++
++      th = (struct tcphdr*)(skb->data + iph->ihl * 4);
++      data_length = ntohs(iph->tot_len) - iph->ihl * 4 - th->doff * 4;
++      th_seq = ntohl(th->seq);
++      /* Pure ACKs and URG/SYN/RST segments are never merged. */
++      dont_merge = (data_length == 0) | th->urg | th->syn | th->rst;
++
++      list_for_each_entry(c, &st->conns, link) {
++              /* Any non-zero difference means a different 4-tuple. */
++              if ((c->saddr  - iph->saddr) |
++                  (c->daddr  - iph->daddr) |
++                  (c->source - th->source) |
++                  (c->dest   - th->dest  ))
++                      continue;
++
++              /* Re-insert at head of list to reduce lookup time. */
++              list_del(&c->link);
++              list_add(&c->link, &st->conns);
++              c->last_pkt_jiffies = jiffies;
++
++              if (unlikely(th_seq - c->next_seq)) {
++                      /* Out-of-order, so start counting again. */
++                      if (c->skb)
++                              netfront_accel_ssr_deliver(vnic, st, c);
++                      c->n_in_order_pkts = 0;
++                      c->next_seq = th_seq + data_length;
++                      NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_misorder);
++                      return 0;
++              }
++              c->next_seq = th_seq + data_length;
++
++              if (++c->n_in_order_pkts < 300) {
++                      /* May be in slow-start, so don't merge. */
++                      NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_slow_start);
++                      return 0;
++              }
++
++              if (unlikely(dont_merge)) {
++                      if (c->skb)
++                              netfront_accel_ssr_deliver(vnic, st, c);
++                      return 0;
++              }
++
++              if (c->skb) {
++                      /* Grow the held packet's IP length in place. */
++                      c->iph->tot_len = ntohs(c->iph->tot_len);
++                      c->iph->tot_len += data_length;
++                      c->iph->tot_len = htons(c->iph->tot_len);
++                      c->th->ack_seq = th->ack_seq;
++                      c->th->fin |= th->fin;
++                      c->th->psh |= th->psh;
++                      c->th->window = th->window;
++
++                      /* Remove the headers from this skb. */
++                      skb_pull(skb, skb->len - data_length);
++
++                      /*
++                       * Tack the new skb onto the head skb's frag_list.
++                       * This is exactly the format that fragmented IP
++                       * datagrams are reassembled into.
++                       */
++                      BUG_ON(skb->next != 0);
++                      if ( ! skb_shinfo(c->skb)->frag_list)
++                              skb_shinfo(c->skb)->frag_list = skb;
++                      else
++                              c->skb_tail->next = skb;
++                      c->skb_tail = skb;
++                      c->skb->len += skb->len;
++                      c->skb->data_len += skb->len;
++                      c->skb->truesize += skb->truesize;
++
++                      NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_merges);
++
++                      /*
++                       * If the next packet might push this super-packet
++                       * over the limit for an IP packet, deliver it now.
++                       * This is slightly conservative, but close enough.
++                       */
++                      if (c->skb->len + 
++                          (PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE)
++                          > 16384)
++                              netfront_accel_ssr_deliver(vnic, st, c);
++
++                      return 1;
++              }
++              else {
++                      /* First segment held for this connection. */
++                      c->iph = iph;
++                      c->th = th;
++                      c->skb = skb;
++                      return 1;
++              }
++      }
++
++      /* We're not yet tracking this connection. */
++
++      if (dont_merge) {
++              return 0;
++      }
++
++      if (list_empty(&st->free_conns)) {
++              /*
++               * Both lists are empty if every allocation in
++               * netfront_accel_ssr_init() failed.  There is then no
++               * tracker to recycle, and st->conns.prev would be the
++               * list head itself rather than a connection.
++               */
++              if (list_empty(&st->conns))
++                      return 0;
++
++              /* Recycle the least recently used tracker if it is idle. */
++              c = list_entry(st->conns.prev, 
++                             struct netfront_accel_ssr_conn,
++                             link);
++              if (c->skb) {
++                      NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_too_many);
++                      return 0;
++              }
++      }
++      else {
++              c = list_entry(st->free_conns.next,
++                             struct netfront_accel_ssr_conn,
++                             link);
++      }
++      list_del(&c->link);
++      list_add(&c->link, &st->conns);
++      c->saddr = iph->saddr;
++      c->daddr = iph->daddr;
++      c->source = th->source;
++      c->dest = th->dest;
++      c->next_seq = th_seq + data_length;
++      c->n_in_order_pkts = 0;
++      BUG_ON(c->skb != NULL);
++      NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_new_stream);
++      return 0;
++}
diff --cc drivers/xen/sfc_netfront/accel_ssr.h

index 0000000,0000000..1d10f46

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_ssr.h
@@@ -1,0 -1,0 +1,88 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#ifndef NETFRONT_ACCEL_SSR_H
++#define NETFRONT_ACCEL_SSR_H
++
++#include <linux/skbuff.h>
++#include <linux/ip.h>
++#include <linux/tcp.h>
++#include <linux/list.h>
++
++#include "accel.h"
++
++/** State for Soft Segment Reassembly (SSR).
++ *
++ * One of these tracks a single TCP connection.  Each lives on either
++ * the active list or the free list of a netfront_accel_ssr_state.
++ */
++
++struct netfront_accel_ssr_conn {
++      /** Linkage on the owning state's conns or free_conns list. */
++      struct list_head link;
++
++      /** Connection 4-tuple, stored as copied from the IP/TCP headers. */
++      unsigned saddr, daddr;
++      unsigned short source, dest;
++
++      /** Number of in-order packets we've seen with payload. */
++      unsigned n_in_order_pkts;
++
++      /** Next in-order sequence number. */
++      unsigned next_seq;
++
++      /** Time we last saw a packet on this connection. */
++      unsigned long last_pkt_jiffies;
++
++      /** The SKB we are currently holding.  If NULL, then all following
++       * fields are undefined.
++       */
++      struct sk_buff *skb;
++
++      /** The tail of the frag_list of SKBs we're holding.  Only valid
++       * after at least one merge.
++       */
++      struct sk_buff *skb_tail;
++
++      /** The IP header of the skb we are holding. */
++      struct iphdr *iph;
++      
++      /** The TCP header of the skb we are holding. */
++      struct tcphdr *th;
++};
++
++extern void netfront_accel_ssr_init(struct netfront_accel_ssr_state *st);
++extern void netfront_accel_ssr_fini(netfront_accel_vnic *vnic,
++                                  struct netfront_accel_ssr_state *st);
++
++extern void
++__netfront_accel_ssr_end_of_burst(netfront_accel_vnic *vnic,
++                                struct netfront_accel_ssr_state *st);
++
++extern int  netfront_accel_ssr_skb(netfront_accel_vnic *vnic,
++                                 struct netfront_accel_ssr_state *st,
++                                 struct sk_buff *skb);
++
++/*
++ * End-of-burst hook: flush held skbs, but only when at least one
++ * connection is being tracked; otherwise there is nothing to do.
++ */
++static inline void
++netfront_accel_ssr_end_of_burst (netfront_accel_vnic *vnic,
++                               struct netfront_accel_ssr_state *st) {
++      if (list_empty(&st->conns))
++              return;
++
++      __netfront_accel_ssr_end_of_burst(vnic, st);
++}
++
++#endif /* NETFRONT_ACCEL_SSR_H */
diff --cc drivers/xen/sfc_netfront/accel_tso.c

index 0000000,0000000..004980f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_tso.c
@@@ -1,0 -1,0 +1,509 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <linux/pci.h>
++#include <linux/tcp.h>
++#include <linux/ip.h>
++#include <linux/in.h>
++#include <linux/if_ether.h>
++
++#include "accel.h"
++#include "accel_util.h"
++
++#include "accel_tso.h"
++
++#define ETH_HDR_LEN(skb)  skb_network_offset(skb)
++#define SKB_TCP_OFF(skb)  skb_transport_offset(skb)
++#define SKB_IP_OFF(skb)   skb_network_offset(skb)
++
++/*
++ * Set a maximum number of buffers in each output packet to make life
++ * a little simpler - if this is reached it will just move on to
++ * another packet 
++ */
++#define ACCEL_TSO_MAX_BUFFERS (6)
++
++/** TSO State.
++ *
++ * Working state for one segmentation pass.  It is gathered into a single
++ * structure so that it can be handed to the inline helper functions.
++ */
++struct netfront_accel_tso_state {
++      /** Payload bytes that still have to be segmented */
++      unsigned int remaining_len;
++
++      /** Sequence number to use for the next output packet */
++      unsigned int seqnum;
++
++      /** Payload space left in the packet being filled */
++      unsigned int packet_space;
++
++      /** Output packets built so far; each entry carries the buffers
++       *  that make up that packet.  The newest packet is at the head.
++       */
++      struct netfront_accel_tso_output_packet *output_packets;
++
++      /** Number of buffers across all of output_packets */
++      unsigned int buffers;
++
++      /** Number of packets in output_packets */
++      unsigned int packets;
++
++      /** Input Fragment Cursor.
++       *
++       * Position within the fragment of the incoming SKB that is
++       * currently being consumed.  Both members are advanced in place
++       * when a fragment is split across several output packets.
++       */
++      struct {
++              /** address of the next byte to copy */
++              void *addr;
++              /** bytes left in this fragment */
++              unsigned int len;
++      } ifc; /*  == ifc Input Fragment Cursor */
++
++      /** Parameters.
++       *
++       * Filled in once when the TSO send starts.  NOTE(review): ip_id
++       * is advanced for every output packet, the other members are
++       * left alone once set.
++       */
++      struct {
++              /* total bytes of ethernet/IP/TCP header */
++              unsigned int header_length;
++
++              /* header_length plus gso_size: size of a full segment */
++              int full_packet_size;
++
++              /* IP ID for the next output packet, host endian */
++              unsigned int ip_id;
++
++              /* Largest payload carried by one output packet */
++              int gso_size;
++      } p;
++};
++
++
++/**
++ * Verify that our various assumptions about sk_buffs and the conditions
++ * under which TSO will be attempted hold true.
++ *
++ * Nothing is returned and the skb is not modified: each check that
++ * fails is only reported through EPRINTK_ON().  The checks are that the
++ * frame is IPv4 (both skb->protocol and the ethernet header agree),
++ * that the IP payload is TCP, and that the complete TCP header lies
++ * within the linear (head) area of the skb.
++ *
++ * @v skb            The sk_buff to check.
++ */
++static inline void tso_check_safe(struct sk_buff *skb) {
++      EPRINTK_ON(skb->protocol != htons (ETH_P_IP));
++      EPRINTK_ON(((struct ethhdr*) skb->data)->h_proto != htons (ETH_P_IP));
++      EPRINTK_ON(ip_hdr(skb)->protocol != IPPROTO_TCP);
++      EPRINTK_ON((SKB_TCP_OFF(skb) + tcp_hdrlen(skb)) > skb_headlen(skb));
++}
++
++
++
++/**
++ * Parse the SKB header and initialise state.
++ *
++ * Records the combined header length, the per-segment sizes taken from
++ * gso_size, the starting IP ID and TCP sequence number (both converted
++ * to host order), and the number of payload bytes to segment.  The
++ * output packet list and its counters start out empty.  URG, SYN and
++ * RST on the input are reported but not otherwise handled.
++ *
++ * @v st              TSO state to initialise
++ * @v skb            Socket buffer that is about to be segmented
++ */
++static inline void tso_start(struct netfront_accel_tso_state *st, 
++                           struct sk_buff *skb) {
++
++      /*
++       * All ethernet/IP/TCP headers combined size is TCP header size
++       * plus offset of TCP header relative to start of packet.
++       */
++      st->p.header_length = tcp_hdrlen(skb) + SKB_TCP_OFF(skb);
++      st->p.full_packet_size = (st->p.header_length
++                                + skb_shinfo(skb)->gso_size);
++      st->p.gso_size = skb_shinfo(skb)->gso_size;
++
++      /*
++       * The header fields are in network order and the state keeps
++       * them in host order, so the network-to-host helpers are the
++       * right direction for both conversions.
++       */
++      st->p.ip_id = ntohs(ip_hdr(skb)->id);
++      st->seqnum = ntohl(tcp_hdr(skb)->seq);
++
++      EPRINTK_ON(tcp_hdr(skb)->urg);
++      EPRINTK_ON(tcp_hdr(skb)->syn);
++      EPRINTK_ON(tcp_hdr(skb)->rst);
++
++      st->remaining_len = skb->len - st->p.header_length;
++
++      st->output_packets = NULL;
++      st->buffers = 0;
++      st->packets = 0;
++
++      /* header_length and skb->len are unsigned, hence %u */
++      VPRINTK("Starting new TSO: hl %u ps %d gso %d seq %x len %u\n",
++              st->p.header_length, st->p.full_packet_size, st->p.gso_size,
++              st->seqnum, skb->len);
++}
++
++/**
++ * Take another NIC mapped buffer and push it onto an output packet.
++ *
++ * When @first is non-zero the buffer also opens a new output packet,
++ * which becomes the head of st->output_packets; otherwise the buffer is
++ * pushed onto the packet currently at the head.  The tracking
++ * structures are placed inside the buffer itself, after the data area.
++ *
++ * Returns 0 on success, or -1 if no TX buffer was available.
++ */ 
++static inline int tso_start_new_buffer(netfront_accel_vnic *vnic,
++                                     struct netfront_accel_tso_state *st,
++                                     int first)
++{
++      struct netfront_accel_tso_output_packet *pkt;
++      struct netfront_accel_tso_buffer *meta;
++      struct netfront_accel_pkt_desc *desc;
++
++      /* Get a mapped packet buffer */
++      desc = netfront_accel_buf_get(vnic->tx_bufs);
++      if (desc == NULL) {
++              DPRINTK("%s: No buffer for TX\n", __FUNCTION__);
++              return -1;
++      }
++
++      /* Per-buffer meta-data sits behind the output packet slot */
++      meta = (struct netfront_accel_tso_buffer *)
++              (desc->pkt_kva + NETFRONT_ACCEL_TSO_BUF_LENGTH
++               + sizeof(struct netfront_accel_tso_output_packet));
++      meta->buf = desc;
++      meta->length = 0;
++
++      if (first) {
++              /* Open a new packet and make it the head of the list */
++              pkt = (struct netfront_accel_tso_output_packet *)
++                      (desc->pkt_kva + NETFRONT_ACCEL_TSO_BUF_LENGTH);
++              pkt->next = st->output_packets;
++              st->output_packets = pkt;
++              meta->next = NULL;
++              pkt->tso_bufs = meta;
++              pkt->tso_bufs_len = 1;
++      } else {
++              /* Push onto the buffer stack of the current packet */
++              pkt = st->output_packets;
++              meta->next = pkt->tso_bufs;
++              pkt->tso_bufs = meta;
++              pkt->tso_bufs_len++;
++      }
++
++      BUG_ON(pkt->tso_bufs_len > ACCEL_TSO_MAX_BUFFERS);
++
++      st->buffers++;
++
++      /*
++       * No skb context yet: only the last buffer of the send gets a
++       * non-NULL skb, and that is filled in later.
++       */
++      desc->skb = NULL;
++
++      return 0;
++}
++
++
++/* Generate a new header, and prepare for the new packet.
++ *
++ * Takes a fresh buffer for the packet, copies the original headers
++ * into it and then patches the TCP sequence number and FIN/PSH flags,
++ * the IP total length and the IP ID for this segment.
++ *
++ * A failure is not counted in fastpath_tx_busy here: every caller jumps
++ * to its unwind path on failure, and that path already does the
++ * accounting.  Counting here as well would record the event twice.
++ *
++ * @v vnic          VNIC
++ * @v skb            Socket buffer
++ * @v st              TSO state
++ * @ret rc          0 on success, or -1 if failed to alloc header
++ */
++
++static inline 
++int tso_start_new_packet(netfront_accel_vnic *vnic,
++                       struct sk_buff *skb,
++                       struct netfront_accel_tso_state *st) 
++{
++      struct netfront_accel_tso_buffer *tso_buf;
++      struct iphdr *tsoh_iph;
++      struct tcphdr *tsoh_th;
++      unsigned ip_length;
++
++      if (tso_start_new_buffer(vnic, st, 1) < 0)
++              return -1;
++
++      /* This has been set up by tso_start_new_buffer() */
++      tso_buf = st->output_packets->tso_bufs;
++
++      /* Copy in the header */
++      memcpy(tso_buf->buf->pkt_kva, skb->data, st->p.header_length);
++      tso_buf->length = st->p.header_length;
++
++      tsoh_th = (struct tcphdr*) 
++              (tso_buf->buf->pkt_kva + SKB_TCP_OFF(skb));
++      tsoh_iph = (struct iphdr*) 
++              (tso_buf->buf->pkt_kva + SKB_IP_OFF(skb));
++
++      /* Set to zero to encourage falcon to fill these in */
++      tsoh_th->check  = 0;
++      tsoh_iph->check = 0;
++
++      tsoh_th->seq = htonl(st->seqnum);
++      st->seqnum += st->p.gso_size;
++
++      if (st->remaining_len > st->p.gso_size) {
++              /* This packet will not finish the TSO burst. */
++              ip_length = st->p.full_packet_size - ETH_HDR_LEN(skb);
++              tsoh_th->fin = 0;
++              tsoh_th->psh = 0;
++      } else {
++              /* This packet will be the last in the TSO burst. */
++              ip_length = (st->p.header_length - ETH_HDR_LEN(skb)
++                           + st->remaining_len);
++              tsoh_th->fin = tcp_hdr(skb)->fin;
++              tsoh_th->psh = tcp_hdr(skb)->psh;
++      }
++
++      tsoh_iph->tot_len = htons(ip_length);
++
++      /* Linux leaves suitable gaps in the IP ID space for us to fill. */
++      tsoh_iph->id = htons(st->p.ip_id);
++      st->p.ip_id++;
++
++      /* A new packet starts with room for a full payload */
++      st->packet_space = st->p.gso_size; 
++
++      st->packets++;
++
++      return 0;
++}
++
++
++
++/* Point the input fragment cursor at @len bytes starting at @addr. */
++static inline void tso_get_fragment(struct netfront_accel_tso_state *st, 
++                                  int len, void *addr)
++{
++      st->ifc.addr = addr;
++      st->ifc.len = len;
++}
++
++
++/*
++ * Give back every buffer taken so far for this TSO send.  Walks the
++ * output packets and, for each, pops its buffers and returns them to
++ * vnic->tx_bufs.  st->buffers must be back at zero afterwards.
++ */
++static inline void tso_unwind(netfront_accel_vnic *vnic, 
++                            struct netfront_accel_tso_state *st)
++{
++      struct netfront_accel_tso_output_packet *pkt;
++      struct netfront_accel_tso_buffer *cur;
++
++      DPRINTK("%s\n", __FUNCTION__);
++
++      while ((pkt = st->output_packets) != NULL) {
++              st->output_packets = pkt->next;
++
++              while ((cur = pkt->tso_bufs) != NULL) {
++                      /* Unlink and account before the buffer goes back */
++                      pkt->tso_bufs = cur->next;
++
++                      st->buffers--;
++                      pkt->tso_bufs_len--;
++
++                      netfront_accel_buf_put(vnic->tx_bufs,
++                                             cur->buf->buf_id);
++              }
++      }
++
++      BUG_ON(st->buffers != 0);
++}
++
++
++
++/*
++ * Copy as much of the current input fragment as fits into the newest
++ * buffer of the newest output packet.  The amount copied is limited by
++ * what is left of the fragment, the payload space left in the packet
++ * and the room left in the buffer.  Does nothing if any of the three
++ * is exhausted.
++ */
++static inline
++void tso_fill_packet_with_fragment(netfront_accel_vnic *vnic,
++                                 struct netfront_accel_tso_state *st) 
++{
++      struct netfront_accel_tso_buffer *cur;
++      int chunk, room;
++
++      BUG_ON(st->output_packets == NULL);
++      BUG_ON(st->output_packets->tso_bufs == NULL);
++
++      cur = st->output_packets->tso_bufs;
++
++      if (st->ifc.len == 0 ||
++          st->packet_space == 0 ||
++          cur->length == NETFRONT_ACCEL_TSO_BUF_LENGTH)
++              return;
++
++      chunk = min(st->ifc.len, st->packet_space);
++
++      room = NETFRONT_ACCEL_TSO_BUF_LENGTH - cur->length;
++      chunk = min(chunk, room);
++
++      /* Account for the bytes, then move them */
++      st->packet_space -= chunk;
++      st->remaining_len -= chunk;
++      st->ifc.len -= chunk;
++
++      memcpy(cur->buf->pkt_kva + cur->length, st->ifc.addr, chunk);
++
++      cur->length += chunk;
++
++      BUG_ON(cur->length > NETFRONT_ACCEL_TSO_BUF_LENGTH);
++
++      st->ifc.addr += chunk;
++}
++
++
++/*
++ * Segment a TSO skb in software and queue the resulting packets on the
++ * VI.
++ *
++ * The payload is copied into NIC mapped TX buffers, each output packet
++ * starting with a patched copy of the original headers.  Nothing is
++ * handed to the hardware until every output packet has been built and
++ * enough TX space has been confirmed; if a buffer cannot be obtained or
++ * the space check fails, all buffers taken so far are returned.
++ *
++ * Returns NETFRONT_ACCEL_STATUS_GOOD once all packets are queued, or
++ * NETFRONT_ACCEL_STATUS_BUSY if the send had to be abandoned.
++ */
++int netfront_accel_enqueue_skb_tso(netfront_accel_vnic *vnic,
++                                 struct sk_buff *skb)
++{
++      struct netfront_accel_tso_state state;
++      struct netfront_accel_tso_buffer *tso_buf = NULL;
++      struct netfront_accel_tso_output_packet *reversed_list = NULL;
++      struct netfront_accel_tso_output_packet *tmp_pkt;
++      ef_iovec iovecs[ACCEL_TSO_MAX_BUFFERS];
++      int frag_i, rc, dma_id;
++      skb_frag_t *f;
++
++      tso_check_safe(skb);
++
++      if (skb->ip_summed != CHECKSUM_PARTIAL)
++              EPRINTK("Trying to TSO send a packet without HW checksum\n");
++
++      tso_start(&state, skb);
++
++      /*
++       * Setup the first payload fragment.  If the skb header area
++       * contains exactly the headers and all payload is in the frag
++       * list things are a little simpler
++       */
++      if (skb_headlen(skb) == state.p.header_length) {
++              /* Grab the first payload fragment. */
++              BUG_ON(skb_shinfo(skb)->nr_frags < 1);
++              frag_i = 0;
++              f = &skb_shinfo(skb)->frags[frag_i];
++              tso_get_fragment(&state, f->size, 
++                               page_address(f->page) + f->page_offset);
++      } else {
++              /*
++               * Payload starts in the linear area, straight after the
++               * headers.  frag_i = -1 makes the ++frag_i in the loop
++               * below move on to frags[0].
++               */
++              int hl = state.p.header_length;
++              tso_get_fragment(&state,  skb_headlen(skb) - hl, 
++                               skb->data + hl);
++              frag_i = -1;
++      }
++
++      if (tso_start_new_packet(vnic, skb, &state) < 0) {
++              DPRINTK("%s: out of first start-packet memory\n",
++                      __FUNCTION__);
++              goto unwind;
++      }
++
++      /* Copy loop: runs until the last input fragment is consumed */
++      while (1) {
++              tso_fill_packet_with_fragment(vnic, &state);
++              
++              /* Move onto the next fragment? */
++              if (state.ifc.len == 0) {
++                      if (++frag_i >= skb_shinfo(skb)->nr_frags)
++                              /* End of payload reached. */
++                              break;
++                      f = &skb_shinfo(skb)->frags[frag_i];
++                      tso_get_fragment(&state, f->size,
++                                       page_address(f->page) +
++                                       f->page_offset);
++              }
++
++              /* Start a new buffer? */
++              if ((state.output_packets->tso_bufs->length == 
++                   NETFRONT_ACCEL_TSO_BUF_LENGTH) &&
++                  tso_start_new_buffer(vnic, &state, 0)) {
++                      DPRINTK("%s: out of start-buffer memory\n",
++                              __FUNCTION__);
++                      goto unwind;
++              }
++
++              /*
++               * Start a new packet?
++               *
++               * NOTE(review): a full buffer has just been replaced by
++               * an empty one above, so the tso_bufs_len/length half of
++               * this test looks like it can never be true here and
++               * only packet_space == 0 starts a packet -- confirm.
++               */
++              if ((state.packet_space == 0 || 
++                   ((state.output_packets->tso_bufs_len >=
++                     ACCEL_TSO_MAX_BUFFERS) &&
++                    (state.output_packets->tso_bufs->length >= 
++                     NETFRONT_ACCEL_TSO_BUF_LENGTH))) &&
++                  tso_start_new_packet(vnic, skb, &state) < 0) {
++                      DPRINTK("%s: out of start-packet memory\n",
++                              __FUNCTION__);
++                      goto unwind;
++              }
++
++      }
++
++      /* Check for space */
++      if (ef_vi_transmit_space(&vnic->vi) < state.buffers) {
++              DPRINTK("%s: Not enough TX space (%d)\n",
++                      __FUNCTION__, state.buffers);
++              goto unwind;
++      }
++
++      /*
++       * Store the skb context in the most recent buffer (i.e. the
++       * last buffer that will be sent)
++       */
++      state.output_packets->tso_bufs->buf->skb = skb;
++
++      /* Reverse the list of packets as we construct it on a stack */
++      while (state.output_packets != NULL) {
++              tmp_pkt = state.output_packets;
++              state.output_packets = tmp_pkt->next;
++              tmp_pkt->next = reversed_list;
++              reversed_list = tmp_pkt;
++      }
++
++      /* Pass off to hardware */
++      while (reversed_list != NULL) {
++              tmp_pkt = reversed_list;
++              reversed_list = tmp_pkt->next;
++
++              BUG_ON(tmp_pkt->tso_bufs_len > ACCEL_TSO_MAX_BUFFERS);
++              BUG_ON(tmp_pkt->tso_bufs_len == 0);
++
++              /* The id passed with the packet is that of its newest buffer */
++              dma_id = tmp_pkt->tso_bufs->buf->buf_id;
++
++              /*
++               * Make an iovec of the buffers in the list, reversing
++               * the buffers as we go as they are constructed on a
++               * stack
++               */
++              tso_buf = tmp_pkt->tso_bufs;
++              for (frag_i = tmp_pkt->tso_bufs_len - 1;
++                   frag_i >= 0;
++                   frag_i--) {
++                      iovecs[frag_i].iov_base = tso_buf->buf->pkt_buff_addr;
++                      iovecs[frag_i].iov_len = tso_buf->length;
++                      tso_buf = tso_buf->next;
++              }
++
++              rc = ef_vi_transmitv(&vnic->vi, iovecs, tmp_pkt->tso_bufs_len,
++                                   dma_id);
++              /*
++               * We checked for space already, so it really should
++               * succeed
++               */
++              BUG_ON(rc != 0);
++      }
++
++      /* Update the TX fastpath byte and packet counters */
++      vnic->netdev_stats.fastpath_tx_bytes += skb->len;
++      vnic->netdev_stats.fastpath_tx_pkts += state.packets;
++#if NETFRONT_ACCEL_STATS
++      {
++              /* Record the high-water mark of packets sent minus completions */
++              unsigned n;
++              n = vnic->netdev_stats.fastpath_tx_pkts -
++                      vnic->stats.fastpath_tx_completions;
++              if (n > vnic->stats.fastpath_tx_pending_max)
++                      vnic->stats.fastpath_tx_pending_max = n;
++      }
++#endif
++
++      return NETFRONT_ACCEL_STATUS_GOOD;
++ 
++ unwind:
++      /* Return every buffer taken so far and report the send as busy */
++      tso_unwind(vnic, &state);
++
++      NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
++
++      return NETFRONT_ACCEL_STATUS_BUSY;
++}
++
++
++
diff --cc drivers/xen/sfc_netfront/accel_tso.h

index 0000000,0000000..b9c3ca8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_tso.h
@@@ -1,0 -1,0 +1,57 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#ifndef NETFRONT_ACCEL_TSO_H
++#define NETFRONT_ACCEL_TSO_H
++
++#include "accel_bufs.h"
++
++/* Per-buffer tracking record for the buffers of an output packet */
++struct netfront_accel_tso_buffer {
++      /* Next buffer in the same list */
++      struct netfront_accel_tso_buffer *next;
++      /* Descriptor of the packet buffer this record belongs to */
++      struct netfront_accel_pkt_desc *buf;
++      /* Bytes of data currently held in the buffer */
++      unsigned int length;
++};
++
++/* One record per output packet produced from an input packet */
++struct netfront_accel_tso_output_packet {
++      /* Next output packet in the list */
++      struct netfront_accel_tso_output_packet *next;
++      /* Buffers making up this packet */
++      struct netfront_accel_tso_buffer *tso_bufs;
++      /* Number of entries on tso_bufs */
++      unsigned int tso_bufs_len;
++};
++
++
++/*
++ * Max available space in a buffer for data once meta-data has taken
++ * its place: the per-buffer share of a page, less one
++ * struct netfront_accel_tso_buffer and one
++ * struct netfront_accel_tso_output_packet.
++ */
++#define NETFRONT_ACCEL_TSO_BUF_LENGTH                                 \
++      ((PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE)                     \
++       - sizeof(struct netfront_accel_tso_buffer)                     \
++       - sizeof(struct netfront_accel_tso_output_packet))
++
++int netfront_accel_enqueue_skb_tso(netfront_accel_vnic *vnic,
++                                 struct sk_buff *skb);
++
++#endif /* NETFRONT_ACCEL_TSO_H */
diff --cc drivers/xen/sfc_netfront/accel_vi.c

index 0000000,0000000..b1b6d70

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_vi.c
@@@ -1,0 -1,0 +1,1202 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <linux/if_ether.h>
++#include <linux/ip.h>
++#include <net/checksum.h>
++#include <asm/io.h>
++
++#include "accel.h"
++#include "accel_util.h"
++#include "accel_bufs.h"
++#include "accel_tso.h"
++#include "accel_ssr.h"
++#include "netfront.h"
++
++#include "etherfabric/ef_vi.h"
++
++/*
++ * Max available space in a buffer for data once meta-data has taken
++ * its place: the per-buffer share of a page, less one
++ * struct netfront_accel_tso_buffer.
++ */
++#define NETFRONT_ACCEL_TX_BUF_LENGTH                                  \
++      ((PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE)                     \
++       - sizeof(struct netfront_accel_tso_buffer))
++
++#define ACCEL_TX_MAX_BUFFERS (6)
++#define ACCEL_VI_POLL_EVENTS (8)
++
++/*
++ * Combined set-up and tear-down of the VI.
++ *
++ * With a non-NULL hw_msg this creates the fastpath table, maps the
++ * event queue, doorbell page and TX/RX DMA queues described by the
++ * message, allocates the VI state and initialises the ef_vi.  The
++ * labels at the bottom release those resources in reverse order, and
++ * an init failure enters the ladder at the point matching what has
++ * been set up so far.
++ *
++ * With hw_msg == NULL the function jumps straight to the top of that
++ * ladder and releases everything.
++ *
++ * Returns 0 when initialisation succeeds.  Every other path, including
++ * a tear-down, ends at the shared "return -EIO".
++ */
++static
++int netfront_accel_vi_init_fini(netfront_accel_vnic *vnic, 
++                              struct net_accel_msg_hw *hw_msg)
++{
++      struct ef_vi_nic_type nic_type;
++      struct net_accel_hw_falcon_b *hw_info;
++      void *io_kva, *evq_base, *rx_dma_kva, *tx_dma_kva, *doorbell_kva;
++      u32 *evq_gnts;
++      u32 evq_order;
++      int vi_state_size;
++      u8 vi_data[VI_MAPPINGS_SIZE];
++
++      /* Tear-down request: skip set-up and run the whole release ladder */
++      if (hw_msg == NULL)
++              goto fini;
++
++      /* And create the local macs table lock */
++      spin_lock_init(&vnic->table_lock);
++      
++      /* Create fastpath table, initial size 8, key length 8 */
++      if (cuckoo_hash_init(&vnic->fastpath_table, 3, 8)) {
++              EPRINTK("failed to allocate fastpath table\n");
++              goto fail_cuckoo;
++      }
++
++      /* Remembered so that the release ladder knows what to unmap */
++      vnic->hw.falcon.type = hw_msg->type;
++
++      switch (hw_msg->type) {
++      case NET_ACCEL_MSG_HWTYPE_FALCON_A:
++              hw_info = &hw_msg->resources.falcon_a.common;
++              /* Need the extra rptr register page on A1 */
++              io_kva = net_accel_map_iomem_page
++                      (vnic->dev, hw_msg->resources.falcon_a.evq_rptr_gnt,
++                       &vnic->hw.falcon.evq_rptr_mapping);
++              if (io_kva == NULL) {
++                      EPRINTK("%s: evq_rptr permission failed\n", __FUNCTION__);
++                      goto evq_rptr_fail;
++              }
++
++              /* Keep only the offset of evq_rptr within the mapped page */
++              vnic->hw.falcon.evq_rptr = io_kva + 
++                      (hw_info->evq_rptr & (PAGE_SIZE - 1));
++              break;
++      case NET_ACCEL_MSG_HWTYPE_FALCON_B:
++      case NET_ACCEL_MSG_HWTYPE_SIENA_A:
++              hw_info = &hw_msg->resources.falcon_b;
++              break;
++      default:
++              goto bad_type;
++      }
++
++      /**** Event Queue ****/
++
++      /* Map the event queue pages */
++      evq_gnts = hw_info->evq_mem_gnts;
++      evq_order = hw_info->evq_order;
++
++      EPRINTK_ON(hw_info->evq_offs != 0);
++
++      DPRINTK("Will map evq %d pages\n", 1 << evq_order);
++
++      evq_base =
++              net_accel_map_grants_contig(vnic->dev, evq_gnts, 1 << evq_order,
++                                          &vnic->evq_mapping);
++      if (evq_base == NULL) {
++              EPRINTK("%s: evq_base failed\n", __FUNCTION__);
++              goto evq_fail;
++      }
++
++      /**** Doorbells ****/
++      /* Set up the doorbell mappings. */
++      doorbell_kva = 
++              net_accel_map_iomem_page(vnic->dev, hw_info->doorbell_gnt,
++                                       &vnic->hw.falcon.doorbell_mapping);
++      if (doorbell_kva == NULL) {
++              EPRINTK("%s: doorbell permission failed\n", __FUNCTION__);
++              goto doorbell_fail;
++      }
++      vnic->hw.falcon.doorbell = doorbell_kva;
++
++      /* On Falcon_B and Siena we get the rptr from the doorbell page */
++      if (hw_msg->type == NET_ACCEL_MSG_HWTYPE_FALCON_B ||
++          hw_msg->type == NET_ACCEL_MSG_HWTYPE_SIENA_A) {
++              vnic->hw.falcon.evq_rptr = 
++                      (u32 *)((char *)vnic->hw.falcon.doorbell 
++                              + hw_info->evq_rptr);
++      }
++
++      /**** DMA Queue ****/
++
++      /* Set up the DMA Queues from the message. */
++      tx_dma_kva = net_accel_map_grants_contig
++              (vnic->dev, &(hw_info->txdmaq_gnt), 1, 
++               &vnic->hw.falcon.txdmaq_mapping);
++      if (tx_dma_kva == NULL) {
++              EPRINTK("%s: TX dma failed\n", __FUNCTION__);
++              goto tx_dma_fail;
++      }
++
++      rx_dma_kva = net_accel_map_grants_contig
++              (vnic->dev, &(hw_info->rxdmaq_gnt), 1, 
++               &vnic->hw.falcon.rxdmaq_mapping);
++      if (rx_dma_kva == NULL) {
++              EPRINTK("%s: RX dma failed\n", __FUNCTION__);
++              goto rx_dma_fail;
++      }
++
++      /* Full confession */
++      DPRINTK("Mapped H/W"
++              "  Tx DMAQ grant %x -> %p\n"
++              "  Rx DMAQ grant %x -> %p\n"
++              "  EVQ grant %x -> %p\n",
++              hw_info->txdmaq_gnt, tx_dma_kva,
++              hw_info->rxdmaq_gnt, rx_dma_kva,
++              evq_gnts[0], evq_base
++              );
++
++      memset(vi_data, 0, sizeof(vi_data));
++      
++      /* TODO BUG11305: convert efhw_arch to ef_vi_arch
++       * e.g.
++       * arch = ef_vi_arch_from_efhw_arch(hw_info->nic_arch);
++       * assert(arch >= 0);
++       * nic_type.arch = arch;
++       */
++      nic_type.arch = (unsigned char)hw_info->nic_arch;
++      nic_type.variant = (char)hw_info->nic_variant;
++      nic_type.revision = (unsigned char)hw_info->nic_revision;
++      
++      /* The event queue size in bytes is the mapped page count times PAGE_SIZE */
++      ef_vi_init_mapping_evq(vi_data, nic_type, hw_info->instance, 
++                             1 << (evq_order + PAGE_SHIFT), evq_base, 
++                             (void *)0xdeadbeef);
++
++      ef_vi_init_mapping_vi(vi_data, nic_type, hw_info->rx_capacity, 
++                            hw_info->tx_capacity, hw_info->instance, 
++                            doorbell_kva, rx_dma_kva, tx_dma_kva, 0);
++
++      vi_state_size = ef_vi_calc_state_bytes(hw_info->rx_capacity,
++                                             hw_info->tx_capacity);
++      vnic->vi_state = (ef_vi_state *)kmalloc(vi_state_size, GFP_KERNEL);
++      if (vnic->vi_state == NULL) {
++              EPRINTK("%s: kmalloc for VI state failed\n", __FUNCTION__);
++              goto vi_state_fail;
++      }
++      ef_vi_init(&vnic->vi, vi_data, vnic->vi_state, &vnic->evq_state, 0);
++
++      ef_eventq_state_init(&vnic->vi);
++
++      ef_vi_state_init(&vnic->vi);
++
++      return 0;
++
++      /*
++       * Release ladder.  Each label undoes one set-up step and falls
++       * through to the labels below it.
++       */
++fini:
++      kfree(vnic->vi_state);
++      vnic->vi_state = NULL;
++vi_state_fail:
++      net_accel_unmap_grants_contig(vnic->dev, vnic->hw.falcon.rxdmaq_mapping);
++rx_dma_fail:
++      net_accel_unmap_grants_contig(vnic->dev, vnic->hw.falcon.txdmaq_mapping);
++tx_dma_fail:
++      net_accel_unmap_iomem_page(vnic->dev, vnic->hw.falcon.doorbell_mapping);
++      vnic->hw.falcon.doorbell = NULL;
++doorbell_fail:
++      net_accel_unmap_grants_contig(vnic->dev, vnic->evq_mapping);
++evq_fail:
++      /* Only Falcon A mapped a separate evq_rptr page above */
++      if (vnic->hw.falcon.type == NET_ACCEL_MSG_HWTYPE_FALCON_A)
++              net_accel_unmap_iomem_page(vnic->dev, 
++                                         vnic->hw.falcon.evq_rptr_mapping);
++      vnic->hw.falcon.evq_rptr = NULL;
++evq_rptr_fail:
++bad_type:
++      cuckoo_hash_destroy(&vnic->fastpath_table);
++fail_cuckoo:
++      return -EIO;
++}
++
++
++/*
++ * Constructor: does no set-up of its own.  It only clears vi_state,
++ * which netfront_accel_vi_dtor() tests to decide whether there is
++ * anything to tear down.
++ */
++void netfront_accel_vi_ctor(netfront_accel_vnic *vnic)
++{
++      /* Just mark the VI as uninitialised. */
++      vnic->vi_state = NULL;
++}
++
++
++/*
++ * Initialise the VI from the hardware message.  hw_msg must not be
++ * NULL, since a NULL message selects the tear-down path of
++ * netfront_accel_vi_init_fini().  Returns that function's result.
++ */
++int netfront_accel_vi_init(netfront_accel_vnic *vnic, struct net_accel_msg_hw *hw_msg)
++{
++      int rc;
++
++      BUG_ON(hw_msg == NULL);
++
++      rc = netfront_accel_vi_init_fini(vnic, hw_msg);
++      return rc;
++}
++
++
++/* Tear the VI down, but only if it was initialised (vi_state is set). */
++void netfront_accel_vi_dtor(netfront_accel_vnic *vnic)
++{
++      if (vnic->vi_state == NULL)
++              return;
++
++      netfront_accel_vi_init_fini(vnic, NULL);
++}
++
++
++/*
++ * Queue one RX buffer on the VI and update the batching counters.  The
++ * queued descriptors are pushed to the card once a full batch has
++ * built up, or straight away while rx_dma_level is below one batch.
++ */
++static
++void netfront_accel_vi_post_rx(netfront_accel_vnic *vnic, u16 id,
++                             netfront_accel_pkt_desc *buf)
++{
++      int idx = vnic->rx_dma_batched;
++
++#if 0
++      VPRINTK("Posting buffer %d (0x%08x) for rx at index %d, space is %d\n",
++              id, buf->pkt_buff_addr, idx, ef_vi_receive_space(&vnic->vi));
++#endif
++      /* Set up a virtual buffer descriptor */
++      ef_vi_receive_init(&vnic->vi, buf->pkt_buff_addr, id,
++                         /*rx_bytes=max*/0);
++
++      idx++;
++      vnic->rx_dma_level++;
++
++      /*
++       * Hold the descriptor back unless a whole batch is ready or
++       * the queue level is low.  Descriptors held back just wait for
++       * a later call; there will be plenty available.
++       */
++      if (idx < NETFRONT_ACCEL_RX_DESC_BATCH &&
++          vnic->rx_dma_level >= NETFRONT_ACCEL_RX_DESC_BATCH) {
++              vnic->rx_dma_batched = idx;
++              return;
++      }
++
++#if 0
++      VPRINTK("Flushing %d rx descriptors.\n", idx);
++#endif
++
++      /* Push buffer to hardware */
++      ef_vi_receive_push(&vnic->vi);
++
++      vnic->rx_dma_batched = 0;
++}
++
++
++/*
++ * Repost a completed RX buffer if the VI has room for it, otherwise
++ * return it to the RX buffer pool.  After a successful repost, keep
++ * posting fresh buffers from the pool for as long as there is room and
++ * the pool can supply them.
++ */
++inline
++void netfront_accel_vi_post_rx_or_free(netfront_accel_vnic *vnic, u16 id,
++                                     netfront_accel_pkt_desc *buf)
++{
++      netfront_accel_pkt_desc *extra;
++
++      VPRINTK("%s: %d\n", __FUNCTION__, id);
++
++      if (ef_vi_receive_space(&vnic->vi) <= vnic->rx_dma_batched) {
++              VPRINTK("RX space is full\n");
++              netfront_accel_buf_put(vnic->rx_bufs, id);
++              return;
++      }
++
++      VPRINTK("Completed buffer %d is reposted\n", id);
++      netfront_accel_vi_post_rx(vnic, id, buf);
++
++      /* Top the queue up with further buffers while we're here */
++      for (;;) {
++              if (ef_vi_receive_space(&vnic->vi) <= vnic->rx_dma_batched)
++                      break;
++
++              /* Try to allocate a buffer. */
++              extra = netfront_accel_buf_get(vnic->rx_bufs);
++              if (extra == NULL)
++                      break;
++
++              /* Add it to the rx dma queue. */
++              netfront_accel_vi_post_rx(vnic, extra->buf_id, extra);
++      }
++}
++
++
++/*
++ * When is_rx is set, post buffers from the RX pool to the VI until
++ * there is no more room or the pool has none left.  Does nothing
++ * beyond the trace output when is_rx is zero.
++ */
++void netfront_accel_vi_add_bufs(netfront_accel_vnic *vnic, int is_rx)
++{
++      netfront_accel_pkt_desc *buf;
++
++      if (is_rx) {
++              while (ef_vi_receive_space(&vnic->vi) >
++                     vnic->rx_dma_batched) {
++                      VPRINTK("%s: %d\n", __FUNCTION__, vnic->rx_dma_level);
++
++                      /* Try to allocate a buffer. */
++                      buf = netfront_accel_buf_get(vnic->rx_bufs);
++                      if (buf == NULL)
++                              break;
++
++                      /* Add it to the rx dma queue. */
++                      netfront_accel_vi_post_rx(vnic, buf->buf_id, buf);
++              }
++      }
++
++      VPRINTK("%s: done\n", __FUNCTION__);
++}
++
++
++/*
++ * Working state used while copying one skb into a chain of TX buffers
++ * (see multi_post_start() and the multi_post_*() helpers).
++ */
++struct netfront_accel_multi_state {
++      /* Bytes of the skb not yet copied; starts at skb->len. */
++      unsigned remaining_len;
++
++      /* Number of entries currently on the output_buffers list. */
++      unsigned buffers;
++
++      /*
++       * Head of the list of TX buffers claimed so far.  New buffers
++       * are pushed on the head, so the list is in reverse order.
++       */
++      struct netfront_accel_tso_buffer *output_buffers;
++
++      /* Where we are in the current fragment of the SKB. */
++      struct {
++              /* address of current position */
++              void *addr;
++              /* remaining length */    
++              unsigned int len;
++      } ifc; /*  == Input Fragment Cursor */
++};
++
++
++/*
++ * Reset the multi-buffer state for a new skb: no output buffers yet,
++ * the whole packet still to copy, and the input cursor placed at the
++ * start of the skb's linear data.
++ */
++static inline void multi_post_start(struct netfront_accel_multi_state *st, 
++                                  struct sk_buff *skb)
++{
++      /* Output side: empty list. */
++      st->buffers = 0;
++      st->output_buffers = NULL;
++
++      /* Input side: everything remains, cursor on the linear area. */
++      st->remaining_len = skb->len;
++      st->ifc.addr = skb->data;
++      st->ifc.len = skb_headlen(skb);
++}
++
++/*
++ * Claim one TX packet buffer, initialise the bookkeeping record kept
++ * just past its payload area, and push it on the head of the state's
++ * output list.  Returns 0 on success, -1 if no TX buffer is available.
++ */
++static int multi_post_start_new_buffer(netfront_accel_vnic *vnic, 
++                                     struct netfront_accel_multi_state *st)
++{
++      struct netfront_accel_pkt_desc *pkt;
++      struct netfront_accel_tso_buffer *meta;
++
++      pkt = netfront_accel_buf_get(vnic->tx_bufs);
++      if (!pkt) {
++              DPRINTK("%s: No buffer for TX\n", __FUNCTION__);
++              return -1;
++      }
++
++      /* The record lives at the end of the mapped packet buffer. */
++      meta = (struct netfront_accel_tso_buffer *)
++              (pkt->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH);
++      meta->buf = pkt;
++      meta->length = 0;
++
++      /* Push on the head of the list and count it. */
++      meta->next = st->output_buffers;
++      st->output_buffers = meta;
++      st->buffers += 1;
++
++      BUG_ON(st->buffers >= ACCEL_TX_MAX_BUFFERS);
++
++      /*
++       * No skb attached yet; only the buffer that ends up last in
++       * the packet gets a non-NULL skb, and that happens later.
++       */
++      pkt->skb = NULL;
++
++      return 0;
++}
++
++
++/*
++ * Copy as much of the current input fragment as fits into the buffer
++ * at the head of the output list, then advance the input cursor and
++ * the byte counters.  Does nothing if the fragment is exhausted or
++ * the buffer is already full.
++ */
++static void
++multi_post_fill_buffer_with_fragment(netfront_accel_vnic *vnic,
++                                   struct netfront_accel_multi_state *st)
++{
++      struct netfront_accel_tso_buffer *out;
++      unsigned room, chunk;
++
++      BUG_ON(st->output_buffers == NULL);
++      out = st->output_buffers;
++
++      if (st->ifc.len == 0 ||
++          out->length == NETFRONT_ACCEL_TX_BUF_LENGTH)
++              return;
++
++      BUG_ON(out->length > NETFRONT_ACCEL_TX_BUF_LENGTH);
++
++      /* Copy the smaller of "input left" and "output room". */
++      room = NETFRONT_ACCEL_TX_BUF_LENGTH - out->length;
++      chunk = (st->ifc.len < room) ? st->ifc.len : room;
++
++      memcpy(out->buf->pkt_kva + out->length, st->ifc.addr, chunk);
++
++      /* Account for the bytes just moved, on both sides. */
++      out->length += chunk;
++      st->ifc.addr += chunk;
++      st->ifc.len -= chunk;
++      st->remaining_len -= chunk;
++
++      BUG_ON(out->length > NETFRONT_ACCEL_TX_BUF_LENGTH);
++}
++
++
++/*
++ * Abandon a partially built multi-buffer packet: pop every buffer off
++ * the output list and return it to the TX buffer pool.
++ */
++static inline void multi_post_unwind(netfront_accel_vnic *vnic,
++                                   struct netfront_accel_multi_state *st)
++{
++      struct netfront_accel_tso_buffer *victim;
++
++      DPRINTK("%s\n", __FUNCTION__);
++
++      for (victim = st->output_buffers; victim != NULL;
++           victim = st->output_buffers) {
++              /* Unlink before releasing the buffer. */
++              st->output_buffers = victim->next;
++              st->buffers--;
++              netfront_accel_buf_put(vnic->tx_bufs, victim->buf->buf_id);
++      }
++
++      /* The count must agree with the now-empty list. */
++      BUG_ON(st->buffers != 0);
++}
++
++
++/*
++ * Send an skb by copying its linear area and page fragments into a
++ * chain of TX buffers and posting the chain with one
++ * ef_vi_transmitv() call.
++ *
++ * Returns NETFRONT_ACCEL_STATUS_GOOD once the packet is posted, or
++ * NETFRONT_ACCEL_STATUS_BUSY if TX buffers or TX ring space ran out;
++ * in the BUSY case every buffer claimed here is released again.
++ */
++static enum netfront_accel_post_status
++netfront_accel_enqueue_skb_multi(netfront_accel_vnic *vnic, struct sk_buff *skb)
++{
++      struct netfront_accel_tso_buffer *tso_buf;
++      struct netfront_accel_multi_state state;
++      ef_iovec iovecs[ACCEL_TX_MAX_BUFFERS];
++      skb_frag_t *f;
++      int frag_i, rc, dma_id;
++
++      multi_post_start(&state, skb);
++
++      /* -1: still in the linear area; page fragments count from 0 */
++      frag_i = -1;
++
++      if (skb->ip_summed == CHECKSUM_PARTIAL) {
++              /* Set to zero to encourage falcon to work it out for us */
++              *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
++      }
++
++      if (multi_post_start_new_buffer(vnic, &state)) {
++              DPRINTK("%s: out of buffers\n", __FUNCTION__);
++              goto unwind;
++      }
++
++      /*
++       * Alternate between filling the current output buffer and
++       * advancing the input cursor until all fragments are consumed.
++       */
++      while (1) {
++              multi_post_fill_buffer_with_fragment(vnic, &state);
++
++              /* Move onto the next fragment? */
++              if (state.ifc.len == 0) {
++                      if (++frag_i >= skb_shinfo(skb)->nr_frags)
++                              /* End of payload reached. */
++                              break;
++                      f = &skb_shinfo(skb)->frags[frag_i];
++                      state.ifc.len = f->size;
++                      /*
++                       * NOTE(review): presumably relies on the fragment
++                       * page having a kernel mapping - confirm.
++                       */
++                      state.ifc.addr = page_address(f->page) + f->page_offset;
++              }
++
++              /* Start a new buffer? */
++              if ((state.output_buffers->length == 
++                   NETFRONT_ACCEL_TX_BUF_LENGTH) &&
++                  multi_post_start_new_buffer(vnic, &state)) {
++                      DPRINTK("%s: out of buffers\n", __FUNCTION__);
++                      goto unwind;
++              }
++      }
++
++      /* Check for space */
++      if (ef_vi_transmit_space(&vnic->vi) < state.buffers) {
++              DPRINTK("%s: Not enough TX space (%d)\n", __FUNCTION__, state.buffers);
++              goto unwind;
++      }
++
++      /* Store the skb in what will be the last buffer's context */
++      state.output_buffers->buf->skb = skb;
++      /* Remember dma_id of what will be the last buffer */ 
++      dma_id = state.output_buffers->buf->buf_id;
++
++      /*
++       * Make an iovec of the buffers in the list, reversing the
++       * buffers as we go as they are constructed on a stack
++       */
++      tso_buf = state.output_buffers;
++      for (frag_i = state.buffers-1; frag_i >= 0; frag_i--) {
++              iovecs[frag_i].iov_base = tso_buf->buf->pkt_buff_addr;
++              iovecs[frag_i].iov_len = tso_buf->length;
++              tso_buf = tso_buf->next;
++      }
++      
++      rc = ef_vi_transmitv(&vnic->vi, iovecs, state.buffers, dma_id);
++      /* We checked for space already, so it really should succeed */
++      BUG_ON(rc != 0);
++
++      /* Track number of tx fastpath stats */
++      vnic->netdev_stats.fastpath_tx_bytes += skb->len;
++      vnic->netdev_stats.fastpath_tx_pkts ++;
++#if NETFRONT_ACCEL_STATS
++      {
++              /* packets posted but not yet completed; track the peak */
++              u32 n;
++              n = vnic->netdev_stats.fastpath_tx_pkts -
++                      (u32)vnic->stats.fastpath_tx_completions;
++              if (n > vnic->stats.fastpath_tx_pending_max)
++                      vnic->stats.fastpath_tx_pending_max = n;
++      }
++#endif
++      return NETFRONT_ACCEL_STATUS_GOOD;
++
++unwind:
++      /* Give back every buffer claimed for this packet. */
++      multi_post_unwind(vnic, &state);
++
++      NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
++
++      return NETFRONT_ACCEL_STATUS_BUSY;
++}
++
++
++/*
++ * Send an skb that fits in one TX buffer: copy every fragment into a
++ * single packet buffer and post it with ef_vi_transmit().
++ *
++ * Returns NETFRONT_ACCEL_STATUS_BUSY if there is no TX ring space or
++ * no free TX buffer, NETFRONT_ACCEL_STATUS_GOOD otherwise.
++ */
++static enum netfront_accel_post_status 
++netfront_accel_enqueue_skb_single(netfront_accel_vnic *vnic, struct sk_buff *skb)
++{
++      struct netfront_accel_pkt_desc *pkt;
++      struct netfront_accel_tso_buffer *meta;
++      u8 *dst;
++      int err;
++
++      if (ef_vi_transmit_space(&vnic->vi) < 1) {
++              DPRINTK("%s: No TX space\n", __FUNCTION__);
++              NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
++              return NETFRONT_ACCEL_STATUS_BUSY;
++      }
++
++      pkt = netfront_accel_buf_get(vnic->tx_bufs);
++      if (!pkt) {
++              DPRINTK("%s: No buffer for TX\n", __FUNCTION__);
++              NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
++              return NETFRONT_ACCEL_STATUS_BUSY;
++      }
++
++      /* Fastpath TX counters */
++      vnic->netdev_stats.fastpath_tx_bytes += skb->len;
++      vnic->netdev_stats.fastpath_tx_pkts += 1;
++
++#if NETFRONT_ACCEL_STATS
++      {
++              /* posted minus completed; remember the high-water mark */
++              u32 pending = vnic->netdev_stats.fastpath_tx_pkts -
++                      (u32)vnic->stats.fastpath_tx_completions;
++
++              if (vnic->stats.fastpath_tx_pending_max < pending)
++                      vnic->stats.fastpath_tx_pending_max = pending;
++      }
++#endif
++
++      /* The completion path uses this to find the skb to free. */
++      pkt->skb = skb;
++
++      if (skb->ip_summed == CHECKSUM_PARTIAL) {
++              /* Zero the checksum field to encourage falcon to fill it */
++              *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
++      }
++
++      /* Flatten all fragments into the packet buffer. */
++      dst = pkt->pkt_kva;
++      NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
++              (skb, idx, frag_data, frag_len, {
++                      VPRINTK("*** Copying %d bytes to %p\n", frag_len, dst);
++                      memcpy(dst, frag_data, frag_len);
++                      dst += frag_len;
++              });
++
++      VPRINTK("%s: id %d pkt %p kva %p buff_addr 0x%08x\n", __FUNCTION__,
++              pkt->buf_id, pkt, pkt->pkt_kva, pkt->pkt_buff_addr);
++
++      /* Single-entry bookkeeping record at the end of the buffer */
++      meta = (struct netfront_accel_tso_buffer *)
++              (pkt->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH);
++      meta->buf = pkt;
++      meta->length = skb->len;
++      meta->next = NULL;
++
++      err = ef_vi_transmit(&vnic->vi, pkt->pkt_buff_addr, skb->len,
++                           pkt->buf_id);
++      /* Space was checked on entry, so this is not expected to fail */
++      BUG_ON(err != 0);
++
++      return NETFRONT_ACCEL_STATUS_GOOD;
++}
++
++
++/*
++ * Decide whether an skb can go out on the accelerated path and, if
++ * so, hand it to the TSO, single-buffer or multi-buffer sender.
++ *
++ * Every check that can yield NETFRONT_ACCEL_STATUS_CANT is made
++ * before anything that can yield NETFRONT_ACCEL_STATUS_BUSY: once a
++ * packet has been reported BUSY it is assumed that a retry will not
++ * report CANT.
++ */
++enum netfront_accel_post_status 
++netfront_accel_vi_tx_post(netfront_accel_vnic *vnic, struct sk_buff *skb)
++{
++      struct ethhdr *eth;
++      struct iphdr *ip;
++      int value, try_fastpath;
++      /* Assumes skb->data points at the destination MAC address. */
++      cuckoo_hash_mac_key key = cuckoo_mac_to_key(skb->data);
++
++      /*
++       * Only destinations present in the fastpath table are sent
++       * this way.  The lookup is done without the table lock and so
++       * may get the wrong answer; current opinion is that this is
++       * not a big problem.
++       */
++      try_fastpath = cuckoo_hash_lookup(&vnic->fastpath_table,
++                                        (cuckoo_hash_key *)(&key), &value);
++      if (!try_fastpath) {
++              VPRINTK("try fast path false for mac: %pM\n", skb->data);
++
++              return NETFRONT_ACCEL_STATUS_CANT;
++      }
++
++      /* The linear area must hold the Ethernet and IPv4 headers. */
++      if (skb_headlen(skb) < sizeof(struct ethhdr) + sizeof(struct iphdr)) {
++              EPRINTK("%s: Packet header is too small\n", __FUNCTION__);
++              return NETFRONT_ACCEL_STATUS_CANT;
++      }
++
++      eth = (void*)skb->data;
++      ip = (void*)(eth + 1);
++
++      if (be16_to_cpu(eth->h_proto) != ETH_P_IP) {
++              DPRINTK("%s: Packet is not IPV4 (ether_type=0x%04x)\n", __FUNCTION__,
++                      be16_to_cpu(eth->h_proto));
++              return NETFRONT_ACCEL_STATUS_CANT;
++      }
++
++      switch (ip->protocol) {
++      case IPPROTO_TCP:
++      case IPPROTO_UDP:
++              break;
++      default:
++              DPRINTK("%s: Packet is not TCP/UDP (ip_protocol=0x%02x)\n",
++                      __FUNCTION__, ip->protocol);
++              return NETFRONT_ACCEL_STATUS_CANT;
++      }
++
++      VPRINTK("%s: %d bytes, gso %d\n", __FUNCTION__, skb->len, 
++              skb_shinfo(skb)->gso_size);
++
++      /* Pick the sender: TSO first, then by whether one buffer suffices. */
++      if (skb_shinfo(skb)->gso_size)
++              return netfront_accel_enqueue_skb_tso(vnic, skb);
++
++      return (skb->len <= NETFRONT_ACCEL_TX_BUF_LENGTH) ?
++              netfront_accel_enqueue_skb_single(vnic, skb) :
++              netfront_accel_enqueue_skb_multi(vnic, skb);
++}
++
++
++/*
++ * Copy the data to required end destination. NB. len is the total new
++ * length of the socket buffer, not the amount of data to copy; the
++ * number of bytes actually appended is len - skb->len.
++ *
++ * Returns 0 on success, or -ENOSPC if the skb does not have enough
++ * tailroom for the extra bytes (nothing is copied in that case).
++ */
++inline
++int ef_vnic_copy_to_skb(netfront_accel_vnic *vnic, struct sk_buff *skb, 
++                      struct netfront_accel_pkt_desc *buf, int len)
++{
++      int i, extra = len - skb->len;
++      /*
++       * Accumulator for the cache-touching reads below.  It must be
++       * initialised: "c += ..." reads it, and reading an
++       * uninitialised automatic variable has an indeterminate result.
++       */
++      char c = 0;
++      int pkt_stride = vnic->rx_pkt_stride;
++      int skb_stride = vnic->rx_skb_stride;
++      char *skb_start;
++      
++      /*
++       * This pulls stuff into the cache - have seen performance
++       * benefit in this, but disabled by default
++       */
++      skb_start = skb->data;
++      if (pkt_stride) {
++              /* touch the source buffer every pkt_stride bytes */
++              for (i = 0; i < len; i += pkt_stride) {
++                      c += ((volatile char*)(buf->pkt_kva))[i];
++              }
++      }
++      if (skb_stride) {
++              /* touch the destination range every skb_stride bytes */
++              for (i = skb->len; i < len ; i += skb_stride) {
++                      c += ((volatile char*)(skb_start))[i];
++              }
++      }
++
++      if (skb_tailroom(skb) >= extra) {
++              memcpy(skb_put(skb, extra), buf->pkt_kva, extra);
++              return 0;
++      }
++
++      return -ENOSPC;
++}
++
++
++/*
++ * Drop any partially assembled multi-fragment RX packet: free the
++ * pending skb, if there is one, and mark reassembly as not in
++ * progress.
++ */
++static void discard_jumbo_state(netfront_accel_vnic *vnic) 
++{
++      struct sk_buff *pending = vnic->jumbo_state.skb;
++
++      if (pending != NULL) {
++              dev_kfree_skb_any(pending);
++              vnic->jumbo_state.skb = NULL;
++      }
++
++      vnic->jumbo_state.in_progress = 0;
++}
++
++
++/*
++ * Final processing for a fully received fastpath packet: add the
++ * sender's MAC to the fastpath table if it is not there yet, send a
++ * fastpath message if the destination MAC is not ours, then pass the
++ * skb to the SSR code or straight up the stack.
++ */
++static void  netfront_accel_vi_rx_complete(netfront_accel_vnic *vnic,
++                                         struct sk_buff *skb)
++{
++      /* The source MAC follows the destination MAC in the frame. */
++      unsigned char *src_mac = skb->data + ETH_ALEN;
++      cuckoo_hash_mac_key key = cuckoo_mac_to_key(src_mac);
++      unsigned long flags;
++      int value;
++
++      /*
++       * Add valid senders we do not know yet to the fastpath TX
++       * table.  The first lookup runs without the table lock and so
++       * may get the wrong answer; current opinion is that this is
++       * not a big problem.
++       */
++      if (is_valid_ether_addr(src_mac) &&
++          !cuckoo_hash_lookup(&vnic->fastpath_table, (cuckoo_hash_key *)&key,
++                              &value)) {
++              spin_lock_irqsave(&vnic->table_lock, flags);
++              cuckoo_hash_add_check(&vnic->fastpath_table,
++                                    (cuckoo_hash_key *)&key,
++                                    1, 1);
++              spin_unlock_irqrestore(&vnic->table_lock, flags);
++      }
++
++      /* compare_ether_addr() is non-zero when the addresses differ */
++      if (compare_ether_addr(skb->data, vnic->mac)) {
++              struct iphdr *ip = (struct iphdr *)(skb->data + ETH_HLEN);
++              /* transport header sits after ihl 32-bit words of IP header */
++              char *l4 = (char *)ip + 4 * ip->ihl;
++              u16 port;
++
++              DPRINTK("%s: saw wrong MAC address %pM\n",
++                      __FUNCTION__, skb->data);
++
++              if (ip->protocol != IPPROTO_TCP) {
++                      /* anything that is not TCP is treated as UDP */
++                      EPRINTK_ON(ip->protocol != IPPROTO_UDP);
++                      port = ((struct udphdr *)l4)->dest;
++              } else {
++                      port = ((struct tcphdr *)l4)->dest;
++              }
++
++              netfront_accel_msg_tx_fastpath(vnic, skb->data,
++                                             ip->daddr, port,
++                                             ip->protocol);
++      }
++
++      skb->protocol = eth_type_trans(skb, vnic->net_dev);
++      /* CHECKSUM_UNNECESSARY as hardware has done it already */
++      skb->ip_summed = CHECKSUM_UNNECESSARY;
++
++      /* Deliver directly unless the SSR code took the skb. */
++      if (!netfront_accel_ssr_skb(vnic, &vnic->ssr_state, skb))
++              netif_receive_skb(skb);
++}
++
++
++/*
++ * Handle one RX or RX_DISCARD event.
++ *
++ * For RX events the received bytes are appended to an skb; packets
++ * spanning several buffers are assembled across calls using
++ * vnic->jumbo_state.  The RX buffer is reposted or freed in every
++ * path that obtained a valid buffer id.
++ *
++ * Returns 1 when a complete packet was passed to
++ * netfront_accel_vi_rx_complete(), and 0 in all other cases
++ * (fragment stored for later, packet dropped, discard event or error).
++ */
++static int netfront_accel_vi_poll_process_rx(netfront_accel_vnic *vnic, 
++                                           ef_event *ev)
++{
++      struct netfront_accel_bufinfo *bufinfo = vnic->rx_bufs;
++      struct netfront_accel_pkt_desc *buf = NULL;
++      struct sk_buff *skb;
++      int id, len, sop = 0, cont = 0;
++
++      VPRINTK("Rx event.\n");
++      /*
++       * Complete the receive operation, and get the request id of
++       * the buffer
++       */
++      id = ef_vi_receive_done(&vnic->vi, ev);
++
++      if (id < 0 || id >= bufinfo->npages*NETFRONT_ACCEL_BUFS_PER_PAGE) {
++              EPRINTK("Rx packet %d is invalid\n", id);
++              /* Carry on round the loop if more events */
++              goto bad_packet;
++      }
++      /* Get our buffer descriptor */
++      buf = netfront_accel_buf_find(bufinfo, id);
++
++      /* For multi-buffer packets this is the running total so far */
++      len = EF_EVENT_RX_BYTES(*ev);
++
++      /* An RX buffer has been removed from the DMA ring. */
++      vnic->rx_dma_level--;
++
++      if (EF_EVENT_TYPE(*ev) == EF_EVENT_TYPE_RX) {
++              /* sop: first buffer of a packet; cont: more buffers follow */
++              sop = EF_EVENT_RX_SOP(*ev);
++              cont = EF_EVENT_RX_CONT(*ev);
++
++              skb = vnic->jumbo_state.skb;
++
++              VPRINTK("Rx packet %d: %d bytes so far; sop %d; cont %d\n", 
++                      id, len, sop, cont);
++
++              if (sop) {
++                      if (!vnic->jumbo_state.in_progress) {
++                              vnic->jumbo_state.in_progress = 1;
++                              BUG_ON(vnic->jumbo_state.skb != NULL);
++                      } else {
++                              /*
++                               * This fragment shows a missing tail in 
++                               * previous one, but is itself possibly OK
++                               */
++                              DPRINTK("sop and in_progress => no tail\n");
++
++                              /* Release the socket buffer we already had */
++                              discard_jumbo_state(vnic);
++
++                              /* Now start processing this fragment */
++                              vnic->jumbo_state.in_progress = 1;
++                              skb = NULL;
++                      }
++              } else if (!vnic->jumbo_state.in_progress) {
++                      DPRINTK("!sop and !in_progress => missing head\n");
++                      goto missing_head;
++              }
++
++              if (!cont) {
++                      /* Update state for next time */
++                      vnic->jumbo_state.in_progress = 0;
++                      vnic->jumbo_state.skb = NULL;
++              } else if (!vnic->jumbo_state.in_progress) {
++                      /*
++                       * NOTE(review): in_progress looks to be always set
++                       * by this point (set in the sop branch, checked in
++                       * the !sop branch), so this branch appears
++                       * unreachable - confirm.
++                       */
++                      DPRINTK("cont and !in_progress => missing head\n");
++                      goto missing_head;
++              }
++
++              if (skb == NULL) {
++                      BUG_ON(!sop);
++
++                      /*
++                       * Single-buffer packet: size the skb to the data.
++                       * Otherwise size it from the device MTU.
++                       */
++                      if (!cont)
++                              skb = alloc_skb(len+NET_IP_ALIGN, GFP_ATOMIC);
++                      else
++                              skb = alloc_skb(vnic->net_dev->mtu+NET_IP_ALIGN, 
++                                              GFP_ATOMIC);
++
++                      if (skb == NULL) {
++                              DPRINTK("%s: Couldn't get an rx skb.\n",
++                                      __FUNCTION__);
++                              netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
++                              /*
++                               * Dropping this fragment means we
++                               * should discard the rest too
++                               */
++                              discard_jumbo_state(vnic);
++
++                              /*
++                               * NOTE(review): unlike the bad_packet path,
++                               * this return does not increment
++                               * fastpath_rx_errors - confirm intended.
++                               */
++                              /* Carry on round the loop if more events */
++                              return 0;
++                      }
++
++              }
++              
++              /* Copy the data to required end destination */
++              if (ef_vnic_copy_to_skb(vnic, skb, buf, len) != 0) {
++                      /*
++                       * No space in the skb - suggests > MTU packet
++                       * received
++                       */
++                      EPRINTK("%s: Rx packet too large (%d)\n",
++                              __FUNCTION__, len);
++                      netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
++                      discard_jumbo_state(vnic);
++                      return 0;
++              }
++              
++              /* Put the buffer back in the DMA queue. */
++              netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
++
++              if (cont) {
++                      /* More buffers to come: keep the skb for next event */
++                      vnic->jumbo_state.skb = skb;
++
++                      return 0;
++              } else {
++                      /* Track number of rx fastpath packets */
++                      vnic->netdev_stats.fastpath_rx_pkts++;
++                      vnic->netdev_stats.fastpath_rx_bytes += len;
++
++                      netfront_accel_vi_rx_complete(vnic, skb);
++
++                      return 1;
++              }
++      } else {
++              BUG_ON(EF_EVENT_TYPE(*ev) != EF_EVENT_TYPE_RX_DISCARD);
++
++              /* Log the discard reason and bump the matching counter */
++              if (EF_EVENT_RX_DISCARD_TYPE(*ev) 
++                  == EF_EVENT_RX_DISCARD_TRUNC) {
++                      DPRINTK("%s: " EF_EVENT_FMT 
++                              " buffer %d FRM_TRUNC q_id %d\n",
++                              __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
++                              EF_EVENT_RX_DISCARD_Q_ID(*ev) );
++                      NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_frm_trunc);
++              } else if (EF_EVENT_RX_DISCARD_TYPE(*ev) 
++                        == EF_EVENT_RX_DISCARD_OTHER) {
++                      DPRINTK("%s: " EF_EVENT_FMT 
++                              " buffer %d RX_DISCARD_OTHER q_id %d\n",
++                              __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
++                              EF_EVENT_RX_DISCARD_Q_ID(*ev) );
++                      NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_discard_other);
++              } else if (EF_EVENT_RX_DISCARD_TYPE(*ev) ==
++                         EF_EVENT_RX_DISCARD_CSUM_BAD) {
++                      DPRINTK("%s: " EF_EVENT_FMT 
++                              " buffer %d DISCARD CSUM_BAD q_id %d\n",
++                              __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
++                              EF_EVENT_RX_DISCARD_Q_ID(*ev) );
++                      NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_csum_bad);
++              } else if (EF_EVENT_RX_DISCARD_TYPE(*ev) ==
++                         EF_EVENT_RX_DISCARD_CRC_BAD) {
++                      DPRINTK("%s: " EF_EVENT_FMT 
++                              " buffer %d DISCARD CRC_BAD q_id %d\n",
++                              __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
++                              EF_EVENT_RX_DISCARD_Q_ID(*ev) );
++                      NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_crc_bad);
++              } else {
++                      BUG_ON(EF_EVENT_RX_DISCARD_TYPE(*ev) !=
++                             EF_EVENT_RX_DISCARD_RIGHTS);
++                      DPRINTK("%s: " EF_EVENT_FMT 
++                              " buffer %d DISCARD RIGHTS q_id %d\n",
++                              __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
++                              EF_EVENT_RX_DISCARD_Q_ID(*ev) );
++                      NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_rights_bad);
++              }
++      }
++
++      /* discard type drops through here */
++
++bad_packet:
++      /* Release the socket buffer we already had */
++      discard_jumbo_state(vnic);
++
++missing_head:
++      /* Both entry routes leave no reassembly pending at this point. */
++      BUG_ON(vnic->jumbo_state.in_progress != 0);
++      BUG_ON(vnic->jumbo_state.skb != NULL);
++
++      /* Re-check the id: the invalid-id path also arrives here. */
++      if (id >= 0 && id < bufinfo->npages*NETFRONT_ACCEL_BUFS_PER_PAGE)
++              /* Put the buffer back in the DMA queue. */
++              netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
++
++      vnic->netdev_stats.fastpath_rx_errors++;
++
++      DPRINTK("%s experienced bad packet/missing fragment error: %d \n",
++              __FUNCTION__, ev->rx.flags);
++
++      return 0;
++}
++
++
++/*
++ * Retry the skb parked in vnic->tx_skb, if there is one.  Unless the
++ * retry reports BUSY again, clear tx_skb and wake the netfront queue
++ * if it is ready.
++ *
++ * The caller holds the vnic tx_lock, which is sufficient to exclude
++ * writes to tx_skb.
++ */
++static void netfront_accel_vi_not_busy(netfront_accel_vnic *vnic)
++{
++      struct netfront_info *np = ((struct netfront_info *)
++                                  netdev_priv(vnic->net_dev));
++      unsigned long flags;
++      int status;
++
++      if (vnic->tx_skb == NULL)
++              return;
++
++      DPRINTK("%s trying to send spare buffer\n", __FUNCTION__);
++
++      status = netfront_accel_vi_tx_post(vnic, vnic->tx_skb);
++
++      /* Still busy: leave the skb parked for the next completion. */
++      if (status == NETFRONT_ACCEL_STATUS_BUSY)
++              return;
++
++      DPRINTK("%s restarting tx\n", __FUNCTION__);
++
++      /* Writing tx_skb needs the netfront tx_lock as well as ours. */
++      spin_lock_irqsave(&np->tx_lock, flags);
++
++      vnic->tx_skb = NULL;
++
++      if (netfront_check_queue_ready(vnic->net_dev)) {
++              netif_wake_queue(vnic->net_dev);
++              NETFRONT_ACCEL_STATS_OP
++                      (vnic->stats.queue_wakes++);
++      }
++
++      spin_unlock_irqrestore(&np->tx_lock, flags);
++
++      /*
++       * CANT is not expected here: it is checked for before a packet
++       * is ever reported BUSY the first time round.
++       */
++      BUG_ON(status == NETFRONT_ACCEL_STATUS_CANT);
++}
++
++
++/*
++ * Release every buffer in the chain starting at tso_buf, freeing the
++ * skb attached to any of them.  One completion arrives per
++ * ef_vi_transmitv() call, so the whole chain belongs to one packet.
++ * When is_last is set, a pending tx_skb is retried afterwards.
++ */
++static void netfront_accel_vi_tx_complete(netfront_accel_vnic *vnic, 
++                                        struct netfront_accel_tso_buffer *tso_buf,
++                                        int is_last)
++{
++      struct netfront_accel_tso_buffer *following;
++
++      for (; tso_buf != NULL; tso_buf = following) {
++              struct sk_buff *owner = tso_buf->buf->skb;
++
++              if (owner != NULL) {
++                      dev_kfree_skb_any(owner);
++                      tso_buf->buf->skb = NULL;
++              }
++
++              /* Read the link before the buffer goes back to the pool. */
++              following = tso_buf->next;
++
++              netfront_accel_buf_put(vnic->tx_bufs, tso_buf->buf->buf_id);
++      }
++
++      /*
++       * Last completion of the batch: buffers and descriptors should
++       * now be available, so try to send any pending tx_skb.
++       */
++      if (is_last)
++              netfront_accel_vi_not_busy(vnic);
++}
++
++
++/*
++ * Handle one TX completion event: unbundle it into request ids and
++ * complete each corresponding buffer chain, holding the vnic tx_lock
++ * throughout.
++ */
++static void netfront_accel_vi_poll_process_tx(netfront_accel_vnic *vnic,
++                                            ef_event *ev)
++{
++      ef_request_id done[EF_VI_TRANSMIT_BATCH];
++      struct netfront_accel_tso_buffer *meta;
++      struct netfront_accel_pkt_desc *pkt;
++      unsigned long flags;
++      int idx, count;
++
++      /* Request ids covered by this completion event */
++      count = ef_vi_transmit_unbundle(&vnic->vi, ev, done);
++
++      spin_lock_irqsave(&vnic->tx_lock, flags);
++
++      idx = 0;
++      while (idx < count) {
++              VPRINTK("Tx packet %d complete\n", done[idx]);
++              pkt = netfront_accel_buf_find(vnic->tx_bufs, done[idx]);
++              NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_completions++);
++
++              /* The bookkeeping record sits at the end of the buffer. */
++              meta = (struct netfront_accel_tso_buffer *)
++                      (pkt->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH);
++              BUG_ON(meta->buf != pkt);
++
++              /* After the increment, idx == count marks the final id. */
++              idx++;
++              netfront_accel_vi_tx_complete(vnic, meta, idx == count);
++      }
++
++      spin_unlock_irqrestore(&vnic->tx_lock, flags);
++}
++
++
++/*
++ * Poll the event queue and dispatch RX, TX and RX_NO_DESC_TRUNC
++ * events until the queue is empty or rx_packets packets have been
++ * delivered.  rx_packets must be greater than zero.
++ *
++ * Returns the number of RX packets delivered.
++ */
++int netfront_accel_vi_poll(netfront_accel_vnic *vnic, int rx_packets)
++{
++      ef_event ev[ACCEL_VI_POLL_EVENTS];
++      int rx_remain = rx_packets, rc, events, i;
++#if NETFRONT_ACCEL_STATS
++      int n_evs_polled = 0, rx_evs_polled = 0, tx_evs_polled = 0;
++#endif
++      BUG_ON(rx_packets <= 0);
++
++      /*
++       * Never fetch more events than the remaining RX budget: each
++       * RX event consumes at most one unit of it.
++       */
++      events = ef_eventq_poll(&vnic->vi, ev, 
++                              min(rx_remain, ACCEL_VI_POLL_EVENTS));
++      i = 0;
++      NETFRONT_ACCEL_STATS_OP(n_evs_polled += events);
++
++      VPRINTK("%s: %d events\n", __FUNCTION__, events);
++
++      /* Loop over each event */
++      while (events) {
++              VPRINTK("%s: Event "EF_EVENT_FMT", index %lu\n", __FUNCTION__, 
++                      EF_EVENT_PRI_ARG(ev[i]),        
++                      (unsigned long)(vnic->vi.evq_state->evq_ptr));
++
++              if ((EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_RX) ||
++                  (EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_RX_DISCARD)) {
++                      /* rc is 1 if a packet was delivered, else 0 */
++                      rc = netfront_accel_vi_poll_process_rx(vnic, &ev[i]);
++                      rx_remain -= rc;
++                      BUG_ON(rx_remain < 0);
++                      NETFRONT_ACCEL_STATS_OP(rx_evs_polled++);
++              } else if (EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_TX) {
++                      netfront_accel_vi_poll_process_tx(vnic, &ev[i]);
++                      NETFRONT_ACCEL_STATS_OP(tx_evs_polled++);
++              } else if (EF_EVENT_TYPE(ev[i]) == 
++                         EF_EVENT_TYPE_RX_NO_DESC_TRUNC) {
++                      DPRINTK("%s: RX_NO_DESC_TRUNC " EF_EVENT_FMT "\n",
++                              __FUNCTION__, EF_EVENT_PRI_ARG(ev[i]));
++                      /* Any packet being reassembled is abandoned */
++                      discard_jumbo_state(vnic);
++                      NETFRONT_ACCEL_STATS_OP(vnic->stats.rx_no_desc_trunc++);
++              } else {
++                      EPRINTK("Unexpected event " EF_EVENT_FMT "\n", 
++                              EF_EVENT_PRI_ARG(ev[i]));
++                      NETFRONT_ACCEL_STATS_OP(vnic->stats.bad_event_count++);
++              }
++
++              i++;
++
++              /* Carry on round the loop if more events and more space */
++              if (i == events) {
++                      if (rx_remain == 0)
++                              break;
++
++                      /* Batch consumed: fetch another; 0 ends the loop */
++                      events = ef_eventq_poll(&vnic->vi, ev, 
++                                              min(rx_remain, 
++                                                  ACCEL_VI_POLL_EVENTS));
++                      i = 0;
++                      NETFRONT_ACCEL_STATS_OP(n_evs_polled += events);
++              }
++      }
++      
++#if NETFRONT_ACCEL_STATS
++      /* Fold this poll's counts into the totals and per-poll maxima */
++      vnic->stats.event_count += n_evs_polled;
++      vnic->stats.event_count_since_irq += n_evs_polled;
++      if (n_evs_polled > vnic->stats.events_per_poll_max)
++              vnic->stats.events_per_poll_max = n_evs_polled;
++      if (rx_evs_polled > vnic->stats.events_per_poll_rx_max)
++              vnic->stats.events_per_poll_rx_max = rx_evs_polled;
++      if (tx_evs_polled > vnic->stats.events_per_poll_tx_max)
++              vnic->stats.events_per_poll_tx_max = tx_evs_polled;
++#endif
++
++      return rx_packets - rx_remain;
++}
++
++
++/*
++ * Ask for a wakeup on the next event.  If an event is already
++ * waiting, nothing is requested and 0 is returned; otherwise the
++ * masked event queue pointer, shifted right by 3, is written to the
++ * evq_rptr location and 1 is returned.
++ */
++int netfront_accel_vi_enable_interrupts(netfront_accel_vnic *vnic)
++{
++      u32 wake_ptr;
++
++      VPRINTK("%s: checking for event on %p\n", __FUNCTION__, &vnic->vi.evq_state);
++
++      BUG_ON(vnic == NULL);
++      BUG_ON(vnic->vi.evq_state == NULL);
++
++      if (!ef_eventq_has_event(&vnic->vi)) {
++              VPRINTK("evq_ptr=0x%08x  evq_mask=0x%08x\n",
++                      vnic->evq_state.evq_ptr, vnic->vi.evq_mask);
++
++              /* Queue offset at which the wakeup is wanted */
++              wake_ptr = vnic->evq_state.evq_ptr & vnic->vi.evq_mask;
++
++              BUG_ON(vnic->hw.falcon.evq_rptr == NULL);
++
++              VPRINTK("Requesting wakeup at 0x%08x, rptr %p\n", wake_ptr,
++                      vnic->hw.falcon.evq_rptr);
++              *(volatile u32 *)(vnic->hw.falcon.evq_rptr) = (wake_ptr >> 3);
++
++              return 1;
++      }
++
++      /* An event is already pending: no wakeup needed. */
++      VPRINTK("%s: found event\n",  __FUNCTION__);
++      return 0;
++}
diff --cc drivers/xen/sfc_netfront/accel_xenbus.c

index 0000000,0000000..98d5334

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_xenbus.c
@@@ -1,0 -1,0 +1,775 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <linux/stddef.h>
++#include <linux/errno.h>
++
++#include <xen/xenbus.h>
++#include <xen/evtchn.h>
++#include <xen/gnttab.h>
++
++#include "accel.h"
++#include "accel_util.h"
++#include "accel_msg_iface.h"
++#include "accel_bufs.h"
++#include "accel_ssr.h"
++/* drivers/xen/netfront/netfront.h */
++#include "netfront.h"
++
++/*
++ * Record XenbusStateClosing as the vnic's frontend state and pass the
++ * same state to net_accel_update_state() for the vnic's xenbus device.
++ */
++void netfront_accel_set_closing(netfront_accel_vnic *vnic)
++{
++      const XenbusState closing = XenbusStateClosing;
++
++      vnic->frontend_state = closing;
++      net_accel_update_state(vnic->dev, closing);
++}
++      
++
++/*
++ * Xenbus watch callback for the MAC address watch: re-read the MAC
++ * address into the vnic that embeds this watch.  A failed read is only
++ * logged.
++ */
++static void mac_address_change(struct xenbus_watch *watch,
++                             const char **vec, unsigned int len)
++{
++      netfront_accel_vnic *vnic =
++              container_of(watch, netfront_accel_vnic, mac_address_watch);
++      int rc;
++
++      DPRINTK("%s\n", __FUNCTION__);
++
++      rc = net_accel_xen_net_read_mac(vnic->dev, vnic->mac);
++      if (rc == 0)
++              return;
++
++      EPRINTK("%s: failed to read mac (%d)\n", __FUNCTION__, rc);
++}
++
++
++/*
++ * Register mac_address_change() as a xenbus watch on <nodename>/mac.
++ *
++ * Returns 0 on success.  On failure the error is logged, the watch's
++ * node pointer is cleared and the xenbus_watch_path2() error is returned.
++ */
++static int setup_mac_address_watch(struct xenbus_device *dev,
++                                 netfront_accel_vnic *vnic)
++{
++      int rc;
++
++      DPRINTK("Setting watch on %s/%s\n", dev->nodename, "mac");
++
++      rc = xenbus_watch_path2(dev, dev->nodename, "mac",
++                              &vnic->mac_address_watch,
++                              mac_address_change);
++      if (!rc)
++              return 0;
++
++      EPRINTK("%s: Failed to register xenbus watch: %d\n",
++              __FUNCTION__, rc);
++      vnic->mac_address_watch.node = NULL;
++      return rc;
++}
++
++
++/*
++ * Grant access to a page and publish the grant reference through xenbus
++ * as <dev->nodename>/<name>.
++ *
++ * Returns 0 on success and stores the reference in *gnt_ref.  On failure
++ * returns the error from the failing call, drops the grant again and
++ * leaves *gnt_ref untouched.
++ */
++static int make_named_grant(struct xenbus_device *dev, void *page,
++                          const char *name, grant_ref_t *gnt_ref)
++{
++      struct xenbus_transaction tr;
++      int err;
++      /*
++       * Deliberately a signed int: grant_ref_t is an unsigned type in
++       * the Xen headers, so storing the result in one would make the
++       * "< 0" failure check below impossible to trigger.
++       */
++      int gnt;
++
++      gnt = net_accel_grant_page(dev, virt_to_mfn(page), 0);
++      if (gnt < 0)
++              return gnt;
++
++      do {
++              err = xenbus_transaction_start(&tr);
++              if (err != 0) {
++                      EPRINTK("%s: transaction start failed %d\n",
++                              __FUNCTION__, err);
++                      goto fail;
++              }
++              err = xenbus_printf(tr, dev->nodename, name, "%d", gnt);
++              if (err != 0) {
++                      EPRINTK("%s: xenbus_printf failed %d\n", __FUNCTION__,
++                              err);
++                      /* Abort the transaction; nothing gets published. */
++                      xenbus_transaction_end(tr, 1);
++                      goto fail;
++              }
++              err = xenbus_transaction_end(tr, 0);
++      } while (err == -EAGAIN);
++
++      if (err != 0) {
++              EPRINTK("%s: transaction end failed %d\n", __FUNCTION__, err);
++              goto fail;
++      }
++
++      *gnt_ref = gnt;
++
++      return 0;
++
++fail:
++      /*
++       * The reference was never handed to the caller, so release it here
++       * rather than leaving the page granted.
++       */
++      net_accel_ungrant_page(gnt);
++      return err;
++}
++
++
++/*
++ * Drop a grant reference and remove its <dev->nodename>/<name> entry
++ * through xenbus.  The grant is released first, regardless of whether the
++ * xenbus removal then succeeds.
++ *
++ * Returns 0 on success or the error from the failing xenbus call.
++ */
++static int remove_named_grant(struct xenbus_device *dev,
++                            const char *name, grant_ref_t gnt_ref)
++{
++      struct xenbus_transaction tr;
++      int rc;
++
++      net_accel_ungrant_page(gnt_ref);
++
++      /* Retry the whole transaction for as long as ending it says EAGAIN. */
++      for (;;) {
++              rc = xenbus_transaction_start(&tr);
++              if (rc) {
++                      EPRINTK("%s: transaction start failed %d\n",
++                              __FUNCTION__, rc);
++                      return rc;
++              }
++
++              rc = xenbus_rm(tr, dev->nodename, name);
++              if (rc) {
++                      EPRINTK("%s: xenbus_rm failed %d\n", __FUNCTION__,
++                              rc);
++                      xenbus_transaction_end(tr, 1);
++                      return rc;
++              }
++
++              rc = xenbus_transaction_end(tr, 0);
++              if (rc != -EAGAIN)
++                      break;
++      }
++
++      if (rc)
++              EPRINTK("%s: transaction end failed %d\n", __FUNCTION__, rc);
++
++      return rc;
++}
++
++
++/*
++ * Allocate and initialise the accelerated vnic state for a netfront
++ * device and attach it to the device's private data (np->accel_priv).
++ *
++ * Returns the new vnic on success, or an ERR_PTR():
++ *   -EALREADY if np->accel_priv is already set,
++ *   -ENOMEM   if the vnic cannot be allocated,
++ *   otherwise the error from reading the MAC address or from registering
++ *   the MAC address watch, after undoing the initialisation done here.
++ */
++static 
++netfront_accel_vnic *netfront_accel_vnic_ctor(struct net_device *net_dev,
++                                            struct xenbus_device *dev)
++{
++      struct netfront_info *np =
++              (struct netfront_info *)netdev_priv(net_dev);
++      netfront_accel_vnic *vnic;
++      int err;
++
++      /*
++       * A bug in earlier versions of Xen accel plugin system meant
++       * you could be probed twice for the same device on suspend
++       * cancel.  Be tolerant of that.
++       */ 
++      if (np->accel_priv != NULL)
++              return ERR_PTR(-EALREADY);
++
++      /* Alloc mem for state */
++      vnic = kzalloc(sizeof(netfront_accel_vnic), GFP_KERNEL);
++      if (vnic == NULL) {
++              EPRINTK("%s: no memory for vnic state\n", __FUNCTION__);
++              return ERR_PTR(-ENOMEM);
++      }
++
++      spin_lock_init(&vnic->tx_lock);
++
++      /* The rest of the state is filled in with the vnic mutex held. */
++      mutex_init(&vnic->vnic_mutex);
++      mutex_lock(&vnic->vnic_mutex);
++
++      /* Store so state can be retrieved from device */
++      BUG_ON(np->accel_priv != NULL);
++      np->accel_priv = vnic;
++      vnic->dev = dev;
++      vnic->net_dev = net_dev;
++      spin_lock_init(&vnic->irq_enabled_lock);
++      netfront_accel_ssr_init(&vnic->ssr_state);
++
++      init_waitqueue_head(&vnic->state_wait_queue);
++      vnic->backend_state = XenbusStateUnknown;
++      vnic->frontend_state = XenbusStateClosed;
++      vnic->removing = 0;
++      vnic->domU_state_is_setup = 0;
++      vnic->dom0_state_is_setup = 0;
++      vnic->poll_enabled = 0;
++      vnic->tx_enabled = 0;
++      vnic->tx_skb = NULL;
++
++      /* INIT_WORK() takes an extra data argument on kernels before 2.6.20. */
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
++      INIT_WORK(&vnic->msg_from_bend, netfront_accel_msg_from_bend);
++#else
++      INIT_WORK(&vnic->msg_from_bend, netfront_accel_msg_from_bend, vnic);
++#endif
++
++      netfront_accel_debugfs_create(vnic);
++
++      mutex_unlock(&vnic->vnic_mutex);
++
++      err = net_accel_xen_net_read_mac(dev, vnic->mac);
++      if (err) 
++              goto fail_mac;
++
++      /* Setup a watch on the frontend's MAC address */
++      err = setup_mac_address_watch(dev, vnic);
++      if (err)
++              goto fail_mac;
++
++      return vnic;
++
++fail_mac:
++
++      /*
++       * Undo the initialisation above.  No watch has been registered on
++       * either path that reaches this label, so none is removed here.
++       */
++      mutex_lock(&vnic->vnic_mutex);
++
++      netfront_accel_debugfs_remove(vnic);
++
++      netfront_accel_ssr_fini(vnic, &vnic->ssr_state);
++
++      EPRINTK_ON(vnic->tx_skb != NULL);
++
++      vnic->frontend_state = XenbusStateUnknown;
++      net_accel_update_state(dev, XenbusStateUnknown);
++
++      mutex_unlock(&vnic->vnic_mutex);
++
++      /* Detach from the netfront private data before freeing the state. */
++      np->accel_priv = NULL;
++      kfree(vnic);
++
++      return ERR_PTR(err);
++}
++
++
++/*
++ * Tear down a vnic created by netfront_accel_vnic_ctor(): remove the MAC
++ * address watch, flush the accel workqueue, undo the debugfs and SSR
++ * setup, report XenbusStateUnknown, detach the vnic from the netfront
++ * private data and free it.  Must be called without the vnic mutex held,
++ * as it takes the mutex itself.
++ */
++static void netfront_accel_vnic_dtor(netfront_accel_vnic *vnic)
++{
++      struct net_device *net_dev = vnic->net_dev;
++      struct netfront_info *np = 
++              (struct netfront_info *)netdev_priv(net_dev);
++
++      /*
++       * Now we don't hold the lock any more it is safe to remove
++       * this watch and synchronise with the completion of
++       * watches
++       */
++      DPRINTK("%s: unregistering xenbus mac watch\n", __FUNCTION__);
++      unregister_xenbus_watch(&vnic->mac_address_watch);
++      kfree(vnic->mac_address_watch.node);
++
++      /* Flush the accel workqueue before the vnic is torn down and freed. */
++      flush_workqueue(netfront_accel_workqueue);
++
++      mutex_lock(&vnic->vnic_mutex);
++
++      netfront_accel_debugfs_remove(vnic);
++
++      netfront_accel_ssr_fini(vnic, &vnic->ssr_state);
++
++      /* Log if a tx skb is still recorded at teardown. */
++      EPRINTK_ON(vnic->tx_skb != NULL);
++
++      vnic->frontend_state = XenbusStateUnknown;
++      net_accel_update_state(vnic->dev, XenbusStateUnknown);
++
++      mutex_unlock(&vnic->vnic_mutex);
++
++      /* Detach from the netfront private data before freeing the state. */
++      np->accel_priv = NULL;
++      kfree(vnic);
++}
++
++
++/*
++ * Set up the domU side of the state shared with the backend: tx/rx
++ * buffer managers, the two-page shared area (control page plus message
++ * queue page), grants for both pages, the message and net event
++ * channels, and the xenbus entries that advertise the channels.
++ *
++ * Returns 0 on success.  On failure returns the error from the failing
++ * step after releasing, in reverse order, everything acquired so far.
++ */
++static int vnic_setup_domU_shared_state(struct xenbus_device *dev,
++                                      netfront_accel_vnic *vnic)
++{
++      struct xenbus_transaction tr;
++      int err;
++      int msgs_per_queue;
++
++
++      DPRINTK("Setting up domU shared state.\n");
++
++      /* Each message queue occupies half of the message page. */
++      msgs_per_queue = (PAGE_SIZE/2) / sizeof(struct net_accel_msg);
++
++      /* Allocate buffer state */
++      vnic->tx_bufs = netfront_accel_init_bufs(&vnic->tx_lock);
++      if (vnic->tx_bufs == NULL) {
++              err = -ENOMEM;
++              EPRINTK("%s: Failed to allocate tx buffers\n", __FUNCTION__);
++              goto fail_tx_bufs;
++      }
++
++      vnic->rx_bufs = netfront_accel_init_bufs(NULL);
++      if (vnic->rx_bufs == NULL) {
++              err = -ENOMEM;
++              EPRINTK("%s: Failed to allocate rx buffers\n", __FUNCTION__);
++              goto fail_rx_bufs;
++      }
++
++      /*
++       * This allocates two pages, one for the shared page and one
++       * for the message queue.
++       */
++      vnic->shared_page = (struct net_accel_shared_page *)
++              __get_free_pages(GFP_KERNEL, 1);
++      if (vnic->shared_page == NULL) {
++              EPRINTK("%s: no memory for shared pages\n", __FUNCTION__);
++              err = -ENOMEM;
++              goto fail_shared_page;
++      }
++
++      /* from_dom0 uses the first half of the second page ... */
++      net_accel_msg_init_queue
++              (&vnic->from_dom0, &vnic->shared_page->queue0,
++               (struct net_accel_msg *)((u8*)vnic->shared_page + PAGE_SIZE),
++               msgs_per_queue);
++
++      /* ... and to_dom0 uses the second half. */
++      net_accel_msg_init_queue
++              (&vnic->to_dom0, &vnic->shared_page->queue1,
++               (struct net_accel_msg *)((u8*)vnic->shared_page +
++                                        (3 * PAGE_SIZE / 2)),
++               msgs_per_queue);
++
++      vnic->msg_state = NETFRONT_ACCEL_MSG_NONE;
++
++      err = make_named_grant(dev, vnic->shared_page, "accel-ctrl-page",
++                             &vnic->ctrl_page_gnt);
++      if (err) {
++              EPRINTK("couldn't make ctrl-page named grant\n");
++              goto fail_ctrl_page_grant;
++      }
++
++      err = make_named_grant(dev, (u8*)vnic->shared_page + PAGE_SIZE,
++                             "accel-msg-page", &vnic->msg_page_gnt);
++      if (err) {
++              EPRINTK("couldn't make msg-page named grant\n");
++              goto fail_msg_page_grant;
++      }
++
++      /* Create xenbus msg event channel */
++      err = bind_listening_port_to_irqhandler
++              (dev->otherend_id, netfront_accel_msg_channel_irq_from_bend,
++               IRQF_SAMPLE_RANDOM, "vnicctrl", vnic);
++      if (err < 0) {
++              EPRINTK("Couldn't bind msg event channel\n");
++              goto fail_msg_irq;
++      }
++      /* A non-negative return value is the bound irq number. */
++      vnic->msg_channel_irq = err;
++      vnic->msg_channel = irq_to_evtchn_port(vnic->msg_channel_irq);
++
++      /* Create xenbus net event channel */
++      err = bind_listening_port_to_irqhandler
++              (dev->otherend_id, netfront_accel_net_channel_irq_from_bend,
++               IRQF_SAMPLE_RANDOM, "vnicfront", vnic);
++      if (err < 0) {
++              EPRINTK("Couldn't bind net event channel\n");
++              goto fail_net_irq;
++      }
++      vnic->net_channel_irq = err;
++      vnic->net_channel = irq_to_evtchn_port(vnic->net_channel_irq);
++      /* Want to ensure we don't get interrupts before we're ready */
++      netfront_accel_disable_net_interrupts(vnic);
++
++      DPRINTK("otherend %d has msg ch %u (%u) and net ch %u (%u)\n",
++              dev->otherend_id, vnic->msg_channel, vnic->msg_channel_irq,
++              vnic->net_channel, vnic->net_channel_irq);
++
++      /* Publish both event channels; retry while the commit says EAGAIN. */
++      do {
++              err = xenbus_transaction_start(&tr);
++              if (err != 0) {
++                      EPRINTK("%s: Transaction start failed %d\n",
++                              __FUNCTION__, err);
++                      goto fail_transaction;
++              }
++
++              err = xenbus_printf(tr, dev->nodename, "accel-msg-channel",
++                                  "%u", vnic->msg_channel);
++              if (err != 0) {
++                      EPRINTK("%s: event channel xenbus write failed %d\n",
++                              __FUNCTION__, err);
++                      xenbus_transaction_end(tr, 1);
++                      goto fail_transaction;
++              }
++
++              err = xenbus_printf(tr, dev->nodename, "accel-net-channel",
++                                  "%u", vnic->net_channel);
++              if (err != 0) {
++                      EPRINTK("%s: net channel xenbus write failed %d\n",
++                              __FUNCTION__, err);
++                      xenbus_transaction_end(tr, 1);
++                      goto fail_transaction;
++              }
++
++              err = xenbus_transaction_end(tr, 0);
++      } while (err == -EAGAIN);
++
++      if (err != 0) {
++              EPRINTK("%s: Transaction end failed %d\n", __FUNCTION__, err);
++              goto fail_transaction;
++      }
++
++      DPRINTK("Completed setting up domU shared state\n");
++
++      return 0;
++
++      /*
++       * Error unwinding: each label releases the resource acquired just
++       * before the step that jumps to it, then falls through to release
++       * everything acquired earlier.
++       */
++fail_transaction:
++
++      unbind_from_irqhandler(vnic->net_channel_irq, vnic);
++fail_net_irq:
++
++      unbind_from_irqhandler(vnic->msg_channel_irq, vnic);
++fail_msg_irq:
++
++      /* Only reached once the msg-page grant exists. */
++      remove_named_grant(dev, "accel-msg-page", vnic->msg_page_gnt);
++fail_msg_page_grant:
++
++      /* The ctrl-page grant exists even when the msg-page grant failed. */
++      remove_named_grant(dev, "accel-ctrl-page", vnic->ctrl_page_gnt);
++fail_ctrl_page_grant:
++
++      free_pages((unsigned long)vnic->shared_page, 1);
++      vnic->shared_page = NULL;
++fail_shared_page:
++
++      netfront_accel_fini_bufs(vnic->rx_bufs);
++fail_rx_bufs:
++
++      netfront_accel_fini_bufs(vnic->tx_bufs);
++fail_tx_bufs:
++
++      /*
++       * Undo the memory allocation created when we got the HELLO.
++       *
++       * NOTE(review): rx_bufs/tx_bufs may already have been passed to
++       * netfront_accel_fini_bufs() (or be NULL) at this point - confirm
++       * that netfront_accel_free_buffer_mem() tolerates that.
++       */
++      netfront_accel_free_buffer_mem(&vnic->bufpages,
++                                     vnic->rx_bufs,
++                                     vnic->tx_bufs);
++
++      DPRINTK("Failed to setup domU shared state with code %d\n", err);
++
++      return err;
++}
++
++
++/*
++ * Release everything vnic_setup_domU_shared_state() set up: the xenbus
++ * event-channel entries, both irq handlers, the two page grants, the
++ * shared pages and the buffer memory and managers.  Failures of the
++ * individual xenbus operations are not reported to the caller.
++ */
++static void vnic_remove_domU_shared_state(struct xenbus_device *dev, 
++                                        netfront_accel_vnic *vnic)
++{
++      struct xenbus_transaction tr;
++      
++      /*
++       * Don't remove any watches because we currently hold the
++       * mutex and the watches take the mutex.
++       */
++
++      DPRINTK("%s: removing event channel irq handlers %d %d\n",
++              __FUNCTION__, vnic->net_channel_irq, vnic->msg_channel_irq);
++      /*
++       * Remove the published channel entries; give up if a transaction
++       * cannot be started, retry while ending it returns -EAGAIN.
++       */
++      do {
++              if (xenbus_transaction_start(&tr) != 0)
++                      break;
++              xenbus_rm(tr, dev->nodename, "accel-msg-channel");
++              xenbus_rm(tr, dev->nodename, "accel-net-channel");
++      } while (xenbus_transaction_end(tr, 0) == -EAGAIN);
++
++      unbind_from_irqhandler(vnic->net_channel_irq, vnic);
++      unbind_from_irqhandler(vnic->msg_channel_irq, vnic);
++
++      /* ungrant pages for msg channel */
++      remove_named_grant(dev, "accel-ctrl-page", vnic->ctrl_page_gnt);
++      remove_named_grant(dev, "accel-msg-page", vnic->msg_page_gnt);
++      /* Order-1 free matches the two-page allocation made at setup. */
++      free_pages((unsigned long)vnic->shared_page, 1);
++      vnic->shared_page = NULL;
++
++      /* ungrant pages for buffers, and free buffer memory */
++      netfront_accel_free_buffer_mem(&vnic->bufpages,
++                                     vnic->rx_bufs,
++                                     vnic->tx_bufs);
++      netfront_accel_fini_bufs(vnic->rx_bufs);
++      netfront_accel_fini_bufs(vnic->tx_bufs);
++}
++
++
++/*
++ * Set up the state that depends on the backend being connected:
++ * construct the virtual interface and queue the message-processing work
++ * item.  The dev argument is not used here.
++ */
++static void vnic_setup_dom0_shared_state(struct xenbus_device *dev,
++                                      netfront_accel_vnic *vnic)
++{
++      DPRINTK("Setting up dom0 shared state\n");
++
++      netfront_accel_vi_ctor(vnic);
++
++      /*
++       * Message processing will be enabled when this function
++       * returns, but we might have missed an interrupt.  Schedule a
++       * check just in case.
++       */
++      queue_work(netfront_accel_workqueue, &vnic->msg_from_bend);
++}
++
++
++/*
++ * Counterpart of vnic_setup_dom0_shared_state(): stop the fast path and
++ * then destroy the virtual interface.  The dev argument is not used here.
++ */
++static void vnic_remove_dom0_shared_state(struct xenbus_device *dev,
++                                        netfront_accel_vnic *vnic)
++{
++      DPRINTK("Removing dom0 shared state\n");
++
++      /* Stop the fast path before the interface it uses is destroyed. */
++      vnic_stop_fastpath(vnic);
++
++      netfront_accel_vi_dtor(vnic);
++}
++
++
++/*************************************************************************/
++
++/*
++ * The following code handles accelstate changes between the frontend
++ * and the backend.  In response to transitions, calls the following
++ * functions in matching pairs:
++ *
++ *   vnic_setup_domU_shared_state
++ *   vnic_remove_domU_shared_state
++ *
++ *   vnic_setup_dom0_shared_state
++ *   vnic_remove_dom0_shared_state
++ *
++ * Valid state transitions for DomU are as follows:
++ *
++ * Closed->Init       on probe or in response to Init from dom0
++ *
++ * Init->Connected    in response to Init from dom0
++ * Init->Closing      on error providing dom0 is in Init
++ * Init->Closed       on remove or in response to Closing from dom0
++ *
++ * Connected->Closing on error/remove
++ * Connected->Closed  in response to Closing from dom0
++ *
++ * Closing->Closed    in response to Closing from dom0
++ *
++ */
++
++
++/*
++ * Function to deal with Xenbus accel state change in backend.
++ *
++ * Records the new backend state, sets up or removes the domU/dom0 shared
++ * state as the transition requires, moves the frontend state to match
++ * and wakes anyone sleeping on the vnic's state wait queue.  A repeated
++ * notification of the current backend state is ignored.
++ *
++ * The visible caller invokes this with the vnic mutex held.
++ */
++static void netfront_accel_backend_accel_changed(netfront_accel_vnic *vnic,
++                                               XenbusState backend_state)
++{
++      struct xenbus_device *dev = vnic->dev;
++      XenbusState frontend_state;
++      int state;
++
++      DPRINTK("%s: changing from %s to %s. nodename %s, otherend %s\n",
++              __FUNCTION__, xenbus_strstate(vnic->backend_state),
++              xenbus_strstate(backend_state), dev->nodename, dev->otherend);
++
++      /*
++       * Ignore duplicate state changes.  This can happen if the
++       * backend changes state twice in quick succession and the
++       * first watch fires in the frontend after the second
++       * transition has completed.
++       */
++      if (vnic->backend_state == backend_state)
++              return;
++
++      vnic->backend_state = backend_state;
++      frontend_state = vnic->frontend_state;
++
++      switch (backend_state) {
++      case XenbusStateInitialising:
++              /*
++               * It's possible for us to miss the closed state from
++               * dom0, so do the work here.
++               */
++              if (vnic->domU_state_is_setup) {
++                      vnic_remove_domU_shared_state(dev, vnic);
++                      vnic->domU_state_is_setup = 0;
++              }
++
++              if (frontend_state != XenbusStateInitialising) {
++                      /* Make sure the backend doesn't go away. */
++                      frontend_state = XenbusStateInitialising;
++                      net_accel_update_state(dev, frontend_state);
++                      /*
++                       * Re-read the backend state.  Default to Unknown
++                       * so that a failed read is treated as "backend no
++                       * longer Initialising" instead of acting on an
++                       * uninitialised value.
++                       */
++                      state = (int)XenbusStateUnknown;
++                      xenbus_scanf(XBT_NIL, dev->otherend, "accelstate",
++                                   "%d", &state);
++                      backend_state = (XenbusState)state;
++                      if (backend_state != XenbusStateInitialising)
++                              break;
++              }
++
++              /* Start the new connection. */
++              if (!vnic->removing) {
++                      BUG_ON(vnic->domU_state_is_setup);
++                      if (vnic_setup_domU_shared_state(dev, vnic) == 0) {
++                              vnic->domU_state_is_setup = 1;
++                              frontend_state = XenbusStateConnected;
++                      } else
++                              frontend_state = XenbusStateClosing;
++              }
++              break;
++      case XenbusStateConnected:
++              /* dom0 state is only set up once, on top of domU state. */
++              if (vnic->domU_state_is_setup &&
++                  !vnic->dom0_state_is_setup) {
++                      vnic_setup_dom0_shared_state(dev, vnic);
++                      vnic->dom0_state_is_setup = 1;
++              }
++              break;
++      default:
++              /* Any state not listed explicitly is handled as Closing. */
++      case XenbusStateClosing:
++              if (vnic->dom0_state_is_setup) {
++                      vnic_remove_dom0_shared_state(dev, vnic);
++                      vnic->dom0_state_is_setup = 0;
++              }
++              frontend_state = XenbusStateClosed;
++              break;
++      case XenbusStateUnknown:
++      case XenbusStateClosed:
++              if (vnic->domU_state_is_setup) {
++                      vnic_remove_domU_shared_state(dev, vnic);
++                      vnic->domU_state_is_setup = 0;
++              }
++              break;
++      }
++
++      if (frontend_state != vnic->frontend_state) {
++              DPRINTK("Switching from state %s (%d) to %s (%d)\n",
++                      xenbus_strstate(vnic->frontend_state),
++                      vnic->frontend_state,
++                      xenbus_strstate(frontend_state), frontend_state);
++              vnic->frontend_state = frontend_state;
++              net_accel_update_state(dev, frontend_state);
++      }
++
++      /* Let waiters on state_wait_queue re-check their condition. */
++      wake_up(&vnic->state_wait_queue);
++}
++
++
++/*
++ * Xenbus watch callback for the backend's "accelstate" node.  With the
++ * vnic mutex held, reads the backend's current accel state and hands it
++ * to netfront_accel_backend_accel_changed().
++ */
++static void backend_accel_state_change(struct xenbus_watch *watch,
++                                     const char **vec, unsigned int len)
++{
++      netfront_accel_vnic *vnic;
++      /* Value passed on if the read below does not store a state. */
++      int state = (int)XenbusStateUnknown;
++
++      DPRINTK("%s\n", __FUNCTION__);
++
++      vnic = container_of(watch, struct netfront_accel_vnic,
++                          backend_accel_watch);
++
++      mutex_lock(&vnic->vnic_mutex);
++
++      xenbus_scanf(XBT_NIL, vnic->dev->otherend, "accelstate", "%d",
++                   &state);
++      netfront_accel_backend_accel_changed(vnic, state);
++
++      mutex_unlock(&vnic->vnic_mutex);
++}
++
++
++/*
++ * Register backend_accel_state_change() as a xenbus watch on
++ * <otherend>/accelstate.
++ *
++ * Returns 0 on success.  On failure the error is logged, the watch's
++ * node pointer is cleared and the xenbus_watch_path2() error is returned.
++ */
++static int setup_dom0_accel_watch(struct xenbus_device *dev,
++                                netfront_accel_vnic *vnic)
++{
++      int rc;
++
++      DPRINTK("Setting watch on %s/%s\n", dev->otherend, "accelstate");
++
++      rc = xenbus_watch_path2(dev, dev->otherend, "accelstate",
++                              &vnic->backend_accel_watch,
++                              backend_accel_state_change);
++      if (!rc)
++              return 0;
++
++      EPRINTK("%s: Failed to register xenbus watch: %d\n",
++              __FUNCTION__, rc);
++      vnic->backend_accel_watch.node = NULL;
++      return rc;
++}
++
++
++/*
++ * Probe entry point: create the vnic for a netfront device, watch the
++ * backend's accel state and, unless the frontend state has already moved
++ * on from Closed, move it to Initialising.
++ *
++ * Returns 0 on success, otherwise the error from constructing the vnic
++ * or from registering the watch (the vnic is destroyed in that case).
++ */
++int netfront_accel_probe(struct net_device *net_dev, struct xenbus_device *dev)
++{
++      netfront_accel_vnic *vnic;
++      int rc;
++
++      DPRINTK("Probe passed device %s\n", dev->nodename);
++
++      vnic = netfront_accel_vnic_ctor(net_dev, dev);
++      if (IS_ERR(vnic))
++              return PTR_ERR(vnic);
++
++      /*
++       * Setup a watch on the backend accel state.  This sets things
++       * going.
++       */
++      rc = setup_dom0_accel_watch(dev, vnic);
++      if (rc != 0)
++              goto fail_watch;
++
++      /*
++       * Indicate to the other end that we're ready to start unless
++       * the watch has already fired.
++       */
++      mutex_lock(&vnic->vnic_mutex);
++      VPRINTK("setup success, updating accelstate\n");
++      if (vnic->frontend_state == XenbusStateClosed) {
++              vnic->frontend_state = XenbusStateInitialising;
++              net_accel_update_state(dev, XenbusStateInitialising);
++      }
++      mutex_unlock(&vnic->vnic_mutex);
++
++      DPRINTK("Probe done device %s\n", dev->nodename);
++
++      return 0;
++
++fail_watch:
++      netfront_accel_vnic_dtor(vnic);
++      EPRINTK("%s: probe failed with code %d\n", __FUNCTION__, rc);
++      return rc;
++}
++
++
++/*
++ * Remove entry point: refuse further connections, start closing an
++ * established one, wait until the domU shared state has been released,
++ * then remove the backend accel watch and destroy the vnic.
++ *
++ * Always returns 0.  Sleeps in wait_event() until domU_state_is_setup is
++ * clear.
++ */
++int netfront_accel_remove(struct xenbus_device *dev)
++{
++      struct netfront_info *np = dev_get_drvdata(&dev->dev);
++      netfront_accel_vnic *vnic = (netfront_accel_vnic *)np->accel_priv;
++
++      DPRINTK("%s %s\n", __FUNCTION__, dev->nodename);
++
++      BUG_ON(vnic == NULL);
++
++      mutex_lock(&vnic->vnic_mutex);
++
++      /* Reject any attempts to connect. */
++      vnic->removing = 1;
++
++      /* Close any existing connection. */
++      if (vnic->frontend_state == XenbusStateConnected) {
++              vnic->frontend_state = XenbusStateClosing;
++              net_accel_update_state(dev, XenbusStateClosing);
++      }
++
++      mutex_unlock(&vnic->vnic_mutex);
++
++      DPRINTK("%s waiting for release of %s\n", __FUNCTION__, dev->nodename);
++
++      /*
++       * Wait for the xenbus watch to release the shared resources.
++       * This indicates that dom0 has made the transition
++       * Closing->Closed or that dom0 was in Closed or Init and no
++       * resources were mapped.
++       */
++      wait_event(vnic->state_wait_queue,
++                 !vnic->domU_state_is_setup);
++
++      /*
++       * Now we don't need this watch anymore it is safe to remove
++       * it (and so synchronise with it completing if outstanding)
++       */
++      DPRINTK("%s: unregistering xenbus accel watch\n",
++              __FUNCTION__);
++      unregister_xenbus_watch(&vnic->backend_accel_watch);
++      kfree(vnic->backend_accel_watch.node);
++
++      /* The dtor takes the vnic mutex itself, so it runs unlocked here. */
++      netfront_accel_vnic_dtor(vnic);
++
++      DPRINTK("%s done %s\n", __FUNCTION__, dev->nodename);
++
++      return 0;
++}
diff --cc drivers/xen/sfc_netfront/ef_vi_falcon.h

index 0000000,0000000..9aaf4ca

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/ef_vi_falcon.h
@@@ -1,0 -1,0 +1,172 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  slp
++ *  \brief  Falcon specific definitions
++ *   \date  2004/08
++ */
++
++#ifndef __EF_VI_FALCON_H__
++#define __EF_VI_FALCON_H__    
++
++#define EFHW_4K               0x00001000u
++#define EFHW_8K               0x00002000u
++
++/* include the autogenerated register definitions */
++
++#include "ef_vi_falcon_core.h"
++#include "ef_vi_falcon_desc.h"
++#include "ef_vi_falcon_event.h"
++
++
++/*----------------------------------------------------------------------------
++ *
++ * Helpers to turn bit shifts into dword shifts and check that the bit fields 
++ * haven't overflown the dword etc. Aim is to preserve consistency with the 
++ * autogenerated headers - once stable we could hard code.
++ *
++ *---------------------------------------------------------------------------*/
++
++/*
++ * mask constructors: a value of type T with the low WIDTH bits set.
++ * WIDTH must be smaller than the bit width of T, otherwise the shift is
++ * undefined.
++ */
++#define __FALCON_MASK(WIDTH,T)  ((((T)1) << (WIDTH)) - 1)
++#define __EFVI_MASK32(WIDTH)  __FALCON_MASK((WIDTH),uint32_t)
++#define __EFVI_MASK64(WIDTH)  __FALCON_MASK((WIDTH),uint64_t)
++
++#define __EFVI_FALCON_MASKFIELD32(LBN, WIDTH)   ((uint32_t)  \
++                             (__EFVI_MASK32(WIDTH) << (LBN)))
++
++/*
++ * constructors for fields which span the first and second dwords.
++ * __LW(LBN) is the number of field bits that land in the lower dword;
++ * LOW() yields that dword's share of v, HIGH() the remaining bits.
++ */
++#define __LW(LBN) (32 - (LBN))
++#define LOW(v, LBN, WIDTH)   ((uint32_t)  \
++                               (((v) & __EFVI_MASK64(__LW((LBN)))) << (LBN)))
++#define HIGH(v, LBN, WIDTH)  ((uint32_t)(((v) >> __LW((LBN))) & \
++                                       __EFVI_MASK64(((WIDTH) - __LW((LBN))))))
++/* constructors for fields within the second dword */
++#define __DW2(LBN)      ((LBN) - 32)
++
++/* constructors for fields which span the second and third dwords */
++#define __LW2(LBN) (64 - (LBN))
++#define LOW2(v, LBN, WIDTH) ((uint32_t) \
++                       (((v) & __EFVI_MASK64(__LW2((LBN)))) << ((LBN) - 32)))
++#define HIGH2(v, LBN, WIDTH)  ((uint32_t) \
++             (((v) >> __LW2((LBN))) & __EFVI_MASK64(((WIDTH) - __LW2((LBN))))))
++
++/* constructors for fields within the third dword */
++#define __DW3(LBN)      ((LBN) - 64)
++
++
++/* constructors for fields which span the third and fourth dwords */
++#define __LW3(LBN) (96 - (LBN))
++#define LOW3(v, LBN, WIDTH)   ((uint32_t)    \
++              (((v) & __EFVI_MASK64(__LW3((LBN)))) << ((LBN) - 64)))
++#define HIGH3(v, LBN, WIDTH)  ((uint32_t)    \
++             (((v) >> __LW3((LBN))) & __EFVI_MASK64(((WIDTH) - __LW3((LBN))))))
++
++/* constructors for fields within the fourth dword */
++#define __DW4(LBN)      ((LBN) - 96)
++
++/* checks that the autogenerated headers are consistent with our model */
++#define WIDTHCHCK(a, b) ef_assert((a) == (b))
++#define RANGECHCK(v, WIDTH) \
++                ef_assert(((uint64_t)(v) & ~(__EFVI_MASK64((WIDTH)))) == 0)
++
++/* fields within the first dword */
++#define DWCHCK(LBN, WIDTH) ef_assert(((LBN) >= 0) &&(((LBN)+(WIDTH)) <= 32))
++
++/* fields which span the first and second dwords */
++#define LWCHK(LBN, WIDTH)  ef_assert(WIDTH >= __LW(LBN))
++
++/*----------------------------------------------------------------------------
++ *
++ * Buffer virtual addresses (4K buffers) 
++ *
++ *---------------------------------------------------------------------------*/
++
++/* Form a buffer virtual address from buffer ID and offset.  If the offset
++** is larger than the buffer size, then the buffer indexed will be
++** calculated appropriately.  It is the responsibility of the caller to
++** ensure that they have valid buffers programmed at that address.
++*/
++#define EFVI_FALCON_VADDR_4K_S        (12)         
++#define EFVI_FALCON_VADDR_M       0xfffff             /* post shift mask  */
++
++
++#define EFVI_FALCON_BUFFER_4K_ADDR(id,off)      \
++  (((id) << EFVI_FALCON_VADDR_4K_S) + (off))
++
++#define EFVI_FALCON_BUFFER_4K_PAGE(vaddr)                       \
++  (((vaddr) >> EFVI_FALCON_VADDR_4K_S) & EFVI_FALCON_VADDR_M)
++
++#define EFVI_FALCON_BUFFER_4K_OFF(vaddr)                \
++  ((vaddr) & __EFVI_MASK32(EFVI_FALCON_VADDR_4K_S))
++
++
++/*----------------------------------------------------------------------------
++ *
++ * Masks
++ *
++ *---------------------------------------------------------------------------*/
++
++#define EFVI_FALCON_CLOCK_ASIC_HZ    (125000)
++#define EFVI_FALCON_CLOCK_FPGA_HZ    (62500)
++#define EFVI_FALCON_CLOCK_HZ         EFVI_FALCON_CLOCK_ASIC_HZ
++
++
++/*----------------------------------------------------------------------------
++ *
++ * Timers
++ *
++ *---------------------------------------------------------------------------*/
++
++/* Event-Queue Timer granularity - measured in us 
++   Given by: 4096 * 3 cycle * clock period */
++
++#define EFVI_FALCON_EVQTIMER_PERIOD_US   ((4096 * 3 * 1000) / EFVI_FALCON_CLOCK_HZ)
++
++/* mode bits */
++#define EFVI_FALCON_TIMER_MODE_DIS     0     /* disabled */
++#define EFVI_FALCON_TIMER_MODE_RUN     1     /* started counting right away */
++#define EFVI_FALCON_TIMER_MODE_HOLD    2     /* trigger mode (user queues) */
++
++#define EFVI_FALCON_EVQTIMER_HOLD     (EFVI_FALCON_TIMER_MODE_HOLD << TIMER_MODE_LBN)
++#define EFVI_FALCON_EVQTIMER_RUN      (EFVI_FALCON_TIMER_MODE_RUN  << TIMER_MODE_LBN)
++#define EFVI_FALCON_EVQTIMER_DISABLE  (EFVI_FALCON_TIMER_MODE_DIS  << TIMER_MODE_LBN) 
++
++
++/* ---- ef_vi_event helpers --- */
++
++#define EFVI_FALCON_EVENT_CODE(evp) \
++       ((evp)->u64 & EFVI_FALCON_EVENT_CODE_MASK)
++
++#define EFVI_FALCON_EVENT_SW_DATA_MASK    0x0000ffff
++
++#define __EFVI_FALCON_OPEN_MASK(WIDTH)  ((((uint64_t)1) << (WIDTH)) - 1)
++
++#define EFVI_FALCON_EVENT_CODE_MASK \
++           (__EFVI_FALCON_OPEN_MASK(EV_CODE_WIDTH) << EV_CODE_LBN)
++
++
++#endif  /* __EF_VI_FALCON_H__ */
diff --cc drivers/xen/sfc_netfront/ef_vi_falcon_core.h

index 0000000,0000000..089f42a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/ef_vi_falcon_core.h
@@@ -1,0 -1,0 +1,1075 @@@
++/* Non-zero selects the 0x1xxxx variants (0x10000 higher) of the *_KER_OFST table offsets in the #if blocks below. */
++#define  EFVI_FALCON_EXTENDED_P_BAR 1
++
++//////////////---- Bus Interface Unit Registers C Header ----//////////////
++#define IOM_IND_ADR_REG_OFST 0x0 // IO-mapped indirect access address register
++  #define IOM_AUTO_ADR_INC_EN_LBN 16
++  #define IOM_AUTO_ADR_INC_EN_WIDTH 1
++  #define IOM_IND_ADR_LBN 0
++  #define IOM_IND_ADR_WIDTH 16
++#define IOM_IND_DAT_REG_OFST 0x4 // IO-mapped indirect access data register
++  #define IOM_IND_DAT_LBN 0
++  #define IOM_IND_DAT_WIDTH 32
++#define ADR_REGION_REG_KER_OFST 0x0 // Address region register
++#define ADR_REGION_REG_OFST 0x0 // Address region register
++  #define ADR_REGION3_LBN 96
++  #define ADR_REGION3_WIDTH 18
++  #define ADR_REGION2_LBN 64
++  #define ADR_REGION2_WIDTH 18
++  #define ADR_REGION1_LBN 32
++  #define ADR_REGION1_WIDTH 18
++  #define ADR_REGION0_LBN 0
++  #define ADR_REGION0_WIDTH 18
++#define INT_EN_REG_KER_OFST 0x10 // Kernel driver Interrupt enable register
++  #define KER_INT_CHAR_LBN 4
++  #define KER_INT_CHAR_WIDTH 1
++  #define KER_INT_KER_LBN 3
++  #define KER_INT_KER_WIDTH 1
++  #define ILL_ADR_ERR_INT_EN_KER_LBN 2
++  #define ILL_ADR_ERR_INT_EN_KER_WIDTH 1
++  #define SRM_PERR_INT_EN_KER_LBN 1
++  #define SRM_PERR_INT_EN_KER_WIDTH 1
++  #define DRV_INT_EN_KER_LBN 0
++  #define DRV_INT_EN_KER_WIDTH 1
++#define INT_EN_REG_CHAR_OFST 0x20 // Char Driver interrupt enable register
++  #define CHAR_INT_CHAR_LBN 4
++  #define CHAR_INT_CHAR_WIDTH 1
++  #define CHAR_INT_KER_LBN 3
++  #define CHAR_INT_KER_WIDTH 1
++  #define ILL_ADR_ERR_INT_EN_CHAR_LBN 2
++  #define ILL_ADR_ERR_INT_EN_CHAR_WIDTH 1
++  #define SRM_PERR_INT_EN_CHAR_LBN 1
++  #define SRM_PERR_INT_EN_CHAR_WIDTH 1
++  #define DRV_INT_EN_CHAR_LBN 0
++  #define DRV_INT_EN_CHAR_WIDTH 1
++#define INT_ADR_REG_KER_OFST 0x30 // Interrupt host address for Kernel driver
++  #define INT_ADR_KER_LBN 0
++  #define INT_ADR_KER_WIDTH 64
++  #define DRV_INT_KER_LBN 32
++  #define DRV_INT_KER_WIDTH 1
++  #define EV_FF_HALF_INT_KER_LBN 3
++  #define EV_FF_HALF_INT_KER_WIDTH 1
++  #define EV_FF_FULL_INT_KER_LBN 2
++  #define EV_FF_FULL_INT_KER_WIDTH 1
++  #define ILL_ADR_ERR_INT_KER_LBN 1
++  #define ILL_ADR_ERR_INT_KER_WIDTH 1
++  #define SRAM_PERR_INT_KER_LBN 0
++  #define SRAM_PERR_INT_KER_WIDTH 1
++#define INT_ADR_REG_CHAR_OFST 0x40 // Interrupt host address for Char driver
++  #define INT_ADR_CHAR_LBN 0
++  #define INT_ADR_CHAR_WIDTH 64
++  #define DRV_INT_CHAR_LBN 32
++  #define DRV_INT_CHAR_WIDTH 1
++  #define EV_FF_HALF_INT_CHAR_LBN 3
++  #define EV_FF_HALF_INT_CHAR_WIDTH 1
++  #define EV_FF_FULL_INT_CHAR_LBN 2
++  #define EV_FF_FULL_INT_CHAR_WIDTH 1
++  #define ILL_ADR_ERR_INT_CHAR_LBN 1
++  #define ILL_ADR_ERR_INT_CHAR_WIDTH 1
++  #define SRAM_PERR_INT_CHAR_LBN 0
++  #define SRAM_PERR_INT_CHAR_WIDTH 1
++#define INT_ISR0_B0_OFST 0x90 // B0 only
++#define INT_ISR1_B0_OFST 0xA0 // presumably B0 only as well, going by the _B0 suffix -- TODO confirm
++#define INT_ACK_REG_KER_A1_OFST 0x50 // Kernel interrupt acknowledge register
++  #define RESERVED_LBN 0
++  #define RESERVED_WIDTH 32
++#define INT_ACK_REG_CHAR_A1_OFST 0x60 // CHAR interrupt acknowledge register
++  #define RESERVED_LBN 0 // same name and value as under INT_ACK_REG_KER_A1_OFST: an identical, benign redefinition
++  #define RESERVED_WIDTH 32 // identical redefinition, see RESERVED_LBN
++//////////////---- Global CSR Registers C Header ----//////////////
++#define STRAP_REG_KER_OFST 0x200 // ASIC strap status register
++#define STRAP_REG_OFST 0x200 // ASIC strap status register
++  #define ONCHIP_SRAM_LBN 16
++  #define ONCHIP_SRAM_WIDTH 0 // NOTE(review): zero-width field, so any mask built from it is empty -- confirm the intended width
++  #define STRAP_ISCSI_EN_LBN 3
++  #define STRAP_ISCSI_EN_WIDTH 1
++  #define STRAP_PINS_LBN 0
++  #define STRAP_PINS_WIDTH 3
++#define GPIO_CTL_REG_KER_OFST 0x210 // GPIO control register
++#define GPIO_CTL_REG_OFST 0x210 // GPIO control register
++  #define GPIO_OEN_LBN 24
++  #define GPIO_OEN_WIDTH 4
++  #define GPIO_OUT_LBN 16
++  #define GPIO_OUT_WIDTH 4
++  #define GPIO_IN_LBN 8
++  #define GPIO_IN_WIDTH 4
++  #define GPIO_PWRUP_VALUE_LBN 0
++  #define GPIO_PWRUP_VALUE_WIDTH 4
++#define GLB_CTL_REG_KER_OFST 0x220 // Global control register
++#define GLB_CTL_REG_OFST 0x220 // Global control register
++  #define SWRST_LBN 0
++  #define SWRST_WIDTH 1
++#define FATAL_INTR_REG_KER_OFST 0x230 // Fatal interrupt register for Kernel
++  #define PCI_BUSERR_INT_KER_EN_LBN 43
++  #define PCI_BUSERR_INT_KER_EN_WIDTH 1
++  #define SRAM_OOB_INT_KER_EN_LBN 42
++  #define SRAM_OOB_INT_KER_EN_WIDTH 1
++  #define BUFID_OOB_INT_KER_EN_LBN 41
++  #define BUFID_OOB_INT_KER_EN_WIDTH 1
++  #define MEM_PERR_INT_KER_EN_LBN 40
++  #define MEM_PERR_INT_KER_EN_WIDTH 1
++  #define RBUF_OWN_INT_KER_EN_LBN 39
++  #define RBUF_OWN_INT_KER_EN_WIDTH 1
++  #define TBUF_OWN_INT_KER_EN_LBN 38
++  #define TBUF_OWN_INT_KER_EN_WIDTH 1
++  #define RDESCQ_OWN_INT_KER_EN_LBN 37
++  #define RDESCQ_OWN_INT_KER_EN_WIDTH 1
++  #define TDESCQ_OWN_INT_KER_EN_LBN 36
++  #define TDESCQ_OWN_INT_KER_EN_WIDTH 1
++  #define EVQ_OWN_INT_KER_EN_LBN 35
++  #define EVQ_OWN_INT_KER_EN_WIDTH 1
++  #define EVFF_OFLO_INT_KER_EN_LBN 34
++  #define EVFF_OFLO_INT_KER_EN_WIDTH 1
++  #define ILL_ADR_INT_KER_EN_LBN 33
++  #define ILL_ADR_INT_KER_EN_WIDTH 1
++  #define SRM_PERR_INT_KER_EN_LBN 32
++  #define SRM_PERR_INT_KER_EN_WIDTH 1
++  #define PCI_BUSERR_INT_KER_LBN 11
++  #define PCI_BUSERR_INT_KER_WIDTH 1
++  #define SRAM_OOB_INT_KER_LBN 10
++  #define SRAM_OOB_INT_KER_WIDTH 1
++  #define BUFID_OOB_INT_KER_LBN 9
++  #define BUFID_OOB_INT_KER_WIDTH 1
++  #define MEM_PERR_INT_KER_LBN 8
++  #define MEM_PERR_INT_KER_WIDTH 1
++  #define RBUF_OWN_INT_KER_LBN 7
++  #define RBUF_OWN_INT_KER_WIDTH 1
++  #define TBUF_OWN_INT_KER_LBN 6
++  #define TBUF_OWN_INT_KER_WIDTH 1
++  #define RDESCQ_OWN_INT_KER_LBN 5
++  #define RDESCQ_OWN_INT_KER_WIDTH 1
++  #define TDESCQ_OWN_INT_KER_LBN 4
++  #define TDESCQ_OWN_INT_KER_WIDTH 1
++  #define EVQ_OWN_INT_KER_LBN 3
++  #define EVQ_OWN_INT_KER_WIDTH 1
++  #define EVFF_OFLO_INT_KER_LBN 2
++  #define EVFF_OFLO_INT_KER_WIDTH 1
++  #define ILL_ADR_INT_KER_LBN 1
++  #define ILL_ADR_INT_KER_WIDTH 1
++  #define SRM_PERR_INT_KER_LBN 0
++  #define SRM_PERR_INT_KER_WIDTH 1
++#define FATAL_INTR_REG_OFST 0x240 // Fatal interrupt register for Char
++  #define PCI_BUSERR_INT_CHAR_EN_LBN 43
++  #define PCI_BUSERR_INT_CHAR_EN_WIDTH 1
++  #define SRAM_OOB_INT_CHAR_EN_LBN 42
++  #define SRAM_OOB_INT_CHAR_EN_WIDTH 1
++  #define BUFID_OOB_INT_CHAR_EN_LBN 41
++  #define BUFID_OOB_INT_CHAR_EN_WIDTH 1
++  #define MEM_PERR_INT_CHAR_EN_LBN 40
++  #define MEM_PERR_INT_CHAR_EN_WIDTH 1
++  #define RBUF_OWN_INT_CHAR_EN_LBN 39
++  #define RBUF_OWN_INT_CHAR_EN_WIDTH 1
++  #define TBUF_OWN_INT_CHAR_EN_LBN 38
++  #define TBUF_OWN_INT_CHAR_EN_WIDTH 1
++  #define RDESCQ_OWN_INT_CHAR_EN_LBN 37
++  #define RDESCQ_OWN_INT_CHAR_EN_WIDTH 1
++  #define TDESCQ_OWN_INT_CHAR_EN_LBN 36
++  #define TDESCQ_OWN_INT_CHAR_EN_WIDTH 1
++  #define EVQ_OWN_INT_CHAR_EN_LBN 35
++  #define EVQ_OWN_INT_CHAR_EN_WIDTH 1
++  #define EVFF_OFLO_INT_CHAR_EN_LBN 34
++  #define EVFF_OFLO_INT_CHAR_EN_WIDTH 1
++  #define ILL_ADR_INT_CHAR_EN_LBN 33
++  #define ILL_ADR_INT_CHAR_EN_WIDTH 1
++  #define SRM_PERR_INT_CHAR_EN_LBN 32
++  #define SRM_PERR_INT_CHAR_EN_WIDTH 1
++  #define FATAL_INTR_REG_EN_BITS    0xffffffffffffffffULL
++  #define PCI_BUSERR_INT_CHAR_LBN 11
++  #define PCI_BUSERR_INT_CHAR_WIDTH 1
++  #define SRAM_OOB_INT_CHAR_LBN 10
++  #define SRAM_OOB_INT_CHAR_WIDTH 1
++  #define BUFID_OOB_INT_CHAR_LBN 9
++  #define BUFID_OOB_INT_CHAR_WIDTH 1
++  #define MEM_PERR_INT_CHAR_LBN 8
++  #define MEM_PERR_INT_CHAR_WIDTH 1
++  #define RBUF_OWN_INT_CHAR_LBN 7
++  #define RBUF_OWN_INT_CHAR_WIDTH 1
++  #define TBUF_OWN_INT_CHAR_LBN 6
++  #define TBUF_OWN_INT_CHAR_WIDTH 1
++  #define RDESCQ_OWN_INT_CHAR_LBN 5
++  #define RDESCQ_OWN_INT_CHAR_WIDTH 1
++  #define TDESCQ_OWN_INT_CHAR_LBN 4
++  #define TDESCQ_OWN_INT_CHAR_WIDTH 1
++  #define EVQ_OWN_INT_CHAR_LBN 3
++  #define EVQ_OWN_INT_CHAR_WIDTH 1
++  #define EVFF_OFLO_INT_CHAR_LBN 2
++  #define EVFF_OFLO_INT_CHAR_WIDTH 1
++  #define ILL_ADR_INT_CHAR_LBN 1
++  #define ILL_ADR_INT_CHAR_WIDTH 1
++  #define SRM_PERR_INT_CHAR_LBN 0
++  #define SRM_PERR_INT_CHAR_WIDTH 1
++#define DP_CTRL_REG_OFST 0x250 // Datapath control register
++  #define FLS_EVQ_ID_LBN 0
++  #define FLS_EVQ_ID_WIDTH 12
++#define MEM_STAT_REG_KER_OFST 0x260 // Memory status register
++#define MEM_STAT_REG_OFST 0x260 // Memory status register
++  #define MEM_PERR_VEC_LBN 53
++  #define MEM_PERR_VEC_WIDTH 38
++  #define MBIST_CORR_LBN 38
++  #define MBIST_CORR_WIDTH 15
++  #define MBIST_ERR_LBN 0
++  #define MBIST_ERR_WIDTH 38
++#define DEBUG_REG_KER_OFST 0x270 // Debug register
++#define DEBUG_REG_OFST 0x270 // Debug register
++  #define DEBUG_BLK_SEL2_LBN 47
++  #define DEBUG_BLK_SEL2_WIDTH 3
++  #define DEBUG_BLK_SEL1_LBN 44
++  #define DEBUG_BLK_SEL1_WIDTH 3
++  #define DEBUG_BLK_SEL0_LBN 41
++  #define DEBUG_BLK_SEL0_WIDTH 3
++  #define MISC_DEBUG_ADDR_LBN 36
++  #define MISC_DEBUG_ADDR_WIDTH 5
++  #define SERDES_DEBUG_ADDR_LBN 31
++  #define SERDES_DEBUG_ADDR_WIDTH 5
++  #define EM_DEBUG_ADDR_LBN 26
++  #define EM_DEBUG_ADDR_WIDTH 5
++  #define SR_DEBUG_ADDR_LBN 21
++  #define SR_DEBUG_ADDR_WIDTH 5
++  #define EV_DEBUG_ADDR_LBN 16
++  #define EV_DEBUG_ADDR_WIDTH 5
++  #define RX_DEBUG_ADDR_LBN 11
++  #define RX_DEBUG_ADDR_WIDTH 5
++  #define TX_DEBUG_ADDR_LBN 6
++  #define TX_DEBUG_ADDR_WIDTH 5
++  #define BIU_DEBUG_ADDR_LBN 1
++  #define BIU_DEBUG_ADDR_WIDTH 5
++  #define DEBUG_EN_LBN 0
++  #define DEBUG_EN_WIDTH 1
++#define DRIVER_REG0_KER_OFST 0x280 // Driver scratch register 0
++#define DRIVER_REG0_OFST 0x280 // Driver scratch register 0
++  #define DRIVER_DW0_LBN 0
++  #define DRIVER_DW0_WIDTH 32
++#define DRIVER_REG1_KER_OFST 0x290 // Driver scratch register 1
++#define DRIVER_REG1_OFST 0x290 // Driver scratch register 1
++  #define DRIVER_DW1_LBN 0
++  #define DRIVER_DW1_WIDTH 32
++#define DRIVER_REG2_KER_OFST 0x2A0 // Driver scratch register 2
++#define DRIVER_REG2_OFST 0x2A0 // Driver scratch register 2
++  #define DRIVER_DW2_LBN 0
++  #define DRIVER_DW2_WIDTH 32
++#define DRIVER_REG3_KER_OFST 0x2B0 // Driver scratch register 3
++#define DRIVER_REG3_OFST 0x2B0 // Driver scratch register 3
++  #define DRIVER_DW3_LBN 0
++  #define DRIVER_DW3_WIDTH 32
++#define DRIVER_REG4_KER_OFST 0x2C0 // Driver scratch register 4
++#define DRIVER_REG4_OFST 0x2C0 // Driver scratch register 4
++  #define DRIVER_DW4_LBN 0
++  #define DRIVER_DW4_WIDTH 32
++#define DRIVER_REG5_KER_OFST 0x2D0 // Driver scratch register 5
++#define DRIVER_REG5_OFST 0x2D0 // Driver scratch register 5
++  #define DRIVER_DW5_LBN 0
++  #define DRIVER_DW5_WIDTH 32
++#define DRIVER_REG6_KER_OFST 0x2E0 // Driver scratch register 6
++#define DRIVER_REG6_OFST 0x2E0 // Driver scratch register 6
++  #define DRIVER_DW6_LBN 0
++  #define DRIVER_DW6_WIDTH 32
++#define DRIVER_REG7_KER_OFST 0x2F0 // Driver scratch register 7
++#define DRIVER_REG7_OFST 0x2F0 // Driver scratch register 7
++  #define DRIVER_DW7_LBN 0
++  #define DRIVER_DW7_WIDTH 32
++#define ALTERA_BUILD_REG_OFST 0x300 // Altera build register
++#define ALTERA_BUILD_REG_OFST 0x300 // Altera build register (exact repeat of the line above; other registers pair _KER_OFST with _OFST here)
++  #define ALTERA_BUILD_VER_LBN 0
++  #define ALTERA_BUILD_VER_WIDTH 32
++
++/* So-called CSR spare register:
++    contains separate parity-enable bits for the various internal memory blocks. */
++#define MEM_PARITY_ERR_EN_REG_KER 0x310 /* presumably a register offset, though the name lacks the _OFST suffix -- confirm */
++#define MEM_PARITY_ALL_BLOCKS_EN_LBN 64
++#define MEM_PARITY_ALL_BLOCKS_EN_WIDTH 38
++#define MEM_PARITY_TX_DATA_EN_LBN   72 /* bits 72-73 lie inside the ALL_BLOCKS span (bits 64-101) */
++#define MEM_PARITY_TX_DATA_EN_WIDTH 2
++
++//////////////---- Event & Timer Module Registers C Header ----//////////////
++
++#if EFVI_FALCON_EXTENDED_P_BAR
++#define EVQ_RPTR_REG_KER_OFST 0x11B00 // Event queue read pointer register
++#else
++#define EVQ_RPTR_REG_KER_OFST 0x1B00 // Event queue read pointer register
++#endif
++
++#define EVQ_RPTR_REG_OFST 0xFA0000 // Event queue read pointer register array.
++  #define EVQ_RPTR_LBN 0
++  #define EVQ_RPTR_WIDTH 15
++
++#if EFVI_FALCON_EXTENDED_P_BAR
++#define EVQ_PTR_TBL_KER_OFST 0x11A00 // Event queue pointer table for kernel access
++#else
++#define EVQ_PTR_TBL_KER_OFST 0x1A00 // Event queue pointer table for kernel access
++#endif
++
++#define EVQ_PTR_TBL_CHAR_OFST 0xF60000 // Event queue pointer table for char direct access
++  #define EVQ_WKUP_OR_INT_EN_LBN 39
++  #define EVQ_WKUP_OR_INT_EN_WIDTH 1
++  #define EVQ_NXT_WPTR_LBN 24
++  #define EVQ_NXT_WPTR_WIDTH 15
++  #define EVQ_EN_LBN 23
++  #define EVQ_EN_WIDTH 1
++  #define EVQ_SIZE_LBN 20
++  #define EVQ_SIZE_WIDTH 3
++  #define EVQ_BUF_BASE_ID_LBN 0
++  #define EVQ_BUF_BASE_ID_WIDTH 20
++#define TIMER_CMD_REG_KER_OFST 0x420 // Timer table for kernel access. Page-mapped
++#define TIMER_CMD_REG_PAGE4_OFST 0x8420 // Timer table for user-level access. Page-mapped. For lowest 1K queues.
++#define TIMER_CMD_REG_PAGE123K_OFST 0x1000420 // Timer table for user-level access. Page-mapped. For upper 3K queues.
++#define TIMER_TBL_OFST 0xF70000 // Timer table for char driver direct access
++  #define TIMER_MODE_LBN 12
++  #define TIMER_MODE_WIDTH 2
++  #define TIMER_VAL_LBN 0
++  #define TIMER_VAL_WIDTH 12
++  #define TIMER_MODE_INT_HLDOFF 2
++  #define EVQ_BUF_SIZE_LBN 0
++  #define EVQ_BUF_SIZE_WIDTH 1
++#define DRV_EV_REG_KER_OFST 0x440 // Driver generated event register
++#define DRV_EV_REG_OFST 0x440 // Driver generated event register
++  #define DRV_EV_QID_LBN 64
++  #define DRV_EV_QID_WIDTH 12
++  #define DRV_EV_DATA_LBN 0
++  #define DRV_EV_DATA_WIDTH 64
++#define EVQ_CTL_REG_KER_OFST 0x450 // Event queue control register
++#define EVQ_CTL_REG_OFST 0x450 // Event queue control register
++  #define RX_EVQ_WAKEUP_MASK_B0_LBN 15
++  #define RX_EVQ_WAKEUP_MASK_B0_WIDTH 6
++  #define EVQ_OWNERR_CTL_LBN 14
++  #define EVQ_OWNERR_CTL_WIDTH 1
++  #define EVQ_FIFO_AF_TH_LBN 8
++  #define EVQ_FIFO_AF_TH_WIDTH 6
++  #define EVQ_FIFO_NOTAF_TH_LBN 0
++  #define EVQ_FIFO_NOTAF_TH_WIDTH 6
++//////////////---- SRAM Module Registers C Header ----//////////////
++#define BUF_TBL_CFG_REG_KER_OFST 0x600 // Buffer table configuration register
++#define BUF_TBL_CFG_REG_OFST 0x600 // Buffer table configuration register
++  #define BUF_TBL_MODE_LBN 3
++  #define BUF_TBL_MODE_WIDTH 1
++#define SRM_RX_DC_CFG_REG_KER_OFST 0x610 // SRAM receive descriptor cache configuration register
++#define SRM_RX_DC_CFG_REG_OFST 0x610 // SRAM receive descriptor cache configuration register
++  #define SRM_RX_DC_BASE_ADR_LBN 0
++  #define SRM_RX_DC_BASE_ADR_WIDTH 21
++#define SRM_TX_DC_CFG_REG_KER_OFST 0x620 // SRAM transmit descriptor cache configuration register
++#define SRM_TX_DC_CFG_REG_OFST 0x620 // SRAM transmit descriptor cache configuration register
++  #define SRM_TX_DC_BASE_ADR_LBN 0
++  #define SRM_TX_DC_BASE_ADR_WIDTH 21
++#define SRM_CFG_REG_KER_OFST 0x630 // SRAM configuration register
++#define SRM_CFG_REG_OFST 0x630 // SRAM configuration register
++  #define SRAM_OOB_ADR_INTEN_LBN 5
++  #define SRAM_OOB_ADR_INTEN_WIDTH 1
++  #define SRAM_OOB_BUF_INTEN_LBN 4
++  #define SRAM_OOB_BUF_INTEN_WIDTH 1
++  #define SRAM_BT_INIT_EN_LBN 3
++  #define SRAM_BT_INIT_EN_WIDTH 1
++  #define SRM_NUM_BANK_LBN 2
++  #define SRM_NUM_BANK_WIDTH 1
++  #define SRM_BANK_SIZE_LBN 0
++  #define SRM_BANK_SIZE_WIDTH 2
++#define BUF_TBL_UPD_REG_KER_OFST 0x650 // Buffer table update register
++#define BUF_TBL_UPD_REG_OFST 0x650 // Buffer table update register
++  #define BUF_UPD_CMD_LBN 63
++  #define BUF_UPD_CMD_WIDTH 1
++  #define BUF_CLR_CMD_LBN 62
++  #define BUF_CLR_CMD_WIDTH 1
++  #define BUF_CLR_END_ID_LBN 32
++  #define BUF_CLR_END_ID_WIDTH 20
++  #define BUF_CLR_START_ID_LBN 0
++  #define BUF_CLR_START_ID_WIDTH 20
++#define SRM_UPD_EVQ_REG_KER_OFST 0x660 // Buffer table update register
++#define SRM_UPD_EVQ_REG_OFST 0x660 // Buffer table update register
++  #define SRM_UPD_EVQ_ID_LBN 0
++  #define SRM_UPD_EVQ_ID_WIDTH 12
++#define SRAM_PARITY_REG_KER_OFST 0x670 // SRAM parity register.
++#define SRAM_PARITY_REG_OFST 0x670 // SRAM parity register.
++  #define FORCE_SRAM_PERR_LBN 0
++  #define FORCE_SRAM_PERR_WIDTH 1
++
++#if EFVI_FALCON_EXTENDED_P_BAR
++#define BUF_HALF_TBL_KER_OFST 0x18000 // Buffer table in half buffer table mode direct access by kernel driver
++#else
++#define BUF_HALF_TBL_KER_OFST 0x8000 // Buffer table in half buffer table mode direct access by kernel driver
++#endif
++
++
++#define BUF_HALF_TBL_OFST 0x800000 // Buffer table in half buffer table mode direct access by char driver
++  #define BUF_ADR_HBUF_ODD_LBN 44
++  #define BUF_ADR_HBUF_ODD_WIDTH 20
++  #define BUF_OWNER_ID_HBUF_ODD_LBN 32
++  #define BUF_OWNER_ID_HBUF_ODD_WIDTH 12
++  #define BUF_ADR_HBUF_EVEN_LBN 12
++  #define BUF_ADR_HBUF_EVEN_WIDTH 20
++  #define BUF_OWNER_ID_HBUF_EVEN_LBN 0
++  #define BUF_OWNER_ID_HBUF_EVEN_WIDTH 12
++
++
++#if EFVI_FALCON_EXTENDED_P_BAR
++#define BUF_FULL_TBL_KER_OFST 0x18000 // Buffer table in full buffer table mode direct access by kernel driver
++#else
++#define BUF_FULL_TBL_KER_OFST 0x8000 // Buffer table in full buffer table mode direct access by kernel driver
++#endif
++
++
++
++
++#define BUF_FULL_TBL_OFST 0x800000 // Buffer table in full buffer table mode direct access by char driver
++  #define IP_DAT_BUF_SIZE_LBN 50
++  #define IP_DAT_BUF_SIZE_WIDTH 1
++  #define BUF_ADR_REGION_LBN 48
++  #define BUF_ADR_REGION_WIDTH 2
++  #define BUF_ADR_FBUF_LBN 14
++  #define BUF_ADR_FBUF_WIDTH 34
++  #define BUF_OWNER_ID_FBUF_LBN 0
++  #define BUF_OWNER_ID_FBUF_WIDTH 14
++#define SRM_DBG_REG_OFST 0x3000000 // SRAM debug access
++  #define SRM_DBG_LBN 0
++  #define SRM_DBG_WIDTH 64
++//////////////---- RX Datapath Registers C Header ----//////////////
++
++#define RX_CFG_REG_KER_OFST 0x800 // Receive configuration register
++#define RX_CFG_REG_OFST 0x800 // Receive configuration register
++
++#if !defined(FALCON_64K_RXFIFO) && !defined(FALCON_PRE_02020029) /* neither alternative layout was requested */
++# if !defined(FALCON_128K_RXFIFO)
++#  define FALCON_128K_RXFIFO /* default: selects the first branch of the RX_CFG_REG field layouts below */
++# endif
++#endif
++
++#if defined(FALCON_128K_RXFIFO)
++
++/* new for B0 */
++  #define RX_TOEP_TCP_SUPPRESS_B0_LBN 48
++  #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1
++  #define RX_INGR_EN_B0_LBN 47
++  #define RX_INGR_EN_B0_WIDTH 1
++  #define RX_TOEP_IPV4_B0_LBN 46
++  #define RX_TOEP_IPV4_B0_WIDTH 1
++  #define RX_HASH_ALG_B0_LBN 45
++  #define RX_HASH_ALG_B0_WIDTH 1
++  #define RX_HASH_INSERT_HDR_B0_LBN 44
++  #define RX_HASH_INSERT_HDR_B0_WIDTH 1
++/* moved for B0 */
++  #define RX_DESC_PUSH_EN_B0_LBN 43
++  #define RX_DESC_PUSH_EN_B0_WIDTH 1
++  #define RX_RDW_PATCH_EN_LBN 42 /* Non head of line blocking */
++  #define RX_RDW_PATCH_EN_WIDTH 1
++  #define RX_PCI_BURST_SIZE_B0_LBN 39
++  #define RX_PCI_BURST_SIZE_B0_WIDTH 3
++  #define RX_OWNERR_CTL_B0_LBN 38
++  #define RX_OWNERR_CTL_B0_WIDTH 1
++  #define RX_XON_TX_TH_B0_LBN 33 
++  #define RX_XON_TX_TH_B0_WIDTH 5
++  #define RX_XOFF_TX_TH_B0_LBN 28 
++  #define RX_XOFF_TX_TH_B0_WIDTH 5
++  #define RX_USR_BUF_SIZE_B0_LBN 19
++  #define RX_USR_BUF_SIZE_B0_WIDTH 9
++  #define RX_XON_MAC_TH_B0_LBN 10
++  #define RX_XON_MAC_TH_B0_WIDTH 9
++  #define RX_XOFF_MAC_TH_B0_LBN 1
++  #define RX_XOFF_MAC_TH_B0_WIDTH 9
++  #define RX_XOFF_MAC_EN_B0_LBN 0
++  #define RX_XOFF_MAC_EN_B0_WIDTH 1
++
++#elif !defined(FALCON_PRE_02020029)
++/* new for B0 */
++  #define RX_TOEP_TCP_SUPPRESS_B0_LBN 46
++  #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1
++  #define RX_INGR_EN_B0_LBN 45
++  #define RX_INGR_EN_B0_WIDTH 1
++  #define RX_TOEP_IPV4_B0_LBN 44
++  #define RX_TOEP_IPV4_B0_WIDTH 1
++  #define RX_HASH_ALG_B0_LBN 43
++  #define RX_HASH_ALG_B0_WIDTH 41
++  #define RX_HASH_INSERT_HDR_B0_LBN 42
++  #define RX_HASH_INSERT_HDR_B0_WIDTH 1
++/* moved for B0 */
++  #define RX_DESC_PUSH_EN_B0_LBN 41
++  #define RX_DESC_PUSH_EN_B0_WIDTH 1
++  #define RX_PCI_BURST_SIZE_B0_LBN 37
++  #define RX_PCI_BURST_SIZE_B0_WIDTH 3
++  #define RX_OWNERR_CTL_B0_LBN 36
++  #define RX_OWNERR_CTL_B0_WIDTH 1
++  #define RX_XON_TX_TH_B0_LBN 31
++  #define RX_XON_TX_TH_B0_WIDTH 5
++  #define RX_XOFF_TX_TH_B0_LBN 26
++  #define RX_XOFF_TX_TH_B0_WIDTH 5
++  #define RX_USR_BUF_SIZE_B0_LBN 17
++  #define RX_USR_BUF_SIZE_B0_WIDTH 9
++  #define RX_XON_MAC_TH_B0_LBN 9
++  #define RX_XON_MAC_TH_B0_WIDTH 8
++  #define RX_XOFF_MAC_TH_B0_LBN 1
++  #define RX_XOFF_MAC_TH_B0_WIDTH 8
++  #define RX_XOFF_MAC_EN_B0_LBN 0
++  #define RX_XOFF_MAC_EN_B0_WIDTH 1
++
++#else
++/* new for B0 */
++  #define RX_TOEP_TCP_SUPPRESS_B0_LBN 44
++  #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1
++  #define RX_INGR_EN_B0_LBN 43
++  #define RX_INGR_EN_B0_WIDTH 1
++  #define RX_TOEP_IPV4_B0_LBN 42
++  #define RX_TOEP_IPV4_B0_WIDTH 1
++  #define RX_HASH_ALG_B0_LBN 41
++  #define RX_HASH_ALG_B0_WIDTH 41
++  #define RX_HASH_INSERT_HDR_B0_LBN 40
++  #define RX_HASH_INSERT_HDR_B0_WIDTH 1
++/* moved for B0 */
++  #define RX_DESC_PUSH_EN_B0_LBN 35
++  #define RX_DESC_PUSH_EN_B0_WIDTH 1
++  #define RX_PCI_BURST_SIZE_B0_LBN 35
++  #define RX_PCI_BURST_SIZE_B0_WIDTH 2
++  #define RX_OWNERR_CTL_B0_LBN 34
++  #define RX_OWNERR_CTL_B0_WIDTH 1
++  #define RX_XON_TX_TH_B0_LBN 29
++  #define RX_XON_TX_TH_B0_WIDTH 5
++  #define RX_XOFF_TX_TH_B0_LBN 24
++  #define RX_XOFF_TX_TH_B0_WIDTH 5
++  #define RX_USR_BUF_SIZE_B0_LBN 15
++  #define RX_USR_BUF_SIZE_B0_WIDTH 9
++  #define RX_XON_MAC_TH_B0_LBN 8
++  #define RX_XON_MAC_TH_B0_WIDTH 7
++  #define RX_XOFF_MAC_TH_B0_LBN 1
++  #define RX_XOFF_MAC_TH_B0_WIDTH 7
++  #define RX_XOFF_MAC_EN_B0_LBN 0
++  #define RX_XOFF_MAC_EN_B0_WIDTH 1
++
++#endif
++
++/* A0/A1 */
++  #define RX_PUSH_EN_A1_LBN 35
++  #define RX_PUSH_EN_A1_WIDTH 1
++  #define RX_PCI_BURST_SIZE_A1_LBN 31
++  #define RX_PCI_BURST_SIZE_A1_WIDTH 3
++  #define RX_OWNERR_CTL_A1_LBN 30
++  #define RX_OWNERR_CTL_A1_WIDTH 1
++  #define RX_XON_TX_TH_A1_LBN 25
++  #define RX_XON_TX_TH_A1_WIDTH 5
++  #define RX_XOFF_TX_TH_A1_LBN 20
++  #define RX_XOFF_TX_TH_A1_WIDTH 5
++  #define RX_USR_BUF_SIZE_A1_LBN 11
++  #define RX_USR_BUF_SIZE_A1_WIDTH 9
++  #define RX_XON_MAC_TH_A1_LBN 6
++  #define RX_XON_MAC_TH_A1_WIDTH 5
++  #define RX_XOFF_MAC_TH_A1_LBN 1
++  #define RX_XOFF_MAC_TH_A1_WIDTH 5
++  #define RX_XOFF_MAC_EN_A1_LBN 0
++  #define RX_XOFF_MAC_EN_A1_WIDTH 1
++
++#define RX_FILTER_CTL_REG_OFST 0x810 // Receive filter control registers
++  #define SCATTER_ENBL_NO_MATCH_Q_B0_LBN 40
++  #define SCATTER_ENBL_NO_MATCH_Q_B0_WIDTH 1
++  #define UDP_FULL_SRCH_LIMIT_LBN 32
++  #define UDP_FULL_SRCH_LIMIT_WIDTH 8
++  #define NUM_KER_LBN 24
++  #define NUM_KER_WIDTH 2
++  #define UDP_WILD_SRCH_LIMIT_LBN 16
++  #define UDP_WILD_SRCH_LIMIT_WIDTH 8
++  #define TCP_WILD_SRCH_LIMIT_LBN 8
++  #define TCP_WILD_SRCH_LIMIT_WIDTH 8
++  #define TCP_FULL_SRCH_LIMIT_LBN 0
++  #define TCP_FULL_SRCH_LIMIT_WIDTH 8
++#define RX_FLUSH_DESCQ_REG_KER_OFST 0x820 // Receive flush descriptor queue register
++#define RX_FLUSH_DESCQ_REG_OFST 0x820 // Receive flush descriptor queue register
++  #define RX_FLUSH_DESCQ_CMD_LBN 24
++  #define RX_FLUSH_DESCQ_CMD_WIDTH 1
++  #define RX_FLUSH_EVQ_ID_LBN 12
++  #define RX_FLUSH_EVQ_ID_WIDTH 12
++  #define RX_FLUSH_DESCQ_LBN 0
++  #define RX_FLUSH_DESCQ_WIDTH 12
++#define RX_DESC_UPD_REG_KER_OFST 0x830 // Kernel  receive descriptor update register. Page-mapped
++#define RX_DESC_UPD_REG_PAGE4_OFST 0x8830 // Char & user receive descriptor update register. Page-mapped. For lowest 1K queues.
++#define RX_DESC_UPD_REG_PAGE123K_OFST 0x1000830 // Char & user receive descriptor update register. Page-mapped. For upper 3K queues.
++  #define RX_DESC_WPTR_LBN 96
++  #define RX_DESC_WPTR_WIDTH 12
++  #define RX_DESC_PUSH_CMD_LBN 95
++  #define RX_DESC_PUSH_CMD_WIDTH 1
++  #define RX_DESC_LBN 0
++  #define RX_DESC_WIDTH 64
++  #define RX_KER_DESC_LBN 0
++  #define RX_KER_DESC_WIDTH 64
++  #define RX_USR_DESC_LBN 0
++  #define RX_USR_DESC_WIDTH 32
++#define RX_DC_CFG_REG_KER_OFST 0x840 // Receive descriptor cache configuration register
++#define RX_DC_CFG_REG_OFST 0x840 // Receive descriptor cache configuration register
++  #define RX_DC_SIZE_LBN 0
++  #define RX_DC_SIZE_WIDTH 2
++#define RX_DC_PF_WM_REG_KER_OFST 0x850 // Receive descriptor cache pre-fetch watermark register
++#define RX_DC_PF_WM_REG_OFST 0x850 // Receive descriptor cache pre-fetch watermark register
++  #define RX_DC_PF_LWM_LO_LBN 0
++  #define RX_DC_PF_LWM_LO_WIDTH 6
++
++#define RX_RSS_TKEY_B0_OFST 0x860 // RSS Toeplitz hash key (B0 only)
++
++#define RX_NODESC_DROP_REG 0x880
++  #define RX_NODESC_DROP_CNT_LBN 0
++  #define RX_NODESC_DROP_CNT_WIDTH 16
++
++#define XM_TX_CFG_REG_OFST 0x1230
++  #define XM_AUTO_PAD_LBN 5
++  #define XM_AUTO_PAD_WIDTH 1
++
++#define RX_FILTER_TBL0_OFST 0xF00000 // Receive filter table - even entries
++  #define RSS_EN_0_B0_LBN 110
++  #define RSS_EN_0_B0_WIDTH 1
++  #define SCATTER_EN_0_B0_LBN 109
++  #define SCATTER_EN_0_B0_WIDTH 1
++  #define TCP_UDP_0_LBN 108
++  #define TCP_UDP_0_WIDTH 1
++  #define RXQ_ID_0_LBN 96
++  #define RXQ_ID_0_WIDTH 12
++  #define DEST_IP_0_LBN 64
++  #define DEST_IP_0_WIDTH 32
++  #define DEST_PORT_TCP_0_LBN 48
++  #define DEST_PORT_TCP_0_WIDTH 16
++  #define SRC_IP_0_LBN 16
++  #define SRC_IP_0_WIDTH 32
++  #define SRC_TCP_DEST_UDP_0_LBN 0
++  #define SRC_TCP_DEST_UDP_0_WIDTH 16
++#define RX_FILTER_TBL1_OFST 0xF00010 // Receive filter table - odd entries
++  #define RSS_EN_1_B0_LBN 110
++  #define RSS_EN_1_B0_WIDTH 1
++  #define SCATTER_EN_1_B0_LBN 109
++  #define SCATTER_EN_1_B0_WIDTH 1
++  #define TCP_UDP_1_LBN 108
++  #define TCP_UDP_1_WIDTH 1
++  #define RXQ_ID_1_LBN 96
++  #define RXQ_ID_1_WIDTH 12
++  #define DEST_IP_1_LBN 64
++  #define DEST_IP_1_WIDTH 32
++  #define DEST_PORT_TCP_1_LBN 48
++  #define DEST_PORT_TCP_1_WIDTH 16
++  #define SRC_IP_1_LBN 16
++  #define SRC_IP_1_WIDTH 32
++  #define SRC_TCP_DEST_UDP_1_LBN 0
++  #define SRC_TCP_DEST_UDP_1_WIDTH 16
++
++#if EFVI_FALCON_EXTENDED_P_BAR
++#define RX_DESC_PTR_TBL_KER_OFST 0x11800 // Receive descriptor pointer kernel access
++#else
++#define RX_DESC_PTR_TBL_KER_OFST 0x1800 // Receive descriptor pointer kernel access
++#endif
++
++
++#define RX_DESC_PTR_TBL_OFST 0xF40000 // Receive descriptor pointer table
++  #define RX_ISCSI_DDIG_EN_LBN 88
++  #define RX_ISCSI_DDIG_EN_WIDTH 1
++  #define RX_ISCSI_HDIG_EN_LBN 87
++  #define RX_ISCSI_HDIG_EN_WIDTH 1
++  #define RX_DESC_PREF_ACT_LBN 86
++  #define RX_DESC_PREF_ACT_WIDTH 1
++  #define RX_DC_HW_RPTR_LBN 80
++  #define RX_DC_HW_RPTR_WIDTH 6
++  #define RX_DESCQ_HW_RPTR_LBN 68
++  #define RX_DESCQ_HW_RPTR_WIDTH 12
++  #define RX_DESCQ_SW_WPTR_LBN 56
++  #define RX_DESCQ_SW_WPTR_WIDTH 12
++  #define RX_DESCQ_BUF_BASE_ID_LBN 36
++  #define RX_DESCQ_BUF_BASE_ID_WIDTH 20
++  #define RX_DESCQ_EVQ_ID_LBN 24
++  #define RX_DESCQ_EVQ_ID_WIDTH 12
++  #define RX_DESCQ_OWNER_ID_LBN 10
++  #define RX_DESCQ_OWNER_ID_WIDTH 14
++  #define RX_DESCQ_LABEL_LBN 5
++  #define RX_DESCQ_LABEL_WIDTH 5
++  #define RX_DESCQ_SIZE_LBN 3
++  #define RX_DESCQ_SIZE_WIDTH 2
++  #define RX_DESCQ_TYPE_LBN 2
++  #define RX_DESCQ_TYPE_WIDTH 1
++  #define RX_DESCQ_JUMBO_LBN 1
++  #define RX_DESCQ_JUMBO_WIDTH 1
++  #define RX_DESCQ_EN_LBN 0
++  #define RX_DESCQ_EN_WIDTH 1
++
++
++#define RX_RSS_INDIR_TBL_B0_OFST 0xFB0000 // RSS indirection table (B0 only)
++  #define RX_RSS_INDIR_ENT_B0_LBN 0
++  #define RX_RSS_INDIR_ENT_B0_WIDTH 6
++
++//////////////---- TX Datapath Registers C Header ----//////////////
++#define TX_FLUSH_DESCQ_REG_KER_OFST 0xA00 // Transmit flush descriptor queue register
++#define TX_FLUSH_DESCQ_REG_OFST 0xA00 // Transmit flush descriptor queue register
++  #define TX_FLUSH_DESCQ_CMD_LBN 12
++  #define TX_FLUSH_DESCQ_CMD_WIDTH 1
++  #define TX_FLUSH_DESCQ_LBN 0
++  #define TX_FLUSH_DESCQ_WIDTH 12
++#define TX_DESC_UPD_REG_KER_OFST 0xA10 // Kernel transmit descriptor update register. Page-mapped
++#define TX_DESC_UPD_REG_PAGE4_OFST 0x8A10 // Char & user transmit descriptor update register. Page-mapped
++#define TX_DESC_UPD_REG_PAGE123K_OFST 0x1000A10 // Char & user transmit descriptor update register. Page-mapped
++  #define TX_DESC_WPTR_LBN 96
++  #define TX_DESC_WPTR_WIDTH 12
++  #define TX_DESC_PUSH_CMD_LBN 95
++  #define TX_DESC_PUSH_CMD_WIDTH 1
++  #define TX_DESC_LBN 0
++  #define TX_DESC_WIDTH 95
++  #define TX_KER_DESC_LBN 0
++  #define TX_KER_DESC_WIDTH 64
++  #define TX_USR_DESC_LBN 0
++  #define TX_USR_DESC_WIDTH 64
++#define TX_DC_CFG_REG_KER_OFST 0xA20 // Transmit descriptor cache configuration register
++#define TX_DC_CFG_REG_OFST 0xA20 // Transmit descriptor cache configuration register
++  #define TX_DC_SIZE_LBN 0
++  #define TX_DC_SIZE_WIDTH 2
++
++#if EFVI_FALCON_EXTENDED_P_BAR
++#define TX_DESC_PTR_TBL_KER_OFST 0x11900 // Transmit descriptor pointer.
++#else
++#define TX_DESC_PTR_TBL_KER_OFST 0x1900 // Transmit descriptor pointer.
++#endif
++
++
++#define TX_DESC_PTR_TBL_OFST 0xF50000 // Transmit descriptor pointer
++  #define TX_NON_IP_DROP_DIS_B0_LBN 91
++  #define TX_NON_IP_DROP_DIS_B0_WIDTH 1
++  #define TX_IP_CHKSM_DIS_B0_LBN 90
++  #define TX_IP_CHKSM_DIS_B0_WIDTH 1
++  #define TX_TCP_CHKSM_DIS_B0_LBN 89
++  #define TX_TCP_CHKSM_DIS_B0_WIDTH 1
++  #define TX_DESCQ_EN_LBN 88
++  #define TX_DESCQ_EN_WIDTH 1
++  #define TX_ISCSI_DDIG_EN_LBN 87
++  #define TX_ISCSI_DDIG_EN_WIDTH 1
++  #define TX_ISCSI_HDIG_EN_LBN 86
++  #define TX_ISCSI_HDIG_EN_WIDTH 1
++  #define TX_DC_HW_RPTR_LBN 80
++  #define TX_DC_HW_RPTR_WIDTH 6
++  #define TX_DESCQ_HW_RPTR_LBN 68
++  #define TX_DESCQ_HW_RPTR_WIDTH 12
++  #define TX_DESCQ_SW_WPTR_LBN 56
++  #define TX_DESCQ_SW_WPTR_WIDTH 12
++  #define TX_DESCQ_BUF_BASE_ID_LBN 36
++  #define TX_DESCQ_BUF_BASE_ID_WIDTH 20
++  #define TX_DESCQ_EVQ_ID_LBN 24
++  #define TX_DESCQ_EVQ_ID_WIDTH 12
++  #define TX_DESCQ_OWNER_ID_LBN 10
++  #define TX_DESCQ_OWNER_ID_WIDTH 14
++  #define TX_DESCQ_LABEL_LBN 5
++  #define TX_DESCQ_LABEL_WIDTH 5
++  #define TX_DESCQ_SIZE_LBN 3
++  #define TX_DESCQ_SIZE_WIDTH 2
++  #define TX_DESCQ_TYPE_LBN 1
++  #define TX_DESCQ_TYPE_WIDTH 2
++  #define TX_DESCQ_FLUSH_LBN 0
++  #define TX_DESCQ_FLUSH_WIDTH 1
++#define TX_CFG_REG_KER_OFST 0xA50 // Transmit configuration register
++#define TX_CFG_REG_OFST 0xA50 // Transmit configuration register
++  #define TX_IP_ID_P1_OFS_LBN 32
++  #define TX_IP_ID_P1_OFS_WIDTH 15
++  #define TX_IP_ID_P0_OFS_LBN 16
++  #define TX_IP_ID_P0_OFS_WIDTH 15
++  #define TX_TURBO_EN_LBN 3
++  #define TX_TURBO_EN_WIDTH 1 
++  #define TX_OWNERR_CTL_LBN 2
++  #define TX_OWNERR_CTL_WIDTH 2 // NOTE(review): bits 2-3 overlap TX_TURBO_EN at bit 3; the RX and EVQ *_OWNERR_CTL fields are 1 bit wide -- confirm
++  #define TX_NON_IP_DROP_DIS_LBN 1
++  #define TX_NON_IP_DROP_DIS_WIDTH 1
++  #define TX_IP_ID_REP_EN_LBN 0
++  #define TX_IP_ID_REP_EN_WIDTH 1
++#define TX_RESERVED_REG_KER_OFST 0xA80 // Transmit configuration register
++#define TX_RESERVED_REG_OFST 0xA80 // Transmit configuration register
++  #define TX_CSR_PUSH_EN_LBN 89
++  #define TX_CSR_PUSH_EN_WIDTH 1
++  #define TX_RX_SPACER_LBN 64
++  #define TX_RX_SPACER_WIDTH 8
++  #define TX_SW_EV_EN_LBN 59
++  #define TX_SW_EV_EN_WIDTH 1
++  #define TX_RX_SPACER_EN_LBN 57
++  #define TX_RX_SPACER_EN_WIDTH 1
++  #define TX_CSR_PREF_WD_TMR_LBN 24
++  #define TX_CSR_PREF_WD_TMR_WIDTH 16
++  #define TX_CSR_ONLY1TAG_LBN 21
++  #define TX_CSR_ONLY1TAG_WIDTH 1
++  #define TX_PREF_THRESHOLD_LBN 19
++  #define TX_PREF_THRESHOLD_WIDTH 2
++  #define TX_ONE_PKT_PER_Q_LBN 18
++  #define TX_ONE_PKT_PER_Q_WIDTH 1
++  #define TX_DIS_NON_IP_EV_LBN 17
++  #define TX_DIS_NON_IP_EV_WIDTH 1
++  #define TX_DMA_SPACER_LBN 8
++  #define TX_DMA_SPACER_WIDTH 8
++  #define TX_FLUSH_MIN_LEN_EN_B0_LBN 7
++  #define TX_FLUSH_MIN_LEN_EN_B0_WIDTH 1
++  #define TX_TCP_DIS_A1_LBN 7
++  #define TX_TCP_DIS_A1_WIDTH 1
++  #define TX_IP_DIS_A1_LBN 6
++  #define TX_IP_DIS_A1_WIDTH 1
++  #define TX_MAX_CPL_LBN 2
++  #define TX_MAX_CPL_WIDTH 2
++  #define TX_MAX_PREF_LBN 0
++  #define TX_MAX_PREF_WIDTH 2
++#define TX_VLAN_REG_OFST 0xAE0 // Transmit VLAN tag register
++  #define TX_VLAN_EN_LBN 127
++  #define TX_VLAN_EN_WIDTH 1
++  #define TX_VLAN7_PORT1_EN_LBN 125
++  #define TX_VLAN7_PORT1_EN_WIDTH 1
++  #define TX_VLAN7_PORT0_EN_LBN 124
++  #define TX_VLAN7_PORT0_EN_WIDTH 1
++  #define TX_VLAN7_LBN 112
++  #define TX_VLAN7_WIDTH 12
++  #define TX_VLAN6_PORT1_EN_LBN 109
++  #define TX_VLAN6_PORT1_EN_WIDTH 1
++  #define TX_VLAN6_PORT0_EN_LBN 108
++  #define TX_VLAN6_PORT0_EN_WIDTH 1
++  #define TX_VLAN6_LBN 96
++  #define TX_VLAN6_WIDTH 12
++  #define TX_VLAN5_PORT1_EN_LBN 93
++  #define TX_VLAN5_PORT1_EN_WIDTH 1
++  #define TX_VLAN5_PORT0_EN_LBN 92
++  #define TX_VLAN5_PORT0_EN_WIDTH 1
++  #define TX_VLAN5_LBN 80
++  #define TX_VLAN5_WIDTH 12
++  #define TX_VLAN4_PORT1_EN_LBN 77
++  #define TX_VLAN4_PORT1_EN_WIDTH 1
++  #define TX_VLAN4_PORT0_EN_LBN 76
++  #define TX_VLAN4_PORT0_EN_WIDTH 1
++  #define TX_VLAN4_LBN 64
++  #define TX_VLAN4_WIDTH 12
++  #define TX_VLAN3_PORT1_EN_LBN 61
++  #define TX_VLAN3_PORT1_EN_WIDTH 1
++  #define TX_VLAN3_PORT0_EN_LBN 60
++  #define TX_VLAN3_PORT0_EN_WIDTH 1
++  #define TX_VLAN3_LBN 48
++  #define TX_VLAN3_WIDTH 12
++  #define TX_VLAN2_PORT1_EN_LBN 45
++  #define TX_VLAN2_PORT1_EN_WIDTH 1
++  #define TX_VLAN2_PORT0_EN_LBN 44
++  #define TX_VLAN2_PORT0_EN_WIDTH 1
++  #define TX_VLAN2_LBN 32
++  #define TX_VLAN2_WIDTH 12
++  #define TX_VLAN1_PORT1_EN_LBN 29
++  #define TX_VLAN1_PORT1_EN_WIDTH 1
++  #define TX_VLAN1_PORT0_EN_LBN 28
++  #define TX_VLAN1_PORT0_EN_WIDTH 1
++  #define TX_VLAN1_LBN 16
++  #define TX_VLAN1_WIDTH 12
++  #define TX_VLAN0_PORT1_EN_LBN 13
++  #define TX_VLAN0_PORT1_EN_WIDTH 1
++  #define TX_VLAN0_PORT0_EN_LBN 12
++  #define TX_VLAN0_PORT0_EN_WIDTH 1
++  #define TX_VLAN0_LBN 0
++  #define TX_VLAN0_WIDTH 12
++#define TX_FIL_CTL_REG_OFST 0xAF0 // Transmit filter control register: IPFILn (n = 0..31) has PORT0_EN at bit 2n and PORT1_EN at bit 2n+1
++  #define TX_MADR1_FIL_EN_LBN 65
++  #define TX_MADR1_FIL_EN_WIDTH 1
++  #define TX_MADR0_FIL_EN_LBN 64
++  #define TX_MADR0_FIL_EN_WIDTH 1
++  #define TX_IPFIL31_PORT1_EN_LBN 63
++  #define TX_IPFIL31_PORT1_EN_WIDTH 1
++  #define TX_IPFIL31_PORT0_EN_LBN 62
++  #define TX_IPFIL31_PORT0_EN_WIDTH 1
++  #define TX_IPFIL30_PORT1_EN_LBN 61
++  #define TX_IPFIL30_PORT1_EN_WIDTH 1
++  #define TX_IPFIL30_PORT0_EN_LBN 60
++  #define TX_IPFIL30_PORT0_EN_WIDTH 1
++  #define TX_IPFIL29_PORT1_EN_LBN 59
++  #define TX_IPFIL29_PORT1_EN_WIDTH 1
++  #define TX_IPFIL29_PORT0_EN_LBN 58
++  #define TX_IPFIL29_PORT0_EN_WIDTH 1
++  #define TX_IPFIL28_PORT1_EN_LBN 57
++  #define TX_IPFIL28_PORT1_EN_WIDTH 1
++  #define TX_IPFIL28_PORT0_EN_LBN 56
++  #define TX_IPFIL28_PORT0_EN_WIDTH 1
++  #define TX_IPFIL27_PORT1_EN_LBN 55
++  #define TX_IPFIL27_PORT1_EN_WIDTH 1
++  #define TX_IPFIL27_PORT0_EN_LBN 54
++  #define TX_IPFIL27_PORT0_EN_WIDTH 1
++  #define TX_IPFIL26_PORT1_EN_LBN 53
++  #define TX_IPFIL26_PORT1_EN_WIDTH 1
++  #define TX_IPFIL26_PORT0_EN_LBN 52
++  #define TX_IPFIL26_PORT0_EN_WIDTH 1
++  #define TX_IPFIL25_PORT1_EN_LBN 51
++  #define TX_IPFIL25_PORT1_EN_WIDTH 1
++  #define TX_IPFIL25_PORT0_EN_LBN 50
++  #define TX_IPFIL25_PORT0_EN_WIDTH 1
++  #define TX_IPFIL24_PORT1_EN_LBN 49
++  #define TX_IPFIL24_PORT1_EN_WIDTH 1
++  #define TX_IPFIL24_PORT0_EN_LBN 48
++  #define TX_IPFIL24_PORT0_EN_WIDTH 1
++  #define TX_IPFIL23_PORT1_EN_LBN 47
++  #define TX_IPFIL23_PORT1_EN_WIDTH 1
++  #define TX_IPFIL23_PORT0_EN_LBN 46
++  #define TX_IPFIL23_PORT0_EN_WIDTH 1
++  #define TX_IPFIL22_PORT1_EN_LBN 45
++  #define TX_IPFIL22_PORT1_EN_WIDTH 1
++  #define TX_IPFIL22_PORT0_EN_LBN 44
++  #define TX_IPFIL22_PORT0_EN_WIDTH 1
++  #define TX_IPFIL21_PORT1_EN_LBN 43
++  #define TX_IPFIL21_PORT1_EN_WIDTH 1
++  #define TX_IPFIL21_PORT0_EN_LBN 42
++  #define TX_IPFIL21_PORT0_EN_WIDTH 1
++  #define TX_IPFIL20_PORT1_EN_LBN 41
++  #define TX_IPFIL20_PORT1_EN_WIDTH 1
++  #define TX_IPFIL20_PORT0_EN_LBN 40
++  #define TX_IPFIL20_PORT0_EN_WIDTH 1
++  #define TX_IPFIL19_PORT1_EN_LBN 39
++  #define TX_IPFIL19_PORT1_EN_WIDTH 1
++  #define TX_IPFIL19_PORT0_EN_LBN 38
++  #define TX_IPFIL19_PORT0_EN_WIDTH 1
++  #define TX_IPFIL18_PORT1_EN_LBN 37
++  #define TX_IPFIL18_PORT1_EN_WIDTH 1
++  #define TX_IPFIL18_PORT0_EN_LBN 36
++  #define TX_IPFIL18_PORT0_EN_WIDTH 1
++  #define TX_IPFIL17_PORT1_EN_LBN 35
++  #define TX_IPFIL17_PORT1_EN_WIDTH 1
++  #define TX_IPFIL17_PORT0_EN_LBN 34
++  #define TX_IPFIL17_PORT0_EN_WIDTH 1
++  #define TX_IPFIL16_PORT1_EN_LBN 33
++  #define TX_IPFIL16_PORT1_EN_WIDTH 1
++  #define TX_IPFIL16_PORT0_EN_LBN 32
++  #define TX_IPFIL16_PORT0_EN_WIDTH 1
++  #define TX_IPFIL15_PORT1_EN_LBN 31
++  #define TX_IPFIL15_PORT1_EN_WIDTH 1
++  #define TX_IPFIL15_PORT0_EN_LBN 30
++  #define TX_IPFIL15_PORT0_EN_WIDTH 1
++  #define TX_IPFIL14_PORT1_EN_LBN 29
++  #define TX_IPFIL14_PORT1_EN_WIDTH 1
++  #define TX_IPFIL14_PORT0_EN_LBN 28
++  #define TX_IPFIL14_PORT0_EN_WIDTH 1
++  #define TX_IPFIL13_PORT1_EN_LBN 27
++  #define TX_IPFIL13_PORT1_EN_WIDTH 1
++  #define TX_IPFIL13_PORT0_EN_LBN 26
++  #define TX_IPFIL13_PORT0_EN_WIDTH 1
++  #define TX_IPFIL12_PORT1_EN_LBN 25
++  #define TX_IPFIL12_PORT1_EN_WIDTH 1
++  #define TX_IPFIL12_PORT0_EN_LBN 24
++  #define TX_IPFIL12_PORT0_EN_WIDTH 1
++  #define TX_IPFIL11_PORT1_EN_LBN 23
++  #define TX_IPFIL11_PORT1_EN_WIDTH 1
++  #define TX_IPFIL11_PORT0_EN_LBN 22
++  #define TX_IPFIL11_PORT0_EN_WIDTH 1
++  #define TX_IPFIL10_PORT1_EN_LBN 21
++  #define TX_IPFIL10_PORT1_EN_WIDTH 1
++  #define TX_IPFIL10_PORT0_EN_LBN 20
++  #define TX_IPFIL10_PORT0_EN_WIDTH 1
++  #define TX_IPFIL9_PORT1_EN_LBN 19
++  #define TX_IPFIL9_PORT1_EN_WIDTH 1
++  #define TX_IPFIL9_PORT0_EN_LBN 18
++  #define TX_IPFIL9_PORT0_EN_WIDTH 1
++  #define TX_IPFIL8_PORT1_EN_LBN 17
++  #define TX_IPFIL8_PORT1_EN_WIDTH 1
++  #define TX_IPFIL8_PORT0_EN_LBN 16
++  #define TX_IPFIL8_PORT0_EN_WIDTH 1
++  #define TX_IPFIL7_PORT1_EN_LBN 15
++  #define TX_IPFIL7_PORT1_EN_WIDTH 1
++  #define TX_IPFIL7_PORT0_EN_LBN 14
++  #define TX_IPFIL7_PORT0_EN_WIDTH 1
++  #define TX_IPFIL6_PORT1_EN_LBN 13
++  #define TX_IPFIL6_PORT1_EN_WIDTH 1
++  #define TX_IPFIL6_PORT0_EN_LBN 12
++  #define TX_IPFIL6_PORT0_EN_WIDTH 1
++  #define TX_IPFIL5_PORT1_EN_LBN 11
++  #define TX_IPFIL5_PORT1_EN_WIDTH 1
++  #define TX_IPFIL5_PORT0_EN_LBN 10
++  #define TX_IPFIL5_PORT0_EN_WIDTH 1
++  #define TX_IPFIL4_PORT1_EN_LBN 9
++  #define TX_IPFIL4_PORT1_EN_WIDTH 1
++  #define TX_IPFIL4_PORT0_EN_LBN 8
++  #define TX_IPFIL4_PORT0_EN_WIDTH 1
++  #define TX_IPFIL3_PORT1_EN_LBN 7
++  #define TX_IPFIL3_PORT1_EN_WIDTH 1
++  #define TX_IPFIL3_PORT0_EN_LBN 6
++  #define TX_IPFIL3_PORT0_EN_WIDTH 1
++  #define TX_IPFIL2_PORT1_EN_LBN 5
++  #define TX_IPFIL2_PORT1_EN_WIDTH 1
++  #define TX_IPFIL2_PORT0_EN_LBN 4
++  #define TX_IPFIL2_PORT0_EN_WIDTH 1
++  #define TX_IPFIL1_PORT1_EN_LBN 3
++  #define TX_IPFIL1_PORT1_EN_WIDTH 1
++  #define TX_IPFIL1_PORT0_EN_LBN 2
++  #define TX_IPFIL1_PORT0_EN_WIDTH 1
++  #define TX_IPFIL0_PORT1_EN_LBN 1
++  #define TX_IPFIL0_PORT1_EN_WIDTH 1
++  #define TX_IPFIL0_PORT0_EN_LBN 0
++  #define TX_IPFIL0_PORT0_EN_WIDTH 1
++#define TX_IPFIL_TBL_OFST 0xB00 // Transmit IP source address filter table
++  #define TX_IPFIL_MASK_LBN 32
++  #define TX_IPFIL_MASK_WIDTH 32
++  #define TX_IP_SRC_ADR_LBN 0
++  #define TX_IP_SRC_ADR_WIDTH 32
++#define TX_PACE_REG_A1_OFST 0xF80000 // Transmit pace control register
++#define TX_PACE_REG_B0_OFST 0xA90    // Transmit pace control register
++  #define TX_PACE_SB_AF_LBN 19
++  #define TX_PACE_SB_AF_WIDTH 10
++  #define TX_PACE_SB_NOTAF_LBN 9
++  #define TX_PACE_SB_NOTAF_WIDTH 10
++  #define TX_PACE_FB_BASE_LBN 5
++  #define TX_PACE_FB_BASE_WIDTH 4
++  #define TX_PACE_BIN_TH_LBN 0
++  #define TX_PACE_BIN_TH_WIDTH 5
++#define TX_PACE_TBL_A1_OFST 0xF80040 // Transmit pacing table
++#define TX_PACE_TBL_FIRST_QUEUE_A1 4
++#define TX_PACE_TBL_B0_OFST 0xF80000 // Transmit pacing table
++#define TX_PACE_TBL_FIRST_QUEUE_B0 0
++  #define TX_PACE_LBN 0
++  #define TX_PACE_WIDTH 5
++
++//////////////---- EE/Flash Registers C Header ----//////////////
++#define EE_SPI_HCMD_REG_KER_OFST 0x100 // SPI host command register
++#define EE_SPI_HCMD_REG_OFST 0x100 // SPI host command register
++  #define EE_SPI_HCMD_CMD_EN_LBN 31
++  #define EE_SPI_HCMD_CMD_EN_WIDTH 1
++  #define EE_WR_TIMER_ACTIVE_LBN 28
++  #define EE_WR_TIMER_ACTIVE_WIDTH 1
++  #define EE_SPI_HCMD_SF_SEL_LBN 24
++  #define EE_SPI_HCMD_SF_SEL_WIDTH 1
++  #define EE_SPI_HCMD_DABCNT_LBN 16
++  #define EE_SPI_HCMD_DABCNT_WIDTH 5
++  #define EE_SPI_HCMD_READ_LBN 15
++  #define EE_SPI_HCMD_READ_WIDTH 1
++  #define EE_SPI_HCMD_DUBCNT_LBN 12
++  #define EE_SPI_HCMD_DUBCNT_WIDTH 2
++  #define EE_SPI_HCMD_ADBCNT_LBN 8
++  #define EE_SPI_HCMD_ADBCNT_WIDTH 2
++  #define EE_SPI_HCMD_ENC_LBN 0
++  #define EE_SPI_HCMD_ENC_WIDTH 8
++#define EE_SPI_HADR_REG_KER_OFST 0x110 // SPI host address register
++#define EE_SPI_HADR_REG_OFST 0x110 // SPI host address register
++  #define EE_SPI_HADR_DUBYTE_LBN 24
++  #define EE_SPI_HADR_DUBYTE_WIDTH 8
++  #define EE_SPI_HADR_ADR_LBN 0
++  #define EE_SPI_HADR_ADR_WIDTH 24
++#define EE_SPI_HDATA_REG_KER_OFST 0x120 // SPI host data register
++#define EE_SPI_HDATA_REG_OFST 0x120 // SPI host data register
++  #define EE_SPI_HDATA3_LBN 96
++  #define EE_SPI_HDATA3_WIDTH 32
++  #define EE_SPI_HDATA2_LBN 64
++  #define EE_SPI_HDATA2_WIDTH 32
++  #define EE_SPI_HDATA1_LBN 32
++  #define EE_SPI_HDATA1_WIDTH 32
++  #define EE_SPI_HDATA0_LBN 0
++  #define EE_SPI_HDATA0_WIDTH 32
++#define EE_BASE_PAGE_REG_KER_OFST 0x130 // Expansion ROM base mirror register
++#define EE_BASE_PAGE_REG_OFST 0x130 // Expansion ROM base mirror register
++  #define EE_EXP_ROM_WINDOW_BASE_LBN 16
++  #define EE_EXP_ROM_WINDOW_BASE_WIDTH 13
++  #define EE_EXPROM_MASK_LBN 0
++  #define EE_EXPROM_MASK_WIDTH 13
++#define EE_VPD_CFG0_REG_KER_OFST 0x140 // SPI/VPD configuration register
++#define EE_VPD_CFG0_REG_OFST 0x140 // SPI/VPD configuration register
++  #define EE_SF_FASTRD_EN_LBN 127
++  #define EE_SF_FASTRD_EN_WIDTH 1
++  #define EE_SF_CLOCK_DIV_LBN 120
++  #define EE_SF_CLOCK_DIV_WIDTH 7
++  #define EE_VPD_WIP_POLL_LBN 119
++  #define EE_VPD_WIP_POLL_WIDTH 1
++  #define EE_VPDW_LENGTH_LBN 80
++  #define EE_VPDW_LENGTH_WIDTH 15
++  #define EE_VPDW_BASE_LBN 64
++  #define EE_VPDW_BASE_WIDTH 15
++  #define EE_VPD_WR_CMD_EN_LBN 56
++  #define EE_VPD_WR_CMD_EN_WIDTH 8
++  #define EE_VPD_BASE_LBN 32
++  #define EE_VPD_BASE_WIDTH 24
++  #define EE_VPD_LENGTH_LBN 16
++  #define EE_VPD_LENGTH_WIDTH 13
++  #define EE_VPD_AD_SIZE_LBN 8
++  #define EE_VPD_AD_SIZE_WIDTH 5
++  #define EE_VPD_ACCESS_ON_LBN 5
++  #define EE_VPD_ACCESS_ON_WIDTH 1
++#define EE_VPD_SW_CNTL_REG_KER_OFST 0x150 // VPD access SW control register
++#define EE_VPD_SW_CNTL_REG_OFST 0x150 // VPD access SW control register
++  #define EE_VPD_CYCLE_PENDING_LBN 31
++  #define EE_VPD_CYCLE_PENDING_WIDTH 1
++  #define EE_VPD_CYC_WRITE_LBN 28
++  #define EE_VPD_CYC_WRITE_WIDTH 1
++  #define EE_VPD_CYC_ADR_LBN 0
++  #define EE_VPD_CYC_ADR_WIDTH 15
++#define EE_VPD_SW_DATA_REG_KER_OFST 0x160 // VPD access SW data register
++#define EE_VPD_SW_DATA_REG_OFST 0x160 // VPD access SW data register
++  #define EE_VPD_CYC_DAT_LBN 0
++  #define EE_VPD_CYC_DAT_WIDTH 32
diff --cc drivers/xen/sfc_netfront/ef_vi_falcon_desc.h

index 0000000,0000000..8c8404c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/ef_vi_falcon_desc.h
@@@ -1,0 -1,0 +1,43 @@@
++//////////////---- Descriptors C Headers ----//////////////
++// Receive Kernel IP Descriptor
++  #define RX_KER_BUF_SIZE_LBN 48
++  #define RX_KER_BUF_SIZE_WIDTH 14
++  #define RX_KER_BUF_REGION_LBN 46
++  #define RX_KER_BUF_REGION_WIDTH 2
++      #define RX_KER_BUF_REGION0_DECODE 0
++      #define RX_KER_BUF_REGION1_DECODE 1
++      #define RX_KER_BUF_REGION2_DECODE 2
++      #define RX_KER_BUF_REGION3_DECODE 3
++  #define RX_KER_BUF_ADR_LBN 0
++  #define RX_KER_BUF_ADR_WIDTH 46
++// Receive User IP Descriptor
++  #define RX_USR_2BYTE_OFS_LBN 20
++  #define RX_USR_2BYTE_OFS_WIDTH 12
++  #define RX_USR_BUF_ID_LBN 0
++  #define RX_USR_BUF_ID_WIDTH 20
++// Transmit Kernel IP Descriptor
++  #define TX_KER_PORT_LBN 63
++  #define TX_KER_PORT_WIDTH 1
++  #define TX_KER_CONT_LBN 62
++  #define TX_KER_CONT_WIDTH 1
++  #define TX_KER_BYTE_CNT_LBN 48
++  #define TX_KER_BYTE_CNT_WIDTH 14
++  #define TX_KER_BUF_REGION_LBN 46
++  #define TX_KER_BUF_REGION_WIDTH 2
++      #define TX_KER_BUF_REGION0_DECODE 0
++      #define TX_KER_BUF_REGION1_DECODE 1
++      #define TX_KER_BUF_REGION2_DECODE 2
++      #define TX_KER_BUF_REGION3_DECODE 3
++  #define TX_KER_BUF_ADR_LBN 0
++  #define TX_KER_BUF_ADR_WIDTH 46
++// Transmit User IP Descriptor
++  #define TX_USR_PORT_LBN 47
++  #define TX_USR_PORT_WIDTH 1
++  #define TX_USR_CONT_LBN 46
++  #define TX_USR_CONT_WIDTH 1
++  #define TX_USR_BYTE_CNT_LBN 33
++  #define TX_USR_BYTE_CNT_WIDTH 13
++  #define TX_USR_BUF_ID_LBN 13
++  #define TX_USR_BUF_ID_WIDTH 20
++  #define TX_USR_BYTE_OFS_LBN 0
++  #define TX_USR_BYTE_OFS_WIDTH 13
diff --cc drivers/xen/sfc_netfront/ef_vi_falcon_event.h

index 0000000,0000000..abb63f3

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/ef_vi_falcon_event.h
@@@ -1,0 -1,0 +1,123 @@@
++//////////////---- Events Format C Header ----//////////////
++//////////////---- Event entry ----//////////////
++  #define EV_CODE_LBN 60
++  #define EV_CODE_WIDTH 4
++      #define RX_IP_EV_DECODE 0
++      #define TX_IP_EV_DECODE 2
++      #define DRIVER_EV_DECODE 5
++      #define GLOBAL_EV_DECODE 6
++      #define DRV_GEN_EV_DECODE 7
++  #define EV_DATA_LBN 0
++  #define EV_DATA_WIDTH 60
++//////////////---- Receive IP events for both Kernel & User event queues ----//////////////
++  #define RX_EV_PKT_OK_LBN 56
++  #define RX_EV_PKT_OK_WIDTH 1
++  #define RX_EV_BUF_OWNER_ID_ERR_LBN 54
++  #define RX_EV_BUF_OWNER_ID_ERR_WIDTH 1
++  #define RX_EV_IP_HDR_CHKSUM_ERR_LBN 52
++  #define RX_EV_IP_HDR_CHKSUM_ERR_WIDTH 1
++  #define RX_EV_TCP_UDP_CHKSUM_ERR_LBN 51
++  #define RX_EV_TCP_UDP_CHKSUM_ERR_WIDTH 1
++  #define RX_EV_ETH_CRC_ERR_LBN 50
++  #define RX_EV_ETH_CRC_ERR_WIDTH 1
++  #define RX_EV_FRM_TRUNC_LBN 49
++  #define RX_EV_FRM_TRUNC_WIDTH 1
++  #define RX_EV_DRIB_NIB_LBN 48
++  #define RX_EV_DRIB_NIB_WIDTH 1
++  #define RX_EV_TOBE_DISC_LBN 47
++  #define RX_EV_TOBE_DISC_WIDTH 1
++  #define RX_EV_PKT_TYPE_LBN 44
++  #define RX_EV_PKT_TYPE_WIDTH 3
++      #define RX_EV_PKT_TYPE_ETH_DECODE 0
++      #define RX_EV_PKT_TYPE_LLC_DECODE 1
++      #define RX_EV_PKT_TYPE_JUMBO_DECODE 2
++      #define RX_EV_PKT_TYPE_VLAN_DECODE 3
++      #define RX_EV_PKT_TYPE_VLAN_LLC_DECODE 4
++      #define RX_EV_PKT_TYPE_VLAN_JUMBO_DECODE 5
++  #define RX_EV_HDR_TYPE_LBN 42
++  #define RX_EV_HDR_TYPE_WIDTH 2
++      #define RX_EV_HDR_TYPE_TCP_IPV4_DECODE 0
++      #define RX_EV_HDR_TYPE_UDP_IPV4_DECODE 1
++      #define RX_EV_HDR_TYPE_OTHER_IP_DECODE 2
++      #define RX_EV_HDR_TYPE_NON_IP_DECODE 3
++  #define RX_EV_DESC_Q_EMPTY_LBN 41
++  #define RX_EV_DESC_Q_EMPTY_WIDTH 1
++  #define RX_EV_MCAST_HASH_MATCH_LBN 40
++  #define RX_EV_MCAST_HASH_MATCH_WIDTH 1
++  #define RX_EV_MCAST_PKT_LBN 39
++  #define RX_EV_MCAST_PKT_WIDTH 1
++  #define RX_EV_Q_LABEL_LBN 32
++  #define RX_EV_Q_LABEL_WIDTH 5
++  #define RX_JUMBO_CONT_LBN 31
++  #define RX_JUMBO_CONT_WIDTH 1
++  #define RX_SOP_LBN 15
++  #define RX_SOP_WIDTH 1
++  #define RX_PORT_LBN 30
++  #define RX_PORT_WIDTH 1
++  #define RX_EV_BYTE_CNT_LBN 16
++  #define RX_EV_BYTE_CNT_WIDTH 14
++  #define RX_iSCSI_PKT_OK_LBN 14 // NOTE(review): mixed-case "iSCSI" differs from the RX_ISCSI_* names below; spelling kept since users of this header may depend on it
++  #define RX_iSCSI_PKT_OK_WIDTH 1
++  #define RX_ISCSI_DDIG_ERR_LBN 13
++  #define RX_ISCSI_DDIG_ERR_WIDTH 1
++  #define RX_ISCSI_HDIG_ERR_LBN 12
++  #define RX_ISCSI_HDIG_ERR_WIDTH 1
++  #define RX_EV_DESC_PTR_LBN 0
++  #define RX_EV_DESC_PTR_WIDTH 12
++//////////////---- Transmit IP events for both Kernel & User event queues ----//////////////
++  #define TX_EV_PKT_ERR_LBN 38
++  #define TX_EV_PKT_ERR_WIDTH 1
++  #define TX_EV_PKT_TOO_BIG_LBN 37
++  #define TX_EV_PKT_TOO_BIG_WIDTH 1
++  #define TX_EV_Q_LABEL_LBN 32
++  #define TX_EV_Q_LABEL_WIDTH 5
++  #define TX_EV_PORT_LBN 16
++  #define TX_EV_PORT_WIDTH 1
++  #define TX_EV_WQ_FF_FULL_LBN 15
++  #define TX_EV_WQ_FF_FULL_WIDTH 1
++  #define TX_EV_BUF_OWNER_ID_ERR_LBN 14
++  #define TX_EV_BUF_OWNER_ID_ERR_WIDTH 1
++  #define TX_EV_COMP_LBN 12
++  #define TX_EV_COMP_WIDTH 1
++  #define TX_EV_DESC_PTR_LBN 0
++  #define TX_EV_DESC_PTR_WIDTH 12
++//////////////---- Char or Kernel driver events ----//////////////
++  #define DRIVER_EV_SUB_CODE_LBN 56
++  #define DRIVER_EV_SUB_CODE_WIDTH 4
++      #define TX_DESCQ_FLS_DONE_EV_DECODE 0x0
++      #define RX_DESCQ_FLS_DONE_EV_DECODE 0x1
++      #define EVQ_INIT_DONE_EV_DECODE 0x2
++      #define EVQ_NOT_EN_EV_DECODE 0x3
++      #define RX_DESCQ_FLSFF_OVFL_EV_DECODE 0x4
++      #define SRM_UPD_DONE_EV_DECODE 0x5
++      #define WAKE_UP_EV_DECODE 0x6
++      #define TX_PKT_NON_TCP_UDP_DECODE 0x9
++      #define TIMER_EV_DECODE 0xA
++      #define RX_DSC_ERROR_EV_DECODE 0xE
++  #define DRIVER_EV_TX_DESCQ_ID_LBN 0
++  #define DRIVER_EV_TX_DESCQ_ID_WIDTH 12
++  #define DRIVER_EV_RX_DESCQ_ID_LBN 0
++  #define DRIVER_EV_RX_DESCQ_ID_WIDTH 12
++  #define DRIVER_EV_EVQ_ID_LBN 0
++  #define DRIVER_EV_EVQ_ID_WIDTH 12
++  #define DRIVER_TMR_ID_LBN 0
++  #define DRIVER_TMR_ID_WIDTH 12
++  #define DRIVER_EV_SRM_UPD_LBN 0
++  #define DRIVER_EV_SRM_UPD_WIDTH 2
++      #define SRM_CLR_EV_DECODE 0
++      #define SRM_UPD_EV_DECODE 1
++      #define SRM_ILLCLR_EV_DECODE 2
++//////////////---- Global events. Sent to both event queue 0 and 4. ----//////////////
++  #define XFP_PHY_INTR_LBN 10
++  #define XFP_PHY_INTR_WIDTH 1
++  #define XG_PHY_INTR_LBN 9
++  #define XG_PHY_INTR_WIDTH 1
++  #define G_PHY1_INTR_LBN 8
++  #define G_PHY1_INTR_WIDTH 1
++  #define G_PHY0_INTR_LBN 7
++  #define G_PHY0_INTR_WIDTH 1
++//////////////---- Driver generated events ----//////////////
++  #define DRV_GEN_EV_CODE_LBN 60
++  #define DRV_GEN_EV_CODE_WIDTH 4
++  #define DRV_GEN_EV_DATA_LBN 0
++  #define DRV_GEN_EV_DATA_WIDTH 60
diff --cc drivers/xen/sfc_netfront/ef_vi_internal.h

index 0000000,0000000..396ae46

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/ef_vi_internal.h
@@@ -1,0 -1,0 +1,256 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  djr
++ *  \brief  Really-and-truly-honestly internal stuff for libef.
++ *   \date  2004/06/13
++ */
++
++/*! \cidoxg_include_ci_ul */
++#ifndef __CI_EF_VI_INTERNAL_H__
++#define __CI_EF_VI_INTERNAL_H__
++
++
++/* These flags share space with enum ef_vi_flags. */
++#define EF_VI_BUG5692_WORKAROUND  0x10000
++
++
++/* ***********************************************************************
++ * COMPILATION CONTROL FLAGS (see ef_vi.h for "workaround" controls)
++ */
++
++#define EF_VI_DO_MAGIC_CHECKS 1
++
++
++/**********************************************************************
++ * Headers
++ */
++
++#include <etherfabric/ef_vi.h>
++#include "sysdep.h"
++#include "ef_vi_falcon.h"
++
++
++/**********************************************************************
++ * Debugging.
++ */
++
++#ifndef NDEBUG
++/* Debug build: a failed check calls BUG_ON()/BUG(); file, line, x, y are unused. */
++# define _ef_assert(exp, file, line) BUG_ON(!(exp))
++/* Neither expansion ends in ';', so both are safe in an unbraced if/else. */
++# define _ef_assert2(exp, x, y, file, line)  do {     \
++              if (unlikely(!(exp)))           \
++                      BUG();                          \
++      } while (0)
++
++#else
++/* NDEBUG build: both expand to nothing, so their arguments are never evaluated. */
++# define _ef_assert(exp, file, line)
++# define _ef_assert2(e, x, y, file, line)
++
++#endif
++/* Assertion helpers: each one is used as a statement, e.g. ef_assert_lt(i, n); */
++#define ef_assert(a)          do{ _ef_assert((a),__FILE__,__LINE__); } while(0)
++#define ef_assert_equal(a,b)  _ef_assert2((a)==(b),(a),(b),__FILE__,__LINE__)
++#define ef_assert_eq          ef_assert_equal
++#define ef_assert_lt(a,b)     _ef_assert2((a)<(b),(a),(b),__FILE__,__LINE__)
++#define ef_assert_le(a,b)     _ef_assert2((a)<=(b),(a),(b),__FILE__,__LINE__)
++#define ef_assert_nequal(a,b) _ef_assert2((a)!=(b),(a),(b),__FILE__,__LINE__)
++#define ef_assert_ne          ef_assert_nequal
++#define ef_assert_ge(a,b)     _ef_assert2((a)>=(b),(a),(b),__FILE__,__LINE__)
++#define ef_assert_gt(a,b)     _ef_assert2((a)>(b),(a),(b),__FILE__,__LINE__)
++
++/**********************************************************************
++ * Debug checks. ******************************************************
++ **********************************************************************/
++
++#ifdef NDEBUG /* every check below expands to nothing */
++# define EF_VI_MAGIC_SET(p, type)
++# define EF_VI_CHECK_VI(p)
++# define EF_VI_CHECK_EVENT_Q(p)
++# define EF_VI_CHECK_IOBUFSET(p)
++# define EF_VI_CHECK_FILTER(p)
++# define EF_VI_CHECK_SHMBUF(p)
++# define EF_VI_CHECK_PT_EP(p)
++#else /* !NDEBUG */
++# define EF_VI                    0x3 /* type tags handed to EF_VI_MAGIC() as 'type' */
++# define EF_EPLOCK                0x6
++# define EF_IOBUFSET              0x9
++# define EF_FILTER                0xa
++# define EF_SHMBUF                0x11 /* NOTE(review): wider than 4 bits; EF_VI_MAGIC() shifts it left 28, so with a 32-bit unsigned only 0x1 survives */
++/* Magic value: the type tag shifted left by 28, OR'd with the low 28 bits of p. */
++# define EF_VI_MAGIC(p, type)                         \
++      (((unsigned)(type) << 28) |                     \
++       (((unsigned)(intptr_t)(p)) & 0x0fffffffu))
++
++# if !EF_VI_DO_MAGIC_CHECKS /* magic set/check compiled out */
++#  define EF_VI_MAGIC_SET(p, type)
++#  define EF_VI_MAGIC_CHECK(p, type)
++# else
++#  define EF_VI_MAGIC_SET(p, type)                    \
++      do {                                            \
++              (p)->magic = EF_VI_MAGIC((p), (type));  \
++      } while (0)
++/* True when p->magic equals the value EF_VI_MAGIC_SET() would store for this tag. */
++# define EF_VI_MAGIC_OKAY(p, type)                      \
++      ((p)->magic == EF_VI_MAGIC((p), (type)))
++/* Asserts EF_VI_MAGIC_OKAY() through ef_assert(). */
++# define EF_VI_MAGIC_CHECK(p, type)                     \
++      ef_assert(EF_VI_MAGIC_OKAY((p), (type)))
++
++#endif /* EF_VI_DO_MAGIC_CHECKS */
++/* NOTE(review): the EF_VI_CHECK_* macros expand to several statements with no do { } while (0) wrapper, so they are unsafe as the sole body of an unbraced if/else. */
++# define EF_VI_CHECK_VI(p)                    \
++      ef_assert(p);                           \
++      EF_VI_MAGIC_CHECK((p), EF_VI);
++
++# define EF_VI_CHECK_EVENT_Q(p)                       \
++      ef_assert(p);                           \
++      EF_VI_MAGIC_CHECK((p), EF_VI);          \
++      ef_assert((p)->evq_base);               \
++      ef_assert((p)->evq_mask);
++
++# define EF_VI_CHECK_PT_EP(p)                 \
++      ef_assert(p);                           \
++      EF_VI_MAGIC_CHECK((p), EF_VI);          \
++      ef_assert((p)->ep_state);
++/* NOTE(review): unlike its siblings, EF_VI_CHECK_IOBUFSET does not end in ';'. */
++# define EF_VI_CHECK_IOBUFSET(p)              \
++      ef_assert(p);                           \
++      EF_VI_MAGIC_CHECK((p), EF_IOBUFSET)
++
++# define EF_VI_CHECK_FILTER(p)                        \
++      ef_assert(p);                           \
++      EF_VI_MAGIC_CHECK((p), EF_FILTER);
++
++# define EF_VI_CHECK_SHMBUF(p)                        \
++      ef_assert(p);                           \
++      EF_VI_MAGIC_CHECK((p), EF_SHMBUF);
++
++#endif /* NDEBUG */
++
++#ifndef NDEBUG
++# define EF_DRIVER_MAGIC 0x00f00ba4 /* NOTE(review): passed as the 'type' tag of EF_VI_MAGIC_CHECK(), which shifts it left by 28 - confirm this is intended */
++# define EF_ASSERT_THIS_DRIVER_VALID(driver)                          \
++      do{ ef_assert(driver);                                          \
++              EF_VI_MAGIC_CHECK((driver), EF_DRIVER_MAGIC);           \
++              ef_assert((driver)->init);               }while(0)
++/* Same checks applied to ci_driver, which must be declared elsewhere. */
++# define EF_ASSERT_DRIVER_VALID() EF_ASSERT_THIS_DRIVER_VALID(&ci_driver)
++#else /* NDEBUG: driver checks expand to nothing */
++# define EF_ASSERT_THIS_DRIVER_VALID(driver)
++# define EF_ASSERT_DRIVER_VALID()
++#endif
++
++
++/* *************************************
++ * Power of 2 FIFO: ring indices are wrapped by AND-ing with fifo_mask.
++ */
++
++#define EF_VI_FIFO2_M(q, idx)  ((idx) & ((q)->fifo_mask))
++#define ef_vi_fifo2_valid(q)                                          \
++      ((q) && (q)->fifo && (q)->fifo_mask > 0 &&                      \
++       (q)->fifo_rd_i <= (q)->fifo_mask &&                            \
++       (q)->fifo_wr_i <= (q)->fifo_mask && EF_VI_IS_POW2((q)->fifo_mask+1u))
++
++#define ef_vi_fifo2_init(q, cap)                              \
++      do {                                                    \
++              ef_assert(EF_VI_IS_POW2((cap) + 1));            \
++              (q)->fifo_rd_i = (q)->fifo_wr_i = 0u;           \
++              (q)->fifo_mask = (cap); } while (0)
++
++#define ef_vi_fifo2_is_empty(q)  ((q)->fifo_rd_i == (q)->fifo_wr_i)
++#define ef_vi_fifo2_capacity(q)  ((q)->fifo_mask)
++#define ef_vi_fifo2_buf_size(q)  ((q)->fifo_mask + 1u)
++#define ef_vi_fifo2_end(q)       ((q)->fifo + ef_vi_fifo2_buf_size(q))
++#define ef_vi_fifo2_peek(q)      ((q)->fifo[(q)->fifo_rd_i])
++#define ef_vi_fifo2_poke(q)      ((q)->fifo[(q)->fifo_wr_i])
++#define ef_vi_fifo2_num(q)  EF_VI_FIFO2_M((q), (q)->fifo_wr_i - (q)->fifo_rd_i)
++
++#define ef_vi_fifo2_wr_prev(q)   do {                                 \
++      (q)->fifo_wr_i = EF_VI_FIFO2_M((q), (q)->fifo_wr_i - 1u); } while (0)
++#define ef_vi_fifo2_wr_next(q)   do {                                 \
++      (q)->fifo_wr_i = EF_VI_FIFO2_M((q), (q)->fifo_wr_i + 1u); } while (0)
++#define ef_vi_fifo2_rd_adv(q, cnt)   do {                             \
++      (q)->fifo_rd_i = EF_VI_FIFO2_M((q), (q)->fifo_rd_i + (cnt)); } while (0)
++#define ef_vi_fifo2_rd_prev(q)   do {                                 \
++      (q)->fifo_rd_i = EF_VI_FIFO2_M((q), (q)->fifo_rd_i - 1u); } while (0)
++#define ef_vi_fifo2_rd_next(q)   do {                                 \
++      (q)->fifo_rd_i = EF_VI_FIFO2_M((q), (q)->fifo_rd_i + 1u); } while (0)
++
++#define ef_vi_fifo2_put(q, val)  do {                                 \
++      ef_vi_fifo2_poke(q) = (val); ef_vi_fifo2_wr_next(q); } while (0)
++#define ef_vi_fifo2_get(q, out)  do {                                 \
++      *(out) = ef_vi_fifo2_peek(q); ef_vi_fifo2_rd_next(q); } while (0)
++
++
++/* *********************************************************************
++ * Eventq handling
++ */
++
++/* One 64-bit event entry, accessible whole or as two 32-bit words. */
++typedef union {
++      uint64_t u64;
++      struct {
++              uint32_t a, b;
++      } opaque;
++} ef_vi_event;
++
++/* Byte offset of the entry i events behind evq_ptr, wrapped by evq_mask. */
++#define EF_VI_EVENT_OFFSET(q, i)                                      \
++      (((q)->evq_state->evq_ptr - sizeof(ef_vi_event) * (i)) & (q)->evq_mask)
++/* Pointer to that entry, taken relative to evq_base. */
++#define EF_VI_EVENT_PTR(q, i)                                           \
++      ((ef_vi_event *)((q)->evq_base + EF_VI_EVENT_OFFSET((q), (i))))
++
++/* *********************************************************************
++ * Miscellaneous goodies
++ */
++#ifdef NDEBUG
++# define EF_VI_DEBUG(x) /* debug-only code is dropped */
++#else
++# define EF_VI_DEBUG(x)            x /* debug-only code is emitted verbatim */
++#endif
++/* Alignment helpers; the masks only round correctly when align is a power of two. NOTE(review): ~((align)-1u) is computed in align's promoted type - confirm callers never align a value wider than align. */
++#define EF_VI_ROUND_UP(i, align)   (((i)+(align)-1u) & ~((align)-1u))
++#define EF_VI_ALIGN_FWD(p, align)  (((p)+(align)-1u) & ~((align)-1u))
++#define EF_VI_ALIGN_BACK(p, align) ((p) & ~((align)-1u))
++#define EF_VI_PTR_ALIGN_BACK(p, align) /* casts both operands to intptr_t first */ \
++      ((char*)EF_VI_ALIGN_BACK(((intptr_t)(p)), ((intptr_t)(align))))
++#define EF_VI_IS_POW2(x)           ((x) && ! ((x) & ((x) - 1))) /* non-zero with exactly one bit set */
++
++
++/* ********************************************************************
++ * Internal entry points: declared here, defined outside this header. */
++
++extern void falcon_vi_init(ef_vi*, void* vvis ) EF_VI_HF;
++extern void ef_eventq_state_init(ef_vi* evq) EF_VI_HF;
++extern void __ef_init(void) EF_VI_HF;
++
++
++#endif  /* __CI_EF_VI_INTERNAL_H__ */
++
diff --cc drivers/xen/sfc_netfront/etherfabric/ef_vi.h

index 0000000,0000000..6b1bef0

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/etherfabric/ef_vi.h
@@@ -1,0 -1,0 +1,647 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ *  \brief  Virtual Interface
++ *   \date  2007/05/16
++ */
++
++#ifndef __EFAB_EF_VI_H__
++#define __EFAB_EF_VI_H__
++
++
++/**********************************************************************
++ * Primitive types ****************************************************
++ **********************************************************************/
++
++/* We standardise on the types from stdint.h and synthesise these types
++ * for compilers/platforms that don't provide them */
++
++#  include <linux/types.h>
++# define EF_VI_ALIGN(x) __attribute__ ((aligned (x)))
++# define ef_vi_inline static inline
++
++
++
++/**********************************************************************
++ * Types **************************************************************
++ **********************************************************************/
++
++typedef uint32_t                ef_eventq_ptr;
++
++typedef uint64_t                ef_addr;
++typedef char*                   ef_vi_ioaddr_t;
++
++/**********************************************************************
++ * ef_event ***********************************************************
++ **********************************************************************/
++
++/*! \i_ef_vi A DMA request identifier.
++**
++** This is an integer token specified by the transport and associated
++** with a DMA request.  It is returned to the VI user with DMA completion
++** events.  It is typically used to identify the buffer associated with
++** the transfer.
++*/
++typedef int                   ef_request_id;
++
++typedef union {
++      uint64_t  u64[1];
++      uint32_t  u32[2];
++} ef_vi_qword;
++
++typedef ef_vi_qword             ef_hw_event;
++
++#define EF_REQUEST_ID_BITS      16u
++#define EF_REQUEST_ID_MASK      ((1u << EF_REQUEST_ID_BITS) - 1u)
++
++/*! \i_ef_event An [ef_event] is a token that identifies something that
++** has happened.  Examples include packets received, packets transmitted
++** and errors.
++*/
++typedef union {
++      struct {
++              ef_hw_event    ev;
++              unsigned       type       :16;
++      } generic;
++      struct {
++              ef_hw_event    ev;
++              unsigned       type       :16;
++              /*ef_request_id  request_id :EF_REQUEST_ID_BITS;*/
++              unsigned       q_id       :16;
++              unsigned       len        :16;
++              unsigned       flags      :16;
++      } rx;
++      struct {  /* This *must* have same layout as [rx]. */
++              ef_hw_event    ev;
++              unsigned       type       :16;
++              /*ef_request_id  request_id :EF_REQUEST_ID_BITS;*/
++              unsigned       q_id       :16;
++              unsigned       len        :16;
++              unsigned       flags      :16;
++              unsigned       subtype    :16;
++      } rx_discard;
++      struct {
++              ef_hw_event    ev;
++              unsigned       type       :16;
++              /*ef_request_id  request_id :EF_REQUEST_ID_BITS;*/
++              unsigned       q_id       :16;
++      } tx;
++      struct {
++              ef_hw_event    ev;
++              unsigned       type       :16;
++              /*ef_request_id  request_id :EF_REQUEST_ID_BITS;*/
++              unsigned       q_id       :16;
++              unsigned       subtype    :16;
++      } tx_error;
++      struct {
++              ef_hw_event    ev;
++              unsigned       type       :16;
++              unsigned       q_id       :16;
++      } rx_no_desc_trunc;
++      struct {
++              ef_hw_event    ev;
++              unsigned       type       :16;
++              unsigned       data;
++      } sw;
++} ef_event;
++
++
++#define EF_EVENT_TYPE(e)        ((e).generic.type)
++enum {
++      /** Good data was received. */
++      EF_EVENT_TYPE_RX,
++      /** Packets have been sent. */
++      EF_EVENT_TYPE_TX,
++      /** Data received and buffer consumed, but something is wrong. */
++      EF_EVENT_TYPE_RX_DISCARD,
++      /** Transmit of packet failed. */
++      EF_EVENT_TYPE_TX_ERROR,
++      /** Received packet was truncated due to lack of descriptors. */
++      EF_EVENT_TYPE_RX_NO_DESC_TRUNC,
++      /** Software generated event. */
++      EF_EVENT_TYPE_SW,
++      /** Event queue overflow. */
++      EF_EVENT_TYPE_OFLOW,
++};
++
++#define EF_EVENT_RX_BYTES(e)    ((e).rx.len)
++#define EF_EVENT_RX_Q_ID(e)     ((e).rx.q_id)
++#define EF_EVENT_RX_CONT(e)     ((e).rx.flags & EF_EVENT_FLAG_CONT)
++#define EF_EVENT_RX_SOP(e)      ((e).rx.flags & EF_EVENT_FLAG_SOP)
++#define EF_EVENT_RX_ISCSI_OKAY(e) ((e).rx.flags & EF_EVENT_FLAG_ISCSI_OK)
++#define EF_EVENT_FLAG_SOP       0x1
++#define EF_EVENT_FLAG_CONT      0x2
++#define EF_EVENT_FLAG_ISCSI_OK  0x4
++
++#define EF_EVENT_TX_Q_ID(e)     ((e).tx.q_id)
++
++#define EF_EVENT_RX_DISCARD_Q_ID(e)  ((e).rx_discard.q_id)
++#define EF_EVENT_RX_DISCARD_LEN(e)   ((e).rx_discard.len)
++#define EF_EVENT_RX_DISCARD_TYPE(e)  ((e).rx_discard.subtype)
++enum {
++      EF_EVENT_RX_DISCARD_CSUM_BAD,
++      EF_EVENT_RX_DISCARD_CRC_BAD,
++      EF_EVENT_RX_DISCARD_TRUNC,
++      EF_EVENT_RX_DISCARD_RIGHTS,
++      EF_EVENT_RX_DISCARD_OTHER,
++};
++
++#define EF_EVENT_TX_ERROR_Q_ID(e)    ((e).tx_error.q_id)
++#define EF_EVENT_TX_ERROR_TYPE(e)    ((e).tx_error.subtype)
++enum {
++      EF_EVENT_TX_ERROR_RIGHTS,
++      EF_EVENT_TX_ERROR_OFLOW,
++      EF_EVENT_TX_ERROR_2BIG,
++      EF_EVENT_TX_ERROR_BUS,
++};
++
++#define EF_EVENT_RX_NO_DESC_TRUNC_Q_ID(e)  ((e).rx_no_desc_trunc.q_id)
++
++#define EF_EVENT_SW_DATA_MASK   0xffff
++#define EF_EVENT_SW_DATA(e)     ((e).sw.data)
++
++#define EF_EVENT_FMT            "[ev:%x:%08x:%08x]"
++#define EF_EVENT_PRI_ARG(e)     (unsigned) (e).generic.type,    \
++              (unsigned) (e).generic.ev.u32[1],               \
++              (unsigned) (e).generic.ev.u32[0]
++
++#define EF_GET_HW_EV(e)         ((e).generic.ev)
++#define EF_GET_HW_EV_PTR(e)     (&(e).generic.ev)
++#define EF_GET_HW_EV_U64(e)     ((e).generic.ev.u64[0])
++
++
++/* ***************** */
++
++/*! Used by netif shared state. Must use types of explicit size. */
++typedef struct {
++      uint16_t              rx_last_desc_ptr;   /* for RX duplicates       */
++      uint8_t               bad_sop;            /* bad SOP detected        */
++      uint8_t               frag_num;           /* next fragment #, 0=>SOP */
++} ef_rx_dup_state_t;
++
++
++/* Max number of ports on any SF NIC. */
++#define EFAB_DMAQS_PER_EVQ_MAX 32
++
++typedef struct {
++      ef_eventq_ptr           evq_ptr;
++      int32_t               trashed;
++      ef_rx_dup_state_t     rx_dup_state[EFAB_DMAQS_PER_EVQ_MAX];
++} ef_eventq_state;
++
++
++/*! \i_ef_base [ef_iovec] is similar to the standard [struct iovec].  An
++** array of these is used to designate a scatter/gather list of I/O
++** buffers.
++*/
++typedef struct {
++      ef_addr                       iov_base EF_VI_ALIGN(8);
++      unsigned                      iov_len;
++} ef_iovec;
++
++/* Falcon constants */
++#define TX_EV_DESC_PTR_LBN 0
++
++
++/**********************************************************************
++ * ef_vi **************************************************************
++ **********************************************************************/
++
++enum ef_vi_flags {
++      EF_VI_RX_SCATTER        = 0x1,
++      EF_VI_ISCSI_RX_HDIG     = 0x2,
++      EF_VI_ISCSI_TX_HDIG     = 0x4,
++      EF_VI_ISCSI_RX_DDIG     = 0x8,
++      EF_VI_ISCSI_TX_DDIG     = 0x10,
++      EF_VI_TX_PHYS_ADDR      = 0x20,
++      EF_VI_RX_PHYS_ADDR      = 0x40,
++      EF_VI_TX_IP_CSUM_DIS    = 0x80,
++      EF_VI_TX_TCPUDP_CSUM_DIS= 0x100,
++      EF_VI_TX_TCPUDP_ONLY    = 0x200,
++      /* Flags in range 0xXXXX0000 are for internal use. */
++};
++
++typedef struct {
++      uint32_t  added;
++      uint32_t  removed;
++} ef_vi_txq_state;
++
++typedef struct {
++      uint32_t  added;
++      uint32_t  removed;
++} ef_vi_rxq_state;
++
++typedef struct {
++      uint32_t         mask;
++      void*            doorbell;
++      void*            descriptors;
++      uint16_t*        ids;
++      unsigned         misalign_mask;
++} ef_vi_txq;
++
++typedef struct {
++      uint32_t         mask;
++      void*            doorbell;
++      void*            descriptors;
++      uint16_t*        ids;
++} ef_vi_rxq;
++
++typedef struct {
++      ef_eventq_state  evq;
++      ef_vi_txq_state  txq;
++      ef_vi_rxq_state  rxq;
++      /* Followed by request id fifos. */
++} ef_vi_state;
++
++/*! \i_ef_vi  A virtual interface.
++**
++** An [ef_vi] represents a virtual interface on a specific NIC.  A
++** virtual interface is a collection of an event queue and two DMA queues
++** used to pass Ethernet frames between the transport implementation and
++** the network.
++*/
++typedef struct ef_vi {
++      unsigned                        magic;
++
++      unsigned                      vi_resource_id;
++      unsigned                      vi_resource_handle_hack;
++      unsigned                      vi_i;
++
++      char*                           vi_mem_mmap_ptr;
++      int                           vi_mem_mmap_bytes;
++      char*                           vi_io_mmap_ptr;
++      int                           vi_io_mmap_bytes;
++
++      ef_eventq_state*              evq_state;
++      char*                         evq_base;
++      unsigned                      evq_mask;
++      ef_vi_ioaddr_t                evq_timer_reg;
++
++      ef_vi_txq                     vi_txq;
++      ef_vi_rxq                     vi_rxq;
++      ef_vi_state*                  ep_state;
++      enum ef_vi_flags              vi_flags;
++} ef_vi;
++
++
++enum ef_vi_arch {
++      EF_VI_ARCH_FALCON,
++};
++
++
++struct ef_vi_nic_type {
++      unsigned char  arch;
++      char           variant;
++      unsigned char  revision;
++};
++
++
++/* This structure is opaque to the client & used to pass mapping data
++ * from the resource manager to the ef_vi lib. for ef_vi_init().
++ */
++struct vi_mappings {
++      uint32_t         signature;
++# define VI_MAPPING_VERSION   0x02  /*Byte: Increment me if struct altered*/
++# define VI_MAPPING_SIGNATURE (0xBA1150 + VI_MAPPING_VERSION)
++
++      struct ef_vi_nic_type nic_type;
++
++      int              vi_instance;
++
++      unsigned         evq_bytes;
++      char*            evq_base;
++      ef_vi_ioaddr_t   evq_timer_reg;
++
++      unsigned         rx_queue_capacity;
++      ef_vi_ioaddr_t   rx_dma_ef1;
++      char*            rx_dma_falcon;
++      ef_vi_ioaddr_t   rx_bell;
++
++      unsigned         tx_queue_capacity;
++      ef_vi_ioaddr_t   tx_dma_ef1;
++      char*            tx_dma_falcon;
++      ef_vi_ioaddr_t   tx_bell;
++};
++/* This is used by clients to allocate a suitably sized buffer for the 
++ * resource manager to fill & ef_vi_init() to use. */
++#define VI_MAPPINGS_SIZE (sizeof(struct vi_mappings))
++
++
++/**********************************************************************
++ * ef_config **********************************************************
++ **********************************************************************/
++
++struct ef_config_t {
++      int   log;                    /* debug logging level          */
++};
++
++extern struct ef_config_t  ef_config;
++
++
++/**********************************************************************
++ * ef_vi **************************************************************
++ **********************************************************************/
++
++/* Initialise [data_area] with information required to initialise an ef_vi.
++ * In the following, an unused param should be set to NULL. Note the case
++ * marked (*) of [iobuf_mmap] for falcon/driver; for normal driver this
++ * must be NULL.
++ * NOTE(review): the \param names below are stale vs. the prototype - confirm.
++ * \param  data_area     [in,out] required, must ref at least VI_MAPPINGS_SIZE 
++ *                                bytes
++ * \param  evq_capacity  [in] number of events in event queue.  Specify 0 for
++ *                            no event queue.
++ * \param  rxq_capacity  [in] number of descriptors in RX DMA queue.  Specify
++ *                            0 for no RX queue.
++ * \param  txq_capacity  [in] number of descriptors in TX DMA queue.  Specify
++ *                            0 for no TX queue.
++ * \param  mmap_info     [in] mem-map info for resource
++ * \param  io_mmap       [in] ef1,    required
++ *                            falcon, required
++ * \param  iobuf_mmap    [in] ef1,    UL: unused
++ *                            falcon, UL: required
++ */
++extern void ef_vi_init_mapping_vi(void* data_area, struct ef_vi_nic_type,
++                                  unsigned rxq_capacity,
++                                  unsigned txq_capacity, int instance,
++                                  void* io_mmap, void* iobuf_mmap_rx,
++                                  void* iobuf_mmap_tx, enum ef_vi_flags);
++
++
++extern void ef_vi_init_mapping_evq(void* data_area, struct ef_vi_nic_type,
++                                   int instance, unsigned evq_bytes,
++                                   void* base, void* timer_reg);
++
++ef_vi_inline unsigned ef_vi_resource_id(ef_vi* vi)
++{ 
++      return vi->vi_resource_id; 
++}
++
++ef_vi_inline enum ef_vi_flags ef_vi_flags(ef_vi* vi)
++{ 
++      return vi->vi_flags; 
++}
++
++
++/**********************************************************************
++ * Receive interface **************************************************
++ **********************************************************************/
++
++/*! \i_ef_vi Returns the amount of space in the RX descriptor ring.
++**
++** \return the amount of space in the queue.
++*/
++ef_vi_inline int ef_vi_receive_space(ef_vi* vi) 
++{
++      ef_vi_rxq_state* qs = &vi->ep_state->rxq;
++      return vi->vi_rxq.mask - (qs->added - qs->removed);
++}
++
++
++/*! \i_ef_vi Returns the fill level of the RX descriptor ring.
++**
++** \return the fill level of the queue.
++*/
++ef_vi_inline int ef_vi_receive_fill_level(ef_vi* vi) 
++{
++      ef_vi_rxq_state* qs = &vi->ep_state->rxq;
++      return qs->added - qs->removed;
++}
++
++
++ef_vi_inline int ef_vi_receive_capacity(ef_vi* vi)
++{ 
++      return vi->vi_rxq.mask;
++}
++
++/*! \i_ef_vi  Complete a receive operation.
++**
++** When a receive completion event is received, it should be passed to
++** this function.  The request-id for the buffer that the packet was
++** delivered to is returned.
++**
++** After this function returns, more space may be available in the
++** receive queue.
++*/
++extern ef_request_id ef_vi_receive_done(const ef_vi*, const ef_event*);
++
++/*! \i_ef_vi  Return request ID indicated by a receive event
++ */
++ef_vi_inline ef_request_id ef_vi_receive_request_id(const ef_vi* vi,
++                                                    const ef_event* ef_ev)
++{
++      const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*ef_ev);
++      return ev->u32[0] & vi->vi_rxq.mask;
++}
++  
++
++/*! \i_ef_vi  Form a receive descriptor.
++**
++** If \c initial_rx_bytes is zero use a reception size at least as large
++** as an MTU.
++*/
++extern int ef_vi_receive_init(ef_vi* vi, ef_addr addr, ef_request_id dma_id,
++                              int initial_rx_bytes);
++
++/*! \i_ef_vi  Submit initialised receive descriptors to the NIC. */
++extern void ef_vi_receive_push(ef_vi* vi);
++
++/*! \i_ef_vi  Post a buffer on the receive queue.
++**
++**   \return 0 on success, or -EAGAIN if the receive queue is full
++*/
++extern int ef_vi_receive_post(ef_vi*, ef_addr addr,
++                            ef_request_id dma_id);
++
++/**********************************************************************
++ * Transmit interface *************************************************
++ **********************************************************************/
++
++/*! \i_ef_vi Return the amount of space (in descriptors) in the transmit
++**           queue.
++**
++** \return the amount of space in the queue (in descriptors)
++*/
++ef_vi_inline int ef_vi_transmit_space(ef_vi* vi) 
++{
++      ef_vi_txq_state* qs = &vi->ep_state->txq;
++      return vi->vi_txq.mask - (qs->added - qs->removed);
++}
++
++
++/*! \i_ef_vi Returns the fill level of the TX descriptor ring.
++**
++** \return the fill level of the queue.
++*/
++ef_vi_inline int ef_vi_transmit_fill_level(ef_vi* vi)
++{
++      ef_vi_txq_state* qs = &vi->ep_state->txq;
++      return qs->added - qs->removed;
++}
++
++
++/*! \i_ef_vi Returns the total capacity of the TX descriptor ring.
++**
++** \return the capacity of the queue.
++*/
++ef_vi_inline int ef_vi_transmit_capacity(ef_vi* vi)
++{ 
++      return vi->vi_txq.mask;
++}
++
++
++/*! \i_ef_vi  Transmit a packet.
++**
++**   \param bytes must be greater than ETH_ZLEN.
++**   \return -EAGAIN if the transmit queue is full, or 0 on success
++*/
++extern int ef_vi_transmit(ef_vi*, ef_addr, int bytes, ef_request_id dma_id);
++
++/*! \i_ef_vi  Transmit a packet using a gather list.
++**
++**   \param iov_len must be greater than zero
++**   \param iov the first entry must be non-zero in length (others need not be)
++**
++**   \return -EAGAIN if the queue is full, or 0 on success
++*/
++extern int ef_vi_transmitv(ef_vi*, const ef_iovec* iov, int iov_len,
++                           ef_request_id dma_id);
++
++/*! \i_ef_vi  Initialise a DMA request.
++**
++** \return -EAGAIN if the queue is full, or 0 on success
++*/
++extern int ef_vi_transmit_init(ef_vi*, ef_addr, int bytes,
++                               ef_request_id dma_id);
++
++/*! \i_ef_vi  Initialise a DMA request.
++**
++** \return -EAGAIN if the queue is full, or 0 on success
++*/
++extern int ef_vi_transmitv_init(ef_vi*, const ef_iovec*, int iov_len,
++                                ef_request_id dma_id);
++
++/*! \i_ef_vi  Submit DMA requests to the NIC.
++**
++** The DMA requests must have been initialised using
++** ef_vi_transmit_init() or ef_vi_transmitv_init().
++*/
++extern void ef_vi_transmit_push(ef_vi*);
++
++
++/*! \i_ef_vi Maximum number of transmit completions per transmit event. */
++#define EF_VI_TRANSMIT_BATCH  64
++
++/*! \i_ef_vi Determine the set of [ef_request_id]s for each DMA request
++**           which has been completed by a given transmit completion
++**           event.
++**
++** \param ids must point to an array of length EF_VI_TRANSMIT_BATCH
++** \return the number of valid [ef_request_id]s (can be zero)
++*/
++extern int ef_vi_transmit_unbundle(ef_vi* ep, const ef_event*,
++                                   ef_request_id* ids);
++
++
++/*! \i_ef_event Returns true if ef_eventq_poll() will return event(s). */
++extern int ef_eventq_has_event(ef_vi* vi);
++
++/*! \i_ef_event Returns true if there are quite a few events in the event
++** queue.
++**
++** This looks ahead in the event queue, so has the property that it will
++** not ping-pong a cache-line when it is called concurrently with events
++** being delivered.
++*/
++extern int ef_eventq_has_many_events(ef_vi* evq, int look_ahead);
++
++/*! Type of function to handle unknown events arriving on event queue
++**  Return CI_TRUE iff the event has been handled.
++*/
++typedef int/*bool*/ ef_event_handler_fn(void* priv, ef_vi* evq, ef_event* ev);
++
++/*! Standard poll exception routine */
++extern int/*bool*/ ef_eventq_poll_exception(void* priv, ef_vi* evq,
++                                            ef_event* ev);
++
++/*! \i_ef_event  Retrieve events from the event queue, handle RX/TX events
++**  and pass any others to an exception handler function
++**
++**   \return The number of events retrieved.
++*/
++extern int ef_eventq_poll_evs(ef_vi* evq, ef_event* evs, int evs_len,
++                              ef_event_handler_fn *exception, void *expt_priv);
++
++/*! \i_ef_event  Retrieve events from the event queue.
++**
++**   \return The number of events retrieved.
++*/
++ef_vi_inline int ef_eventq_poll(ef_vi* evq, ef_event* evs, int evs_len)
++{
++      return ef_eventq_poll_evs(evq, evs, evs_len,
++                            &ef_eventq_poll_exception, (void*)0);
++}
++
++/*! \i_ef_event Returns the capacity of an event queue. */
++ef_vi_inline int ef_eventq_capacity(ef_vi* vi) 
++{
++      return (vi->evq_mask + 1u) / sizeof(ef_hw_event);
++}
++
++/* Returns the instance ID of [vi] */
++ef_vi_inline unsigned ef_vi_instance(ef_vi* vi)
++{ return vi->vi_i; }
++
++
++/**********************************************************************
++ * Initialisation *****************************************************
++ **********************************************************************/
++
++/*! Return size of state buffer of an initialised VI. */
++extern int ef_vi_state_bytes(ef_vi*);
++
++/*! Return size of buffer needed for VI state given sizes of RX and TX
++** DMA queues.  Queue sizes must be legal sizes (power of 2), or 0 (no
++** queue).
++*/
++extern int ef_vi_calc_state_bytes(int rxq_size, int txq_size);
++
++/*! Initialise [ef_vi] from the provided resources. [vvis] must have been
++** created by ef_make_vi_data() & remains owned by the caller.
++*/
++extern void ef_vi_init(ef_vi*, void* vi_info, ef_vi_state* state,
++                       ef_eventq_state* evq_state, enum ef_vi_flags);
++
++extern void ef_vi_state_init(ef_vi*);
++extern void ef_eventq_state_init(ef_vi*);
++
++/*! Convert an efhw device arch to ef_vi_arch, or return -1 if not
++** recognised.
++*/
++extern int  ef_vi_arch_from_efhw_arch(int efhw_arch);
++
++
++#endif /* __EFAB_EF_VI_H__ */
diff --cc drivers/xen/sfc_netfront/falcon_event.c

index 0000000,0000000..dd9cc15

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/falcon_event.c
@@@ -1,0 -1,0 +1,346 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  djr
++ *  \brief  Routine to poll event queues.
++ *   \date  2003/03/04
++ */
++
++/*! \cidoxg_lib_ef */
++#include "ef_vi_internal.h"
++
++/* Be worried about this on byteswapped machines */
++/* Due to crazy chipsets, we see the event words being written in
++** arbitrary order (bug4539).  So test for presence of event must ensure
++** that both halves have changed from the null.
++*/
++# define EF_VI_IS_EVENT(evp)                                          \
++      ( (((evp)->opaque.a != (uint32_t)-1) &&                         \
++         ((evp)->opaque.b != (uint32_t)-1)) )
++
++
++#ifdef NDEBUG
++# define IS_DEBUG 0
++#else
++# define IS_DEBUG 1
++#endif
++
++
++/*! Check for RX events with inconsistent SOP/CONT
++**
++** Returns true if this event should be discarded
++*/
++ef_vi_inline int ef_eventq_is_rx_sop_cont_bad_efab(ef_vi* vi,
++                                                 const ef_vi_qword* ev)
++{
++      ef_rx_dup_state_t* rx_dup_state;
++      uint8_t* bad_sop;
++
++      unsigned label = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
++      unsigned sop   = QWORD_TEST_BIT(RX_SOP, *ev);
++  
++      ef_assert(vi);
++      ef_assert_lt(label, EFAB_DMAQS_PER_EVQ_MAX);
++
++      rx_dup_state = &vi->evq_state->rx_dup_state[label];
++      bad_sop = &rx_dup_state->bad_sop;
++
++      if( ! ((vi->vi_flags & EF_VI_BUG5692_WORKAROUND) || IS_DEBUG) ) {
++              *bad_sop = (*bad_sop && !sop);
++      }
++      else {
++              unsigned cont  = QWORD_TEST_BIT(RX_JUMBO_CONT, *ev);
++              uint8_t *frag_num = &rx_dup_state->frag_num;
++
++              /* bad_sop should latch till the next sop */
++              *bad_sop = (*bad_sop && !sop) || ( !!sop != (*frag_num==0) );
++
++              /* we do not check the number of bytes relative to the
++               * fragment number and size of the user rx buffer here
++               * because we don't know the size of the user rx
++               * buffer - we probably should perform this check in
++               * the nearest code calling this though.
++               */
++              *frag_num = cont ? (*frag_num + 1) : 0;
++      }
++
++      return *bad_sop;
++}
++
++/*! Returns 1 (filling [ev_out]) if [ev] repeats the last RX desc_ptr, else 0 */
++ef_vi_inline int falcon_rx_check_dup(ef_vi* evq, ef_event* ev_out,
++                                   const ef_vi_qword* ev)
++{
++      unsigned q_id = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
++      uint16_t desc_ptr = QWORD_GET_U(RX_EV_DESC_PTR, *ev);
++      ef_rx_dup_state_t* rx_dup_state = &evq->evq_state->rx_dup_state[q_id];
++
++      if(likely( desc_ptr != rx_dup_state->rx_last_desc_ptr )) {
++              rx_dup_state->rx_last_desc_ptr = desc_ptr;
++              return 0;
++      }
++      /* Repeated desc_ptr on this queue: report it as RX_NO_DESC_TRUNC. */
++      rx_dup_state->rx_last_desc_ptr = desc_ptr;
++      rx_dup_state->bad_sop = 1;
++#ifndef NDEBUG
++      rx_dup_state->frag_num = 0;
++#endif
++      BUG_ON(!QWORD_TEST_BIT(RX_EV_FRM_TRUNC, *ev));
++      BUG_ON( QWORD_TEST_BIT(RX_EV_PKT_OK, *ev));
++      BUG_ON(QWORD_GET_U(RX_EV_BYTE_CNT, *ev) != 0);
++      ev_out->rx_no_desc_trunc.type = EF_EVENT_TYPE_RX_NO_DESC_TRUNC;
++      ev_out->rx_no_desc_trunc.q_id = q_id;
++      return 1;
++}
++
++
++ef_vi_inline void falcon_rx_event(ef_event* ev_out, const ef_vi_qword* ev)
++{
++      if(likely( QWORD_TEST_BIT(RX_EV_PKT_OK, *ev) )) {
++              ev_out->rx.type = EF_EVENT_TYPE_RX;
++              ev_out->rx.q_id = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
++              ev_out->rx.len  = QWORD_GET_U(RX_EV_BYTE_CNT, *ev);
++              if( QWORD_TEST_BIT(RX_SOP, *ev) )
++                      ev_out->rx.flags = EF_EVENT_FLAG_SOP;
++              else
++                      ev_out->rx.flags = 0;
++              if( QWORD_TEST_BIT(RX_JUMBO_CONT, *ev) )
++                      ev_out->rx.flags |= EF_EVENT_FLAG_CONT;
++              if( QWORD_TEST_BIT(RX_iSCSI_PKT_OK, *ev) )
++                      ev_out->rx.flags |= EF_EVENT_FLAG_ISCSI_OK;
++      }
++      else {
++              ev_out->rx_discard.type = EF_EVENT_TYPE_RX_DISCARD;
++              ev_out->rx_discard.q_id = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
++              ev_out->rx_discard.len  = QWORD_GET_U(RX_EV_BYTE_CNT, *ev);
++#if 1  /* hack for ptloop compatibility: ?? TODO purge */
++              if( QWORD_TEST_BIT(RX_SOP, *ev) )
++                      ev_out->rx_discard.flags = EF_EVENT_FLAG_SOP;
++              else
++                      ev_out->rx_discard.flags = 0;
++              if( QWORD_TEST_BIT(RX_JUMBO_CONT, *ev) )
++                      ev_out->rx_discard.flags |= EF_EVENT_FLAG_CONT;
++              if( QWORD_TEST_BIT(RX_iSCSI_PKT_OK, *ev) )
++                      ev_out->rx_discard.flags |= EF_EVENT_FLAG_ISCSI_OK;
++#endif
++              /* Order matters here: more fundamental errors first. */
++              if( QWORD_TEST_BIT(RX_EV_BUF_OWNER_ID_ERR, *ev) )
++                      ev_out->rx_discard.subtype = 
++                              EF_EVENT_RX_DISCARD_RIGHTS;
++              else if( QWORD_TEST_BIT(RX_EV_FRM_TRUNC, *ev) )
++                      ev_out->rx_discard.subtype = 
++                              EF_EVENT_RX_DISCARD_TRUNC;
++              else if( QWORD_TEST_BIT(RX_EV_ETH_CRC_ERR, *ev) )
++                      ev_out->rx_discard.subtype = 
++                              EF_EVENT_RX_DISCARD_CRC_BAD;
++              else if( QWORD_TEST_BIT(RX_EV_IP_HDR_CHKSUM_ERR, *ev) )
++                      ev_out->rx_discard.subtype = 
++                              EF_EVENT_RX_DISCARD_CSUM_BAD;
++              else if( QWORD_TEST_BIT(RX_EV_TCP_UDP_CHKSUM_ERR, *ev) )
++                      ev_out->rx_discard.subtype = 
++                              EF_EVENT_RX_DISCARD_CSUM_BAD;
++              else
++                      ev_out->rx_discard.subtype = 
++                              EF_EVENT_RX_DISCARD_OTHER;
++      }
++}
++
++
++ef_vi_inline void falcon_tx_event(ef_event* ev_out, const ef_vi_qword* ev)
++{
++      /* Danger danger!  No matter what we ask for wrt batching, we
++      ** will get a batched event every 16 descriptors, and we also
++      ** get dma-queue-empty events.  i.e. Duplicates are expected.
++      **
++      ** In addition, if it's been requested in the descriptor, we
++      ** get an event per descriptor.  (We don't currently request
++      ** this).
++      */
++      if(likely( QWORD_TEST_BIT(TX_EV_COMP, *ev) )) {
++              ev_out->tx.type = EF_EVENT_TYPE_TX;
++              ev_out->tx.q_id = QWORD_GET_U(TX_EV_Q_LABEL, *ev);
++      }
++      else {
++              ev_out->tx_error.type = EF_EVENT_TYPE_TX_ERROR;
++              ev_out->tx_error.q_id = QWORD_GET_U(TX_EV_Q_LABEL, *ev);
++              if(likely( QWORD_TEST_BIT(TX_EV_BUF_OWNER_ID_ERR, *ev) ))
++                      ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_RIGHTS;
++              else if(likely( QWORD_TEST_BIT(TX_EV_WQ_FF_FULL, *ev) ))
++                      ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_OFLOW;
++              else if(likely( QWORD_TEST_BIT(TX_EV_PKT_TOO_BIG, *ev) ))
++                      ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_2BIG;
++              else if(likely( QWORD_TEST_BIT(TX_EV_PKT_ERR, *ev) ))
++                      ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_BUS;
++      }
++}
++
++
++static void mark_bad(ef_event* ev)
++{
++      ev->generic.ev.u64[0] &=~ ((uint64_t) 1u << RX_EV_PKT_OK_LBN);
++}
++
++
++/* Poll event queue [evq], translating hardware events into the caller's
++ * array [evs], which has room for [evs_len] entries (must be > 0).
++ *
++ * Returns the number of entries written to [evs].  If an event is found
++ * at relative ring position 1 the queue is treated as overflowed: a
++ * single EF_EVENT_TYPE_OFLOW event is written and 1 is returned.
++ *
++ * [exception] and [expt_priv] are not referenced by this implementation:
++ * events whose code is neither RX_IP_EV_DECODE nor TX_IP_EV_DECODE are
++ * consumed from the ring but not reported to the caller.
++ */
++int ef_eventq_poll_evs(ef_vi* evq, ef_event* evs, int evs_len,
++                     ef_event_handler_fn *exception, void *expt_priv)
++{
++      int evs_len_orig = evs_len;
++
++      EF_VI_CHECK_EVENT_Q(evq);
++      ef_assert(evs);
++      ef_assert_gt(evs_len, 0);
++
++      /* An event at relative position 1 is taken to mean overflow. */
++      if(unlikely( EF_VI_IS_EVENT(EF_VI_EVENT_PTR(evq, 1)) ))
++              goto overflow;
++
++      do {
++              { /* Read the event out of the ring, then fiddle with
++                 * copied version.  Reason is that the ring is
++                 * likely to get pushed out of cache by another
++                 * event being delivered by hardware. */
++                      ef_vi_event* ev = EF_VI_EVENT_PTR(evq, 0);
++                      if( ! EF_VI_IS_EVENT(ev) )
++                              break;
++                      evs->generic.ev.u64[0] = cpu_to_le64 (ev->u64);
++                      /* Advance the read pointer and overwrite the
++                       * ring slot with all-ones. */
++                      evq->evq_state->evq_ptr += sizeof(ef_vi_event);
++                      ev->u64 = (uint64_t)(int64_t) -1;
++              }
++
++              /* Ugly: Exploit the fact that event code lies in top
++               * bits of event. */
++              ef_assert_ge(EV_CODE_LBN, 32u);
++              switch( evs->generic.ev.u32[1] >> (EV_CODE_LBN - 32u) ) {
++              case RX_IP_EV_DECODE:
++                      /* Look for duplicate desc_ptr: it signals
++                       * that a jumbo frame was truncated because we
++                       * ran out of descriptors. */
++                      if(unlikely( falcon_rx_check_dup
++                                         (evq, evs, &evs->generic.ev) )) {
++                              /* The output slot is consumed without
++                               * calling falcon_rx_event(). */
++                              --evs_len;
++                              ++evs;
++                              break;
++                      }
++                      else {
++                              /* Cope with FalconA1 bugs where RX
++                               * gives inconsistent RX events Mark
++                               * events as bad until SOP becomes
++                               * consistent again
++                               * ef_eventq_is_rx_sop_cont_bad() has
++                               * side effects - order is important
++                               */
++                              if(unlikely
++                                 (ef_eventq_is_rx_sop_cont_bad_efab
++                                  (evq, &evs->generic.ev) )) {
++                                      mark_bad(evs);
++                              }
++                      }
++                      falcon_rx_event(evs, &evs->generic.ev);
++                      --evs_len;      
++                      ++evs;
++                      break;
++
++              case TX_IP_EV_DECODE:
++                      falcon_tx_event(evs, &evs->generic.ev);
++                      --evs_len;
++                      ++evs;
++                      break;
++
++              default:
++                      /* Other event codes: [evs] is not advanced, so
++                       * the slot is reused by the next iteration. */
++                      break;
++              }
++      } while( evs_len );
++
++      return evs_len_orig - evs_len;
++
++
++ overflow:
++      evs->generic.type = EF_EVENT_TYPE_OFLOW;
++      evs->generic.ev.u64[0] = (uint64_t)((int64_t)-1);
++      return 1;
++}
++
++
++int/*bool*/ ef_eventq_poll_exception(void* priv, ef_vi* evq, ef_event* ev)
++{
++      /* Returns 1 when [ev] needs no further handling, 0 otherwise.
++       * The only event handled here is a driver event whose sub-code
++       * is EVQ_INIT_DONE_EV_DECODE.  [priv] and [evq] are not used. */
++      if( (ev->generic.ev.u32[1] >> (EV_CODE_LBN - 32u)) !=
++          DRIVER_EV_DECODE )
++              return 0;
++
++      if( QWORD_GET_U(DRIVER_EV_SUB_CODE, ev->generic.ev) !=
++          EVQ_INIT_DONE_EV_DECODE )
++              return 0;
++
++      /* EVQ initialised event: ignore. */
++      return 1;
++}
++
++
++void ef_eventq_iterate(ef_vi* vi,
++                     void (*fn)(void* arg, ef_vi*, int rel_pos,
++                                int abs_pos, void* event),
++                     void* arg, int stop_at_end)
++{
++      /* Walk the event ring from the current position, visiting at
++       * most one ring's worth of slots, and call [fn] for each slot
++       * that holds an event.  [fn] receives the position relative to
++       * the start of the walk and the slot's index within the ring.
++       * When [stop_at_end] is non-zero the walk ends at the first
++       * slot without an event; otherwise such slots are skipped. */
++      int n_slots = (vi->evq_mask + 1) / sizeof(ef_vi_event);
++      int pos;
++
++      for( pos = 0; pos < n_slots; ++pos ) {
++              ef_vi_event* slot = EF_VI_EVENT_PTR(vi, -pos);
++
++              if( ! EF_VI_IS_EVENT(slot) ) {
++                      if( stop_at_end )
++                              return;
++                      continue;
++              }
++              fn(arg, vi, pos,
++                 EF_VI_EVENT_OFFSET(vi, -pos) / sizeof(ef_vi_event),
++                 slot);
++      }
++}
++
++
++int ef_eventq_has_event(ef_vi* vi)
++{
++      /* Non-zero when the slot at the current ring position holds an
++       * event. */
++      ef_vi_event* head = EF_VI_EVENT_PTR(vi, 0);
++
++      return EF_VI_IS_EVENT(head);
++}
++
++
++int ef_eventq_has_many_events(ef_vi* vi, int look_ahead)
++{
++      /* Non-zero when the slot [look_ahead] positions on from the
++       * current one holds an event.  [look_ahead] must be >= 0. */
++      ef_vi_event* probe;
++
++      ef_assert_ge(look_ahead, 0);
++
++      probe = EF_VI_EVENT_PTR(vi, -look_ahead);
++      return EF_VI_IS_EVENT(probe);
++}
++
++
++int ef_eventq_has_rx_event(ef_vi* vi)
++{
++      /* Count the pending events, starting at the current ring
++       * position, whose event code equals EF_EVENT_TYPE_RX.  The scan
++       * stops at the first slot that does not hold an event.
++       *
++       * The scan is bounded by the number of slots in the ring (same
++       * calculation as ef_eventq_iterate()), so it terminates even
++       * when every slot holds an event. */
++      int i, n_evs = 0;
++      int size_evs = (vi->evq_mask + 1) / sizeof(ef_vi_event);
++
++      for( i = 0; i < size_evs; ++i ) {
++              ef_vi_event* ev = EF_VI_EVENT_PTR(vi, -i);
++              if( ! EF_VI_IS_EVENT(ev) )
++                      break;
++              if( EFVI_FALCON_EVENT_CODE(ev) == EF_EVENT_TYPE_RX )
++                      n_evs++;
++      }
++      return n_evs;
++}
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netfront/falcon_vi.c

index 0000000,0000000..b6880e9

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/falcon_vi.c
@@@ -1,0 -1,0 +1,473 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  djr, stg
++ *  \brief  Falcon-specific VI
++ *   \date  2006/11/30
++ */
++
++#include "ef_vi_internal.h"
++
++
++/* Value passed as the [frag] argument of the TX descriptor helpers by
++ * ef_vi_transmitv_init() for every fragment except the final one. */
++#define EFVI_FALCON_DMA_TX_FRAG               1
++
++
++/* TX descriptor for both physical and virtual packet transfers:
++ * two 32-bit words. */
++typedef union {
++      uint32_t        dword[2];
++} ef_vi_falcon_dma_tx_buf_desc;
++typedef ef_vi_falcon_dma_tx_buf_desc ef_vi_falcon_dma_tx_phys_desc;
++
++
++/* RX descriptor for physical addressed transfers: two 32-bit words. */
++typedef union {
++      uint32_t        dword[2];
++} ef_vi_falcon_dma_rx_phys_desc;
++
++
++/* RX descriptor for virtual packet transfers: one 32-bit word. */
++typedef struct {
++      uint32_t        dword[1];
++} ef_vi_falcon_dma_rx_buf_desc;
++
++/* Buffer table index */
++typedef uint32_t              ef_vi_buffer_addr_t;
++
++ef_vi_inline int64_t dma_addr_to_u46(int64_t src_dma_addr)
++{
++      /* Mask the address with __FALCON_MASK(46, int64_t). */
++      const int64_t addr_mask = __FALCON_MASK(46, int64_t);
++
++      return src_dma_addr & addr_mask;
++}
++
++/*! Setup a physical address based descriptor with a specified length.
++**
++**  \param dest_pa  DMA address of the receive buffer
++**  \param desc     descriptor to fill in (must not be NULL)
++**  \param bytes    value placed in the RX_KER_BUF_SIZE field
++*/
++ef_vi_inline void
++__falcon_dma_rx_calc_ip_phys(ef_vi_dma_addr_t dest_pa, 
++                           ef_vi_falcon_dma_rx_phys_desc *desc,
++                           int bytes)
++{
++      int region = 0;                 /* TODO fixme */
++      int64_t dest    = dma_addr_to_u46(dest_pa); /* lower 46 bits */
++
++      /* Checks on the field layout and on the argument ranges. */
++      DWCHCK(__DW2(RX_KER_BUF_SIZE_LBN),  RX_KER_BUF_SIZE_WIDTH);
++      DWCHCK(__DW2(RX_KER_BUF_REGION_LBN),RX_KER_BUF_REGION_WIDTH);
++
++      LWCHK(RX_KER_BUF_ADR_LBN, RX_KER_BUF_ADR_WIDTH);
++
++      RANGECHCK(bytes,  RX_KER_BUF_SIZE_WIDTH);
++      RANGECHCK(region, RX_KER_BUF_REGION_WIDTH);
++
++      ef_assert(desc);
++
++      /* Second word: size, region and the high part of the address. */
++      desc->dword[1] = ((bytes << __DW2(RX_KER_BUF_SIZE_LBN)) |
++                        (region << __DW2(RX_KER_BUF_REGION_LBN)) |
++                        (HIGH(dest,
++                              RX_KER_BUF_ADR_LBN, 
++                              RX_KER_BUF_ADR_WIDTH)));
++
++      /* First word: the low part of the address. */
++      desc->dword[0] = LOW(dest, 
++                           RX_KER_BUF_ADR_LBN, 
++                           RX_KER_BUF_ADR_WIDTH);
++}
++
++/*! Setup a virtual buffer descriptor for an IPMODE transfer.
++**
++**  \param buf_id   value placed in the TX_USR_BUF_ID field
++**  \param buf_ofs  value placed in the TX_USR_BYTE_OFS field
++**  \param bytes    value placed in the TX_USR_BYTE_CNT field
++**  \param port     value placed in the TX_USR_PORT field
++**  \param frag     value placed in the TX_USR_CONT field
++**  \param desc     descriptor to fill in (must not be NULL)
++*/
++ef_vi_inline void
++__falcon_dma_tx_calc_ip_buf(unsigned buf_id, unsigned buf_ofs, unsigned bytes,
++                          int port, int frag, 
++                          ef_vi_falcon_dma_tx_buf_desc *desc)
++{
++      DWCHCK(__DW2(TX_USR_PORT_LBN), TX_USR_PORT_WIDTH);
++      DWCHCK(__DW2(TX_USR_CONT_LBN), TX_USR_CONT_WIDTH);
++      DWCHCK(__DW2(TX_USR_BYTE_CNT_LBN), TX_USR_BYTE_CNT_WIDTH);
++      /* NOTE(review): this check names the RX_KER_BUF_ADR field although
++       * the descriptor built here uses TX_USR fields - confirm this is
++       * intended. */
++      LWCHK(RX_KER_BUF_ADR_LBN, RX_KER_BUF_ADR_WIDTH);
++      DWCHCK(TX_USR_BYTE_OFS_LBN, TX_USR_BYTE_OFS_WIDTH);
++
++      RANGECHCK(bytes,   TX_USR_BYTE_CNT_WIDTH);
++      RANGECHCK(port,    TX_USR_PORT_WIDTH);
++      RANGECHCK(frag,    TX_USR_CONT_WIDTH);
++      RANGECHCK(buf_id,  TX_USR_BUF_ID_WIDTH);
++      RANGECHCK(buf_ofs, TX_USR_BYTE_OFS_WIDTH);
++
++      ef_assert(desc);
++
++      /* Second word: port, continuation flag, byte count and the high
++       * part of the buffer id. */
++      desc->dword[1] = ((port   <<  __DW2(TX_USR_PORT_LBN))      | 
++                        (frag   <<  __DW2(TX_USR_CONT_LBN))      | 
++                        (bytes  <<  __DW2(TX_USR_BYTE_CNT_LBN))  |
++                        (HIGH(buf_id, 
++                              TX_USR_BUF_ID_LBN,
++                              TX_USR_BUF_ID_WIDTH)));
++
++      /* First word: the low part of the buffer id and the byte offset. */
++      desc->dword[0] =  ((LOW(buf_id,
++                              TX_USR_BUF_ID_LBN,
++                              (TX_USR_BUF_ID_WIDTH))) |
++                         (buf_ofs << TX_USR_BYTE_OFS_LBN));
++}
++
++/*! Setup a virtual buffer TX descriptor from a combined buffer address,
++**  splitting it into buffer id and offset with the 4K-page macros. */
++ef_vi_inline void
++falcon_dma_tx_calc_ip_buf_4k(unsigned buf_vaddr, unsigned bytes,
++                           int port, int frag, 
++                           ef_vi_falcon_dma_tx_buf_desc *desc)
++{
++      /* TODO FIXME [buf_vaddr] consists of the buffer index in the
++      ** high bits, and an offset in the low bits. Assumptions
++      ** permeate the code that these can be rolled into one 32bit
++      ** value, so this is currently preserved for Falcon. But we
++      ** should change to support 8K pages
++      */
++      unsigned buf_id =  EFVI_FALCON_BUFFER_4K_PAGE(buf_vaddr);
++      unsigned buf_ofs = EFVI_FALCON_BUFFER_4K_OFF(buf_vaddr);
++
++      __falcon_dma_tx_calc_ip_buf( buf_id, buf_ofs, bytes, port, frag, desc);
++}
++
++/*! Setup a virtual buffer TX descriptor.  Forwards unchanged to the
++**  4K-page variant. */
++ef_vi_inline void
++falcon_dma_tx_calc_ip_buf(unsigned buf_vaddr, unsigned bytes, int port, 
++                        int frag, ef_vi_falcon_dma_tx_buf_desc *desc)
++{
++      falcon_dma_tx_calc_ip_buf_4k(buf_vaddr, bytes, port, frag, desc);
++}
++
++/*! Setup a virtual buffer based descriptor.
++**
++**  \param buf_id   value placed in the RX_USR_BUF_ID field
++**  \param buf_ofs  byte offset; must be even, stored halved in the
++**                  RX_USR_2BYTE_OFS field
++**  \param desc     descriptor to fill in (must not be NULL)
++*/
++ef_vi_inline void
++__falcon_dma_rx_calc_ip_buf(unsigned buf_id, unsigned buf_ofs, 
++                          ef_vi_falcon_dma_rx_buf_desc *desc)
++{ 
++      /* check alignment of buffer offset and pack */
++      ef_assert((buf_ofs & 0x1) == 0);
++
++      /* The offset is stored in units of two bytes. */
++      buf_ofs >>= 1;
++
++      DWCHCK(RX_USR_2BYTE_OFS_LBN, RX_USR_2BYTE_OFS_WIDTH);
++      DWCHCK(RX_USR_BUF_ID_LBN, RX_USR_BUF_ID_WIDTH);
++
++      RANGECHCK(buf_ofs, RX_USR_2BYTE_OFS_WIDTH);
++      RANGECHCK(buf_id,  RX_USR_BUF_ID_WIDTH);
++
++      ef_assert(desc);
++
++      desc->dword[0] = ((buf_ofs << RX_USR_2BYTE_OFS_LBN) | 
++                        (buf_id  << RX_USR_BUF_ID_LBN));
++}
++
++/*! Setup a virtual buffer RX descriptor from a combined buffer address,
++**  splitting it into buffer id and offset with the 4K-page macros. */
++ef_vi_inline void
++falcon_dma_rx_calc_ip_buf_4k(unsigned buf_vaddr, 
++                           ef_vi_falcon_dma_rx_buf_desc *desc)
++{ 
++      /* TODO FIXME [buf_vaddr] consists of the buffer index in the
++      ** high bits, and an offset in the low bits. Assumptions
++      ** permeate the code that these can be rolled into one 32bit
++      ** value, so this is currently preserved for Falcon. But we
++      ** should change to support 8K pages
++      */
++      unsigned buf_id =  EFVI_FALCON_BUFFER_4K_PAGE(buf_vaddr);
++      unsigned buf_ofs = EFVI_FALCON_BUFFER_4K_OFF(buf_vaddr);
++
++      __falcon_dma_rx_calc_ip_buf(buf_id, buf_ofs, desc);
++}
++
++/*! Setup a virtual buffer RX descriptor.  Forwards unchanged to the
++**  4K-page variant. */
++ef_vi_inline void
++falcon_dma_rx_calc_ip_buf(unsigned buf_vaddr, 
++                        ef_vi_falcon_dma_rx_buf_desc *desc)
++{ 
++      falcon_dma_rx_calc_ip_buf_4k(buf_vaddr, desc);
++}
++
++
++/*! Convert an ef_addr to a DMA address by a plain cast. */
++ef_vi_inline ef_vi_dma_addr_t ef_physaddr(ef_addr efaddr)
++{
++      return (ef_vi_dma_addr_t) efaddr;
++}
++
++
++/*! Convert an ef_addr to a buffer table index.
++**  Asserts that the value fits in 32 bits, i.e. that this was not a
++**  physical address.
++*/
++ef_vi_inline ef_vi_buffer_addr_t ef_bufaddr(ef_addr efaddr)
++{
++      ef_assert(efaddr < ((uint64_t)1 << 32) );
++
++      return (ef_vi_buffer_addr_t) efaddr;
++}
++
++
++/*! Setup a physical address based descriptor for an IPMODE transfer.
++**
++**  \param src_dma_addr  DMA address of the data to send
++**  \param bytes         value placed in the TX_KER_BYTE_CNT field
++**  \param port          value placed in the TX_KER_PORT field
++**  \param frag          value placed in the TX_KER_CONT field
++**  \param desc          descriptor to fill in (must not be NULL)
++*/
++ef_vi_inline void
++falcon_dma_tx_calc_ip_phys(ef_vi_dma_addr_t src_dma_addr, unsigned bytes, 
++                         int port, int frag,
++                         ef_vi_falcon_dma_tx_phys_desc *desc)
++{
++
++      int region = 0; /* FIXME */
++      int64_t src    = dma_addr_to_u46(src_dma_addr); /* lower 46 bits */
++
++      DWCHCK(__DW2(TX_KER_PORT_LBN),      TX_KER_PORT_WIDTH);
++      DWCHCK(__DW2(TX_KER_CONT_LBN),      TX_KER_CONT_WIDTH);
++      DWCHCK(__DW2(TX_KER_BYTE_CNT_LBN),  TX_KER_BYTE_CNT_WIDTH);
++      DWCHCK(__DW2(TX_KER_BUF_REGION_LBN),TX_KER_BUF_REGION_WIDTH);
++
++      LWCHK(TX_KER_BUF_ADR_LBN, TX_KER_BUF_ADR_WIDTH);
++
++      RANGECHCK(port,   TX_KER_PORT_WIDTH);
++      RANGECHCK(frag,   TX_KER_CONT_WIDTH);
++      RANGECHCK(bytes,  TX_KER_BYTE_CNT_WIDTH);
++      RANGECHCK(region, TX_KER_BUF_REGION_WIDTH);
++
++      /* Same NULL check as the other descriptor builders in this file. */
++      ef_assert(desc);
++
++      /* Second word: port, continuation flag, byte count, region and
++       * the high part of the address. */
++      desc->dword[1] = ((port   <<  __DW2(TX_KER_PORT_LBN))      | 
++                        (frag   <<  __DW2(TX_KER_CONT_LBN))      | 
++                        (bytes  <<  __DW2(TX_KER_BYTE_CNT_LBN))  | 
++                        (region << __DW2(TX_KER_BUF_REGION_LBN)) |
++                        (HIGH(src,
++                              TX_KER_BUF_ADR_LBN, 
++                              TX_KER_BUF_ADR_WIDTH)));
++
++      /* The address field starts at bit 0, so the first word is simply
++       * the low 32 bits of the address. */
++      ef_assert_equal(TX_KER_BUF_ADR_LBN, 0);
++      desc->dword[0] = (uint32_t) src_dma_addr;
++}
++
++
++/*! Initialise the Falcon-specific parts of [vi] from the mappings in
++**  [vvis], which is treated as a struct vi_mappings.
++**
++**  Sets up the TX and RX queue mask, doorbell, descriptor ring and id
++**  array for each queue whose capacity is non-zero.  The id arrays are
++**  laid out immediately after vi->ep_state, TX first then RX.
++*/
++void falcon_vi_init(ef_vi* vi, void* vvis)
++{
++      struct vi_mappings *vm = (struct vi_mappings*)vvis;
++      uint16_t* ids;
++
++      ef_assert(vi);
++      ef_assert(vvis);
++      ef_assert_equal(vm->signature, VI_MAPPING_SIGNATURE);
++      ef_assert_equal(vm->nic_type.arch, EF_VI_ARCH_FALCON);
++
++      /* Initialise masks to zero, so that ef_vi_state_init() will
++      ** not do any harm when we don't have DMA queues. */
++      vi->vi_rxq.mask = vi->vi_txq.mask = 0;
++
++      /* Used for BUG5391_WORKAROUND. */
++      vi->vi_txq.misalign_mask = 0;
++
++      /* Initialise doorbell addresses to a distinctive small value
++      ** which will cause a segfault, to trap doorbell pushes to VIs
++      ** without DMA queues. */
++      vi->vi_rxq.doorbell = vi->vi_txq.doorbell = (ef_vi_ioaddr_t)0xdb;
++
++      /* The id arrays start directly after the state structure. */
++      ids = (uint16_t*) (vi->ep_state + 1);
++
++      if( vm->tx_queue_capacity ) {
++              vi->vi_txq.mask = vm->tx_queue_capacity - 1;
++              vi->vi_txq.doorbell = vm->tx_bell + 12;
++              vi->vi_txq.descriptors = vm->tx_dma_falcon;
++              vi->vi_txq.ids = ids;
++              /* The RX ids, if any, follow the TX ids. */
++              ids += vi->vi_txq.mask + 1;
++              /* Check that the id fifo fits in the space allocated. */
++              ef_assert_le((char*) (vi->vi_txq.ids + vm->tx_queue_capacity),
++                           (char*) vi->ep_state
++                           + ef_vi_calc_state_bytes(vm->rx_queue_capacity,
++                                                    vm->tx_queue_capacity));
++      }
++      if( vm->rx_queue_capacity ) {
++              vi->vi_rxq.mask = vm->rx_queue_capacity - 1;
++              vi->vi_rxq.doorbell = vm->rx_bell + 12;
++              vi->vi_rxq.descriptors = vm->rx_dma_falcon;
++              vi->vi_rxq.ids = ids;
++              /* Check that the id fifo fits in the space allocated. */
++              ef_assert_le((char*) (vi->vi_rxq.ids + vm->rx_queue_capacity),
++                           (char*) vi->ep_state
++                           + ef_vi_calc_state_bytes(vm->rx_queue_capacity,
++                                                    vm->tx_queue_capacity));
++      }
++
++      /* Variant 'A' NICs get the two workarounds below. */
++      if( vm->nic_type.variant == 'A' ) {
++              vi->vi_txq.misalign_mask = 15;    /* BUG5391_WORKAROUND */
++              vi->vi_flags |= EF_VI_BUG5692_WORKAROUND;
++      }
++}
++
++
++/*! Write TX descriptors for the [iov_len] buffers in [iov] without
++**  ringing the doorbell.
++**
++**  Each buffer is split so that no descriptor crosses a 4K boundary.
++**  [dma_id] is recorded against the last descriptor written.
++**
++**  Returns 0 on success.  Returns -EAGAIN if the ring fills up part way
++**  through, in which case the queue's added count is restored to its
++**  value on entry.
++*/
++int ef_vi_transmitv_init(ef_vi* vi, const ef_iovec* iov, int iov_len,
++                       ef_request_id dma_id)
++{
++      ef_vi_txq* q = &vi->vi_txq;
++      ef_vi_txq_state* qs = &vi->ep_state->txq;
++      ef_vi_falcon_dma_tx_buf_desc* dp;
++      unsigned len, dma_len, di;
++      unsigned added_save = qs->added;   /* for rollback on -EAGAIN */
++      ef_addr dma_addr;
++      unsigned last_len = 0;
++
++      ef_assert(iov_len > 0);
++      ef_assert(iov);
++      ef_assert_equal((dma_id & EF_REQUEST_ID_MASK), dma_id);
++      ef_assert_nequal(dma_id, 0xffff);
++
++      dma_addr = iov->iov_base;
++      len = iov->iov_len;
++
++      if( vi->vi_flags & EF_VI_ISCSI_TX_DDIG ) {
++              /* Last 4 bytes of placeholder for digest must be
++               * removed for h/w */
++              ef_assert(len > 4);
++              last_len = iov[iov_len - 1].iov_len;
++              if( last_len <= 4 ) {
++                      /* The final buffer holds nothing but (part of)
++                       * the placeholder: drop it and take the rest
++                       * off the end of the buffer before it. */
++                      ef_assert(iov_len > 1);
++                      --iov_len;
++                      last_len = iov[iov_len - 1].iov_len - (4 - last_len);
++              }
++              else {
++                      last_len = iov[iov_len - 1].iov_len - 4;
++              }
++              if( iov_len == 1 )
++                      len = last_len;
++      }
++
++      while( 1 ) {
++              /* Out of ring space: undo what was queued and give up. */
++              if( qs->added - qs->removed >= q->mask ) {
++                      qs->added = added_save;
++                      return -EAGAIN;
++              }
++
++              /* Bytes from dma_addr up to the next 4K boundary. */
++              dma_len = (~((unsigned) dma_addr) & 0xfff) + 1;
++              if( dma_len > len )  dma_len = len;
++              { /* BUG5391_WORKAROUND */
++                      /* With a misaligned start, cap the fragment so
++                       * that misalign + length is at most 512. */
++                      unsigned misalign = 
++                              (unsigned) dma_addr & q->misalign_mask;
++                      if( misalign && dma_len + misalign > 512 )
++                              dma_len = 512 - misalign;
++              }
++
++              di = qs->added++ & q->mask;
++              dp = (ef_vi_falcon_dma_tx_buf_desc*) q->descriptors + di;
++              /* The frag argument is 0 only for the final fragment of
++               * the final buffer. */
++              if( vi->vi_flags & EF_VI_TX_PHYS_ADDR )
++                      falcon_dma_tx_calc_ip_phys
++                              (ef_physaddr(dma_addr), dma_len, /*port*/ 0,
++                               (iov_len == 1 && dma_len == len) ? 0 :
++                               EFVI_FALCON_DMA_TX_FRAG, dp);
++              else
++                      falcon_dma_tx_calc_ip_buf
++                              (ef_bufaddr(dma_addr), dma_len, /*port*/ 0,
++                               (iov_len == 1 && dma_len == len) ? 0 :
++                               EFVI_FALCON_DMA_TX_FRAG, dp);
++
++              dma_addr += dma_len;
++              len -= dma_len;
++
++              if( len == 0 ) {
++                      /* Current buffer finished: move to the next one,
++                       * or stop when there are none left. */
++                      if( --iov_len == 0 )  break;
++                      ++iov;
++                      dma_addr = iov->iov_base;
++                      len = iov->iov_len;
++                      if( (vi->vi_flags & EF_VI_ISCSI_TX_DDIG) &&
++                          (iov_len == 1) )
++                              len = last_len;
++              }
++      }
++
++      /* Record the request id against the last descriptor written. */
++      q->ids[di] = (uint16_t) dma_id;
++      return 0;
++}
++
++
++/*! Ring the TX doorbell: after a write barrier, write the masked TX
++**  added count, shifted to TX_DESC_WPTR_LBN, to the TX doorbell. */
++void ef_vi_transmit_push(ef_vi* vi)
++{
++      ef_vi_wiob();
++      writel((vi->ep_state->txq.added & vi->vi_txq.mask) <<
++              __DW4(TX_DESC_WPTR_LBN),
++              vi->vi_txq.doorbell);
++}
++
++
++/*! Queue one receive buffer on the RX descriptor ring without ringing the
++**  doorbell.
++**
++**  initial_rx_bytes is used to set RX_KER_BUF_SIZE in the descriptor when
++**  physical addressing is being used.  A value of zero represents 16384
++**  bytes.  This is okay, because the caller must provide a buffer that is
++**  > MTU, and the mac should filter anything bigger than that.
++**
++**  Returns 0 on success, or -EAGAIN when ef_vi_receive_space() reports
++**  that there is no room.
++*/
++int ef_vi_receive_init(ef_vi* vi, ef_addr addr, ef_request_id dma_id,
++                     int initial_rx_bytes)
++{
++      ef_vi_rxq* rxq = &vi->vi_rxq;
++      ef_vi_rxq_state* rxq_state = &vi->ep_state->rxq;
++      unsigned slot;
++
++      if( ! ef_vi_receive_space(vi) )
++              return -EAGAIN;
++
++      /* Claim the next ring slot and remember the request id. */
++      slot = rxq_state->added++ & rxq->mask;
++      ef_assert_equal(rxq->ids[slot], 0xffff);
++      rxq->ids[slot] = (uint16_t) dma_id;
++
++      if( vi->vi_flags & EF_VI_RX_PHYS_ADDR ) {
++              ef_vi_falcon_dma_rx_phys_desc* phys_dp;
++              phys_dp = (ef_vi_falcon_dma_rx_phys_desc*)
++                      rxq->descriptors + slot;
++              __falcon_dma_rx_calc_ip_phys(addr, phys_dp,
++                                           initial_rx_bytes);
++      }
++      else {
++              ef_vi_falcon_dma_rx_buf_desc* buf_dp;
++              buf_dp = (ef_vi_falcon_dma_rx_buf_desc*)
++                      rxq->descriptors + slot;
++              falcon_dma_rx_calc_ip_buf(ef_bufaddr(addr), buf_dp);
++      }
++
++      return 0;
++}
++
++
++int ef_vi_receive_post(ef_vi* vi, ef_addr addr, ef_request_id dma_id)
++{
++      /* Queue the buffer (with initial_rx_bytes of 0) and, only if
++       * that succeeded, ring the RX doorbell. */
++      int rc = ef_vi_receive_init(vi, addr, dma_id, 0);
++
++      if( rc != 0 )
++              return rc;
++
++      ef_vi_receive_push(vi);
++      return 0;
++}
++
++
++/*! Ring the RX doorbell: after a write barrier, write the masked RX
++**  added count, shifted to RX_DESC_WPTR_LBN, to the RX doorbell. */
++void ef_vi_receive_push(ef_vi* vi)
++{
++      ef_vi_wiob();
++      writel ((vi->ep_state->rxq.added & vi->vi_rxq.mask) <<
++              __DW4(RX_DESC_WPTR_LBN),
++              vi->vi_rxq.doorbell);
++}
++
++
++/*! Complete the receive described by [ef_ev] and return the request id
++**  that was recorded for its descriptor.
++**
++**  Although [vi] is const-qualified, the state it points to is updated:
++**  the id slot is reset to 0xffff and the RX removed count is
++**  incremented.
++*/
++ef_request_id ef_vi_receive_done(const ef_vi* vi, const ef_event* ef_ev)
++{
++      const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*ef_ev);
++      /* Descriptor index: low word of the event, masked to the ring. */
++      unsigned di = ev->u32[0] & vi->vi_rxq.mask;
++      ef_request_id rq_id;
++
++      ef_assert(EF_EVENT_TYPE(*ef_ev) == EF_EVENT_TYPE_RX ||
++                EF_EVENT_TYPE(*ef_ev) == EF_EVENT_TYPE_RX_DISCARD);
++
++      /* Detect spurious / duplicate RX events.  We may need to modify this
++      ** code so that we are robust if they happen. */
++      ef_assert_equal(di, vi->ep_state->rxq.removed & vi->vi_rxq.mask);
++
++      /* We only support 1 port: so events should be in order. */
++      ef_assert(vi->vi_rxq.ids[di] != 0xffff);
++
++      rq_id = vi->vi_rxq.ids[di];
++      vi->vi_rxq.ids[di] = 0xffff;
++      ++vi->ep_state->rxq.removed;
++      return rq_id;
++}
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netfront/pt_tx.c

index 0000000,0000000..bdc1f88

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/pt_tx.c
@@@ -1,0 -1,0 +1,91 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  djr
++ *  \brief  Packet-mode transmit interface.
++ *   \date  2003/04/02
++ */
++
++/*! \cidoxg_lib_ef */
++#include "ef_vi_internal.h"
++
++
++/*! Queue a single buffer for transmit without ringing the doorbell.
++**  Wraps [base]/[len] in a one-element iovec and returns the result of
++**  ef_vi_transmitv_init(). */
++int ef_vi_transmit_init(ef_vi* vi, ef_addr base, int len, ef_request_id dma_id)
++{
++      ef_iovec iov = { base, len };
++      return ef_vi_transmitv_init(vi, &iov, 1, dma_id);
++}
++
++
++int ef_vi_transmit(ef_vi* vi, ef_addr base, int len, ef_request_id dma_id)
++{
++      /* Queue a single buffer and, only if that succeeded, ring the
++       * TX doorbell.  Returns the result of ef_vi_transmitv_init(). */
++      ef_iovec single = { base, len };
++      int rc = ef_vi_transmitv_init(vi, &single, 1, dma_id);
++
++      if( rc != 0 )
++              return rc;
++
++      ef_vi_transmit_push(vi);
++      return 0;
++}
++
++
++int ef_vi_transmitv(ef_vi* vi, const ef_iovec* iov, int iov_len,
++                    ef_request_id dma_id)
++{
++      /* Queue the buffers in [iov] and, only if that succeeded, ring
++       * the TX doorbell.  Returns the result of
++       * ef_vi_transmitv_init(). */
++      int rc = ef_vi_transmitv_init(vi, iov, iov_len, dma_id);
++
++      if( rc != 0 )
++              return rc;
++
++      ef_vi_transmit_push(vi);
++      return 0;
++}
++
++
++/*! Collect the request ids completed by the TX event [__ev].
++**
++**  Walks the TX ring from the current removed position up to and
++**  including the descriptor named in the event, advancing the removed
++**  count as it goes.  Every id slot that is not 0xffff is copied to
++**  [ids] and reset to 0xffff.
++**
++**  Returns the number of ids written to [ids]; this is asserted to be
++**  at most EF_VI_TRANSMIT_BATCH.
++*/
++int ef_vi_transmit_unbundle(ef_vi* vi, const ef_event* __ev,
++                          ef_request_id* ids)
++{
++      ef_request_id* ids_in = ids;
++      ef_vi_txq* q = &vi->vi_txq;
++      ef_vi_txq_state* qs = &vi->ep_state->txq;
++      const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*__ev);
++      /* [stop] is the ring index one past the descriptor in the event. */
++      unsigned i, stop = (ev->u32[0] + 1) & q->mask;
++
++      ef_assert(EF_EVENT_TYPE(*__ev) == EF_EVENT_TYPE_TX ||
++                EF_EVENT_TYPE(*__ev) == EF_EVENT_TYPE_TX_ERROR);
++
++      /* Shouldn't be batching more than 64 descriptors, and should not go
++      ** backwards. */
++      ef_assert_le((((ev->u32[0] + 1) - qs->removed) & q->mask), 64);
++      /* Should not complete more than we've posted. */
++      ef_assert_le((((ev->u32[0] + 1) - qs->removed) & q->mask),
++                   qs->added - qs->removed);
++
++      /* Note that the loop increment also advances qs->removed. */
++      for( i = qs->removed & q->mask; i != stop; i = ++qs->removed & q->mask )
++              if( q->ids[i] != 0xffff ) {
++                      *ids++ = q->ids[i];
++                      q->ids[i] = 0xffff;
++              }
++
++      ef_assert_le(ids - ids_in, EF_VI_TRANSMIT_BATCH);
++
++      return (int) (ids - ids_in);
++}
++
++/*! \cidoxg_end */
diff --cc drivers/xen/sfc_netfront/sysdep.h

index 0000000,0000000..dc2234e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/sysdep.h
@@@ -1,0 -1,0 +1,185 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  stg
++ *  \brief  System dependent support for ef vi lib
++ *   \date  2007/05/10
++ */
++
++/*! \cidoxg_include_ci_ul */
++#ifndef __CI_CIUL_SYSDEP_LINUX_H__
++#define __CI_CIUL_SYSDEP_LINUX_H__
++
++
++#define ef_vi_wiob()  mmiowb()
++
++
++/**********************************************************************
++ * Kernel version compatibility
++ */
++
++#if defined(__GNUC__)
++
++/* Linux kernel doesn't have stdint.h or [u]intptr_t. */
++# if !defined(LINUX_VERSION_CODE)
++#  include <linux/version.h>
++# endif
++# include <asm/io.h>
++
++/* In Linux 2.6.24, linux/types.h has uintptr_t */
++# if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
++#  if BITS_PER_LONG == 32
++   typedef __u32         uintptr_t;
++#  else
++   typedef __u64         uintptr_t;
++#  endif
++# endif
++
++/* But even 2.6.24 doesn't define intptr_t */
++# if BITS_PER_LONG == 32
++   typedef __s32         intptr_t;
++# else
++   typedef __s64         intptr_t;
++# endif
++
++# if defined(__ia64__)
++#  define EF_VI_PRIx64  "lx"
++# else
++#  define EF_VI_PRIx64  "llx"
++# endif
++
++# define EF_VI_HF __attribute__((visibility("hidden")))
++# define EF_VI_HV __attribute__((visibility("hidden")))
++
++# if defined(__i386__) || defined(__x86_64__)  /* GCC x86/x64 */
++   typedef unsigned long long ef_vi_dma_addr_t; 
++# endif
++#endif
++
++#ifndef mmiowb
++# if defined(__i386__) || defined(__x86_64__)
++#  define mmiowb()
++# elif defined(__ia64__)
++#  ifndef ia64_mfa
++#   define ia64_mfa() asm volatile ("mf.a" ::: "memory")
++#  endif
++#  define mmiowb ia64_mfa
++# else
++#  error "Need definition for mmiowb"
++# endif
++#endif
++
++#ifdef EFX_NOT_UPSTREAM
++
++/* Stuff for architectures/compilers not officially supported */
++
++#if !defined(__GNUC__)
++# if defined(__PPC__)  /* GCC, PPC */
++   typedef unsigned long     ef_vi_dma_addr_t;
++
++#  ifdef __powerpc64__
++#   ifdef CONFIG_SMP
++#    define CI_SMP_SYNC        "\n   eieio     \n"         /* memory cache sync */
++#    define CI_SMP_ISYNC       "\n   isync     \n"         /* instr cache sync */
++#   else
++#    define CI_SMP_SYNC
++#    define CI_SMP_ISYNC
++#   endif
++#  else        /* for ppc32 systems */
++#   ifdef CONFIG_SMP
++#    define CI_SMP_SYNC        "\n   eieio     \n"
++#    define CI_SMP_ISYNC       "\n   sync      \n"
++#   else
++#    define CI_SMP_SYNC
++#    define CI_SMP_ISYNC
++#   endif
++#  endif
++
++# elif defined(__ia64__)  /* GCC, IA64 */
++   typedef unsigned long     ef_vi_dma_addr_t;
++# else
++#  error Unknown processor - GNU C
++# endif
++
++#elif defined(__PGI)
++# error PGI not supported 
++
++#elif defined(__INTEL_COMPILER)
++
++/* Intel compilers v7 claim to be very gcc compatible. */
++# if __INTEL_COMPILER >= 700
++#  if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ > 91)
++#   define EF_VI_LIKELY(t)    __builtin_expect((t), 1)
++#   define EF_VI_UNLIKELY(t)  __builtin_expect((t), 0)
++#  endif
++# else
++#  error Old Intel compiler not supported.
++# endif
++
++#else
++# error Unknown compiler.
++#endif
++
++#endif
++
++
++# include <linux/errno.h>
++
++
++/**********************************************************************
++ * Extracting bit fields.
++ */
++
++/* Field [f] is described by the constants f##_LBN (lowest bit number)
++ * and f##_WIDTH; [v] is a value with u32[] and u64[] members.
++ *
++ * _QWORD_GET_LOW reads the field from u32[0], _QWORD_GET_HIGH from
++ * u32[1] (bit numbers rebased by 32), and _QWORD_GET_ANY from u64[0].
++ */
++#define _QWORD_GET_LOW(f, v)                                    \
++  (((v).u32[0] >> (f##_LBN)) & ((1u << f##_WIDTH) - 1u))
++#define _QWORD_GET_HIGH(f, v)                                           \
++  (((v).u32[1] >> (f##_LBN - 32u)) & ((1u << f##_WIDTH) - 1u))
++#define _QWORD_GET_ANY(f, v)                                            \
++  (((v).u64[0] >> f##_LBN) & (((uint64_t) 1u << f##_WIDTH) - 1u))
++
++/* Extract field [f] from [v], picking the 32-bit accessor when the
++ * field lies wholly in the low or the high word and the 64-bit one when
++ * it straddles both. */
++#define QWORD_GET(f, v)                                                     \
++  ((f##_LBN + f##_WIDTH) <= 32u                                             \
++   ? _QWORD_GET_LOW(f, (v))                                                 \
++   : ((f##_LBN >= 32u) ? _QWORD_GET_HIGH(f, (v)) : _QWORD_GET_ANY(f, (v))))
++
++/* As QWORD_GET, with the result cast to unsigned. */
++#define QWORD_GET_U(f, v)  ((unsigned) QWORD_GET(f, (v)))
++
++/* Test the single bit at f##_LBN in the low or high word.  The result
++ * is the masked word (non-zero when the bit is set), not 0/1. */
++#define _QWORD_TEST_BIT_LOW(f, v)   ((v).u32[0] & (1u << (f##_LBN)))
++#define _QWORD_TEST_BIT_HIGH(f, v)  ((v).u32[1] & (1u << (f##_LBN - 32u)))
++
++/* Test the bit at f##_LBN, choosing the word from the bit number. */
++#define QWORD_TEST_BIT(f, v)                                                  \
++  (f##_LBN < 32 ? _QWORD_TEST_BIT_LOW(f, (v)) : _QWORD_TEST_BIT_HIGH(f, (v)))
++
++
++
++
++#ifndef DECLSPEC_NORETURN
++/* normally defined on Windows to expand to a declaration that the
++   function will not return */
++# define DECLSPEC_NORETURN
++#endif
++
++#endif  /* __CI_CIUL_SYSDEP_LINUX_H__ */
diff --cc drivers/xen/sfc_netfront/vi_init.c

index 0000000,0000000..4e7e19b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netfront/vi_init.c
@@@ -1,0 -1,0 +1,183 @@@
++/****************************************************************************
++ * Copyright 2002-2005: Level 5 Networks Inc.
++ * Copyright 2005-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications
++ *  <linux-xen-drivers@solarflare.com>
++ *  <onload-dev@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * \author  djr
++ *  \brief  Initialisation of VIs.
++ *   \date  2007/06/08
++ */
++
++#include "ef_vi_internal.h"
++
++/* Bytes of VI state for queues of the given sizes: sizeof(ef_vi_state)
++ * plus one uint16_t per RX queue entry and one uint16_t per TX queue
++ * entry.
++ */
++#define EF_VI_STATE_BYTES(rxq_sz, txq_sz)                     \
++      (sizeof(ef_vi_state) + (rxq_sz) * sizeof(uint16_t)      \
++       + (txq_sz) * sizeof(uint16_t))
++
++/* Return the number of bytes of VI state needed for an RX queue of
++ * [rxq_sz] entries and a TX queue of [txq_sz] entries.  Each size must be
++ * either zero or a power of two (asserted).
++ */
++int ef_vi_calc_state_bytes(int rxq_sz, int txq_sz)
++{
++      int n_bytes;
++
++      ef_assert(rxq_sz == 0 || EF_VI_IS_POW2(rxq_sz));
++      ef_assert(txq_sz == 0 || EF_VI_IS_POW2(txq_sz));
++
++      n_bytes = EF_VI_STATE_BYTES(rxq_sz, txq_sz);
++      return n_bytes;
++}
++
++
++/* Return the number of bytes of VI state required by [vi], derived from
++ * its receive and transmit capacities.  A queue with zero capacity
++ * contributes no entries; otherwise the queue size is capacity + 1, which
++ * is asserted to be a power of two.
++ */
++int ef_vi_state_bytes(ef_vi* vi)
++{
++      int rxq_sz, txq_sz;
++
++      rxq_sz = ef_vi_receive_capacity(vi)
++              ? ef_vi_receive_capacity(vi) + 1 : 0;
++      txq_sz = ef_vi_transmit_capacity(vi)
++              ? ef_vi_transmit_capacity(vi) + 1 : 0;
++
++      ef_assert(rxq_sz == 0 || EF_VI_IS_POW2(rxq_sz));
++      ef_assert(txq_sz == 0 || EF_VI_IS_POW2(txq_sz));
++
++      return EF_VI_STATE_BYTES(rxq_sz, txq_sz);
++}
++
++
++/* Reset the event queue state attached to [evq]: every rx_dup_state slot
++ * (EFAB_DMAQS_PER_EVQ_MAX of them) is returned to its initial values and
++ * the event queue pointer is rewound to zero.
++ */
++void ef_eventq_state_init(ef_vi* evq)
++{
++      int q;
++
++      for (q = 0; q < EFAB_DMAQS_PER_EVQ_MAX; ++q) {
++              ef_rx_dup_state_t *dup = &evq->evq_state->rx_dup_state[q];
++
++              dup->frag_num = 0;
++              dup->rx_last_desc_ptr = -1;
++              dup->bad_sop = 0;
++      }
++
++      evq->evq_state->evq_ptr = 0;
++}
++
++
++/* Reset the queue state of [vi]: zero the added/removed counters of both
++ * queues and set every slot of the RX and TX id arrays to (uint16_t) -1.
++ * The id array of a queue whose mask is zero is left untouched.
++ */
++void ef_vi_state_init(ef_vi* vi)
++{
++      ef_vi_state* st = vi->ep_state;
++      unsigned slot;
++
++      st->txq.removed = 0;
++      st->txq.added = 0;
++      st->rxq.removed = 0;
++      st->rxq.added = 0;
++
++      /* mask is (entries - 1), so slots 0..mask inclusive are filled */
++      if( vi->vi_rxq.mask ) {
++              slot = 0;
++              do {
++                      vi->vi_rxq.ids[slot] = (uint16_t) -1;
++              } while( ++slot <= vi->vi_rxq.mask );
++      }
++      if( vi->vi_txq.mask ) {
++              slot = 0;
++              do {
++                      vi->vi_txq.ids[slot] = (uint16_t) -1;
++              } while( ++slot <= vi->vi_txq.mask );
++      }
++}
++
++
++/* Record the event queue description in the struct vi_mappings located at
++ * [data_area]: the mapping signature, the VI instance, the NIC type, the
++ * event queue size in bytes, its base address and its timer register.
++ */
++void ef_vi_init_mapping_evq(void* data_area, struct ef_vi_nic_type nic_type,
++                            int instance, unsigned evq_bytes, void* base,
++                            void* timer_reg)
++{
++      struct vi_mappings* map = data_area;
++
++      /* Identity of the mapping */
++      map->signature = VI_MAPPING_SIGNATURE;
++      map->nic_type = nic_type;
++      map->vi_instance = instance;
++
++      /* Event queue geometry */
++      map->evq_base = base;
++      map->evq_bytes = evq_bytes;
++      map->evq_timer_reg = timer_reg;
++}
++
++
++/* Initialise [vi] from the struct vi_mappings at [vvis].
++ *
++ * [state] becomes the VI's endpoint state and [vi_flags] its flags.  The
++ * architecture-specific initialiser is then run (only EF_VI_ARCH_FALCON
++ * is handled; anything else trips an assertion).  If the mapping
++ * describes an event queue (evq_bytes non-zero), [evq_state] and the
++ * event queue mask, base and timer register are installed as well.
++ */
++void ef_vi_init(ef_vi* vi, void* vvis, ef_vi_state* state,
++                ef_eventq_state* evq_state, enum ef_vi_flags vi_flags)
++{
++      struct vi_mappings* map = vvis;
++
++      vi->vi_i = map->vi_instance;
++      vi->ep_state = state;
++      vi->vi_flags = vi_flags;
++
++      if( map->nic_type.arch == EF_VI_ARCH_FALCON ) {
++              falcon_vi_init(vi, vvis);
++      } else {
++              /* ?? TODO: We should return an error code. */
++              ef_assert(0);
++      }
++
++      if( map->evq_bytes != 0 ) {
++              vi->evq_state = evq_state;
++              vi->evq_base = map->evq_base;
++              vi->evq_timer_reg = map->evq_timer_reg;
++              vi->evq_mask = map->evq_bytes - 1u;
++      }
++
++      EF_VI_MAGIC_SET(vi, EF_VI);
++}
++
++
++/* Initialise [data_area] with the information required to initialise an
++ * ef_vi: queue capacities, descriptor ring mappings and doorbell
++ * addresses.
++ *
++ * \param  data_area      [in,out] required, must ref at least
++ *                                 VI_MAPPING_SIZE bytes
++ * \param  nic_type       [in] recorded in the mapping
++ * \param  rxq_capacity   [in] RX queue capacity; at least one of
++ * \param  txq_capacity   [in] the two capacities must be non-zero
++ * \param  instance       [in] VI instance recorded in the mapping
++ * \param  io_mmap        [in] required; the RX and TX doorbell addresses
++ *                             are computed relative to it
++ * \param  iobuf_mmap_rx  [in] RX descriptor ring mapping
++ * \param  iobuf_mmap_tx  [in] TX descriptor ring mapping.  At least one of
++ *                             the two ring mappings must be non-NULL.  If
++ *                             both are the same pointer, the TX ring is
++ *                             taken to start after the RX ring, whose
++ *                             size is rounded up to a whole page.
++ * \param  vi_flags       [in] only EF_VI_RX_PHYS_ADDR is examined here; it
++ *                             selects 8-byte instead of 4-byte RX
++ *                             descriptors when sizing the RX ring
++ */
++void ef_vi_init_mapping_vi(void* data_area, struct ef_vi_nic_type nic_type,
++                           unsigned rxq_capacity, unsigned txq_capacity,
++                           int instance, void* io_mmap,
++                           void* iobuf_mmap_rx, void* iobuf_mmap_tx,
++                           enum ef_vi_flags vi_flags)
++{
++      struct vi_mappings* map = data_area;
++      char* io_base = io_mmap;
++      int rx_desc_bytes, rxq_bytes;
++
++      ef_assert(rxq_capacity > 0 || txq_capacity > 0);
++      ef_assert(vm);
++      ef_assert(io_mmap);
++      ef_assert(iobuf_mmap_rx || iobuf_mmap_tx);
++
++      map->signature = VI_MAPPING_SIGNATURE;
++      map->vi_instance = instance;
++      map->nic_type = nic_type;
++
++      /* Size of the RX descriptor ring, rounded up to whole pages */
++      if( vi_flags & EF_VI_RX_PHYS_ADDR )
++              rx_desc_bytes = 8;
++      else
++              rx_desc_bytes = 4;
++      rxq_bytes = rxq_capacity * rx_desc_bytes;
++      rxq_bytes = (rxq_bytes + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
++
++      /* A single shared mapping holds the RX ring first, then the TX ring */
++      if( iobuf_mmap_tx == iobuf_mmap_rx )
++              iobuf_mmap_tx = (char*) iobuf_mmap_rx + rxq_bytes;
++
++      map->rx_queue_capacity = rxq_capacity;
++      map->rx_dma_falcon = iobuf_mmap_rx;
++      map->rx_bell       = io_base + (RX_DESC_UPD_REG_KER_OFST & 4095);
++
++      map->tx_queue_capacity = txq_capacity;
++      map->tx_dma_falcon = iobuf_mmap_tx;
++      map->tx_bell       = io_base + (TX_DESC_UPD_REG_KER_OFST & 4095);
++}
diff --cc drivers/xen/sfc_netutil/Makefile

index 0000000,0000000..3fce370

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netutil/Makefile
@@@ -1,0 -1,0 +1,11 @@@
++EXTRA_CFLAGS += -Idrivers/xen/sfc_netutil
++EXTRA_CFLAGS += -Werror
++
++# Optional coverage build: set GCOV to a non-empty value to compile with
++# gcov instrumentation and define EFX_GCOV.  The switch was historically
++# spelt "GGOV" (apparently a typo); that spelling is still honoured so
++# existing build scripts keep working.
++ifneq ($(GCOV)$(GGOV),)
++EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV
++endif
++
++obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL) := sfc_netutil.o
++
++sfc_netutil-objs := accel_cuckoo_hash.o accel_msg_iface.o accel_util.o
++
diff --cc drivers/xen/sfc_netutil/accel_cuckoo_hash.c

index 0000000,0000000..00454cb

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_cuckoo_hash.c
@@@ -1,0 -1,0 +1,649 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <linux/types.h> /* needed for linux/random.h */
++#include <linux/random.h>
++#include <linux/slab.h>
++
++#include "accel_cuckoo_hash.h"
++#include "accel_util.h"
++
++/*
++ * Returns 1 when the first hashtab->key_length bytes of key1 and key2
++ * are identical, 0 otherwise.
++ */
++static inline int cuckoo_hash_key_compare(cuckoo_hash_table *hashtab,
++                                        cuckoo_hash_key *key1,
++                                        cuckoo_hash_key *key2)
++{
++      return memcmp(key1, key2, hashtab->key_length) == 0;
++}
++
++
++/*
++ * Copies the whole of *key2 into *key1 (a plain assignment of the full
++ * cuckoo_hash_key object, regardless of the table's key_length).
++ */
++static inline void cuckoo_hash_key_set(cuckoo_hash_key *key1, 
++                                     cuckoo_hash_key *key2)
++{
++      *key1 = *key2;
++}
++
++
++/*
++ * Picks fresh multipliers a0 and a1 for the two hash functions.  Each is
++ * built from key_length random bytes (the remaining bytes stay zero),
++ * forced odd, and the pair is re-drawn until the two values differ.
++ */
++static void set_hash_parameters(cuckoo_hash_table *hashtab)
++{
++      do {
++              /* Clear first: only key_length bytes get randomised */
++              hashtab->a1 = 0;
++              hashtab->a0 = 0;
++
++              get_random_bytes(&hashtab->a0, hashtab->key_length);
++              get_random_bytes(&hashtab->a1, hashtab->key_length);
++
++              /* The hash scheme wants odd multipliers */
++              hashtab->a0 |= 1;
++              hashtab->a1 |= 1;
++
++              /* Identical multipliers would make both tables collide alike */
++      } while (hashtab->a0 == hashtab->a1);
++}
++
++/*
++ * Initialise hashtab with two sub-tables of 2^length_bits entries each,
++ * for keys of key_length bytes, and choose the hash parameters.
++ *
++ * Both sub-tables live in a single zeroed allocation, table0 first and
++ * table1 immediately after it.
++ *
++ * Returns 0 on success or -ENOMEM if the tables cannot be allocated.
++ * BUGs if length_bits does not fit an unsigned shift or if key_length
++ * exceeds sizeof(cuckoo_hash_key).
++ */
++int cuckoo_hash_init(cuckoo_hash_table *hashtab, unsigned length_bits,
++                   unsigned key_length)
++{
++      cuckoo_hash_entry *table_mem;
++      unsigned length;
++
++      BUG_ON(length_bits >= sizeof(unsigned) * 8);
++      BUG_ON(key_length > sizeof(cuckoo_hash_key));
++
++      /*
++       * Shift only after length_bits has been validated, and shift an
++       * unsigned 1 so that the largest permitted value is well defined
++       */
++      length = 1u << length_bits;
++
++      /*
++       * kcalloc() returns zeroed memory and fails cleanly, rather than
++       * under-allocating, if the total size overflows
++       */
++      table_mem = kcalloc(length, 2 * sizeof(cuckoo_hash_entry),
++                          GFP_KERNEL);
++      if (table_mem == NULL)
++              return -ENOMEM;
++
++      hashtab->length = length;
++      hashtab->length_bits = length_bits;
++      hashtab->key_length = key_length;
++      hashtab->entries = 0;
++
++      hashtab->table0 = table_mem;
++      hashtab->table1 = table_mem + length;
++
++      set_hash_parameters(hashtab);
++
++      return 0;
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_init);
++
++/*
++ * Release the memory backing hashtab's sub-tables.  The table pointers
++ * are cleared afterwards, so calling this again on the same table is
++ * harmless.
++ */
++void cuckoo_hash_destroy(cuckoo_hash_table *hashtab)
++{
++      /*
++       * table0 is the start of the allocation that holds both
++       * sub-tables.  kfree(NULL) is a no-op, so no guard is needed.
++       */
++      kfree(hashtab->table0);
++
++      /* Don't leave dangling pointers into the freed memory */
++      hashtab->table0 = NULL;
++      hashtab->table1 = NULL;
++}
++
++EXPORT_SYMBOL_GPL(cuckoo_hash_destroy);
++
++/*
++ * Computes a 32-bit hash word (sizeof(cuckoo_hash) is asserted to equal
++ * sizeof(u32)) from key x and multiplier a and stores it in *result.
++ * Not all of the bits will necessarily be used; the caller throws away
++ * any that aren't.
++ *
++ * Both a and x are read as arrays of key_length / sizeof(u32) 32-bit
++ * words, so key_length must be a multiple of sizeof(u32) (asserted).
++ */
++static inline void cuckoo_compute_hash_helper(cuckoo_hash_table *hashtab,
++                                            cuckoo_hash_key *a,
++                                            cuckoo_hash_key *x,
++                                            cuckoo_hash *result) 
++{
++      u64 multiply_result = 0, a_temp, x_temp;
++      u32 carry = 0;
++      u32 *a_words;
++      u32 *x_words;
++      int i;
++
++      /*
++       * As the mod and div operations in the function effectively
++       * reduce and shift the bits of the product down to just the
++       * third word, we need only compute that and return it as a
++       * result.
++       *
++       * Do enough long multiplication to get the word we need
++       */
++
++      /* This assumes things about the sizes of the key and hash */
++      BUG_ON(hashtab->key_length % sizeof(u32) != 0);
++      BUG_ON(sizeof(cuckoo_hash) != sizeof(u32));
++
++      a_words = (u32 *)a;
++      x_words = (u32 *)x;
++
++      /*
++       * Per word: 64-bit product of the matching words of a and x plus
++       * the high half carried over from the previous word.  Each pass
++       * overwrites multiply_result, so the value that survives is the
++       * one from the last word.
++       *
++       * NOTE(review): cross terms (a_words[i] * x_words[j], i != j) are
++       * never added in, so this is not a full multi-word product --
++       * confirm that is what was intended.
++       */
++      for (i = 0; i < hashtab->key_length / sizeof(u32); i++) {
++              a_temp = a_words[i];
++              x_temp = x_words[i];
++              
++              multiply_result = (a_temp * x_temp) + carry;
++              carry = (multiply_result >> 32) & 0xffffffff;
++      }
++      
++      /* Low 32 bits of the final word's result */
++      *result = multiply_result & 0xffffffff;
++}
++
++
++/*
++ * Want to implement (ax mod 2^w) div 2^(w-q) for odd a, 0 < a < 2^w;
++ * w is the length of the key, q is the length of the hash, I think.
++ * See http://www.it-c.dk/people/pagh/papers/cuckoo-jour.pdf 
++ *
++ * Returns the index, in the range [0, hashtab->length), at which key
++ * belongs in the sub-table whose multiplier is a.
++ */
++static cuckoo_hash cuckoo_compute_hash(cuckoo_hash_table *hashtab, 
++                                     cuckoo_hash_key *key, 
++                                     cuckoo_hash_key *a)
++{
++      unsigned q = hashtab->length_bits;
++      cuckoo_hash hash;
++
++      cuckoo_compute_hash_helper(hashtab, a, key, &hash);
++
++      /* 
++       * Take the top q bits of the 32-bit hash to get the right length
++       * for this hash table.  A logical right shift does that directly,
++       * without building a mask from a signed constant.  A table with a
++       * single slot (q == 0) is handled separately because shifting a
++       * 32-bit value by 32 is undefined; its only index is 0.
++       */
++      if (q == 0)
++              hash = 0;
++      else
++              hash = (u32)hash >> (32 - q);
++
++      BUG_ON(hash >= hashtab->length);
++
++      return hash;
++}
++
++
++/*
++ * Looks key up in table0 only.  On a hit stores the associated value in
++ * *value and returns 1; otherwise returns 0 and leaves *value alone.
++ */
++static int cuckoo_hash_lookup0(cuckoo_hash_table *hashtab,
++                             cuckoo_hash_key *key,
++                             cuckoo_hash_value *value)
++{
++      cuckoo_hash idx = cuckoo_compute_hash(hashtab, key, &hashtab->a0);
++      cuckoo_hash_entry *slot = &hashtab->table0[idx];
++
++      if (slot->state != CUCKOO_HASH_STATE_OCCUPIED)
++              return 0;
++      if (!cuckoo_hash_key_compare(hashtab, &slot->key, key))
++              return 0;
++
++      *value = slot->value;
++      return 1;
++}
++
++/*
++ * Looks key up in table1 only.  On a hit stores the associated value in
++ * *value and returns 1; otherwise returns 0 and leaves *value alone.
++ */
++static int cuckoo_hash_lookup1(cuckoo_hash_table *hashtab,
++                             cuckoo_hash_key *key,
++                             cuckoo_hash_value *value)
++{
++      cuckoo_hash idx = cuckoo_compute_hash(hashtab, key, &hashtab->a1);
++      cuckoo_hash_entry *slot = &hashtab->table1[idx];
++
++      if (slot->state != CUCKOO_HASH_STATE_OCCUPIED)
++              return 0;
++      if (!cuckoo_hash_key_compare(hashtab, &slot->key, key))
++              return 0;
++
++      *value = slot->value;
++      return 1;
++}
++
++
++/*
++ * Looks key up in the hash table, trying table0 first and table1 only if
++ * that misses.  Returns 1 and fills in *value when the key is present,
++ * 0 when it is not.
++ */
++int cuckoo_hash_lookup(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
++                     cuckoo_hash_value *value)
++{
++      if (cuckoo_hash_lookup0(hashtab, key, value))
++              return 1;
++
++      return cuckoo_hash_lookup1(hashtab, key, value) != 0;
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_lookup);
++
++
++/*
++ * Re-inserts every occupied entry of old_table (capacity slots) into
++ * hashtab, whose entry count is reset to zero first.  Insertion is done
++ * without rehashing; the first failure is returned to the caller and the
++ * transfer stops there.  Returns 0 once everything has been moved.
++ */
++static int cuckoo_hash_transfer_entries(cuckoo_hash_table *hashtab,
++                                      cuckoo_hash_entry *old_table,
++                                      unsigned capacity)
++{
++      cuckoo_hash_entry *src;
++      int idx, err;
++
++      hashtab->entries = 0;
++
++      for (idx = 0; idx < capacity; idx++) {
++              src = &old_table[idx];
++              if (src->state != CUCKOO_HASH_STATE_OCCUPIED)
++                      continue;
++
++              err = cuckoo_hash_add(hashtab, &src->key, src->value, 0);
++              if (err != 0)
++                      return err;
++      }
++
++      return 0;
++}
++
++
++/*
++ * Rebuilds the hash table in freshly allocated memory using new hash
++ * parameters, doubling the size of each sub-table when the table is more
++ * than half full or when repeated attempts at the current size fail.
++ *
++ * Allocations use GFP_ATOMIC.  Returns 0 on success.  On failure returns
++ * a negative error (-ENOMEM if allocation failed, otherwise whatever the
++ * entry transfer reported) and *hashtab is restored to the state it had
++ * on entry, still pointing at the old tables.
++ */
++int cuckoo_hash_rehash(cuckoo_hash_table *hashtab)
++{
++      cuckoo_hash_entry *new_table;
++      cuckoo_hash_table old_hashtab;
++      int resize = 0, rc, rehash_count;
++
++      /*
++       * Store old tables so we can access the existing values and
++       * copy across
++       */
++      memcpy(&old_hashtab, hashtab, sizeof(cuckoo_hash_table));
++
++      /*
++       * resize if hashtable is more than half full
++       *
++       * NOTE(review): the "< 32" guard lets length_bits grow to 32, at
++       * which point doubling an unsigned 32-bit length would wrap to 0 --
++       * confirm whether the limit should be lower.
++       */
++      if (old_hashtab.entries > old_hashtab.length &&
++          old_hashtab.length_bits < 32)
++              resize = 1;
++
++ resize:
++      if (resize) {
++              /* Two sub-tables, each twice the current length */
++              new_table = kmalloc(sizeof(cuckoo_hash_entry) * 4 * hashtab->length,
++                                  GFP_ATOMIC);
++              if (new_table == NULL) {
++                      rc = -ENOMEM;
++                      goto err;
++              }
++
++              hashtab->length = 2 * hashtab->length;
++              hashtab->length_bits++;
++      } else {
++              /* Two sub-tables of the current length */
++              new_table = kmalloc(sizeof(cuckoo_hash_entry) * 2 * hashtab->length,
++                                  GFP_ATOMIC);
++              if (new_table == NULL) {
++                      rc = -ENOMEM;
++                      goto err;
++              }
++      }
++    
++      /*
++       * Point hashtab to new memory region so we can try to
++       * construct new table
++       */
++      hashtab->table0 = new_table;
++      hashtab->table1 = (cuckoo_hash_entry *)
++              ((char *)new_table + hashtab->length * sizeof(cuckoo_hash_entry));
++  
++      rehash_count = 0;
++
++ again:
++      /* Zero the new tables */
++      memset(new_table, 0, hashtab->length * 2 * sizeof(cuckoo_hash_entry));
++
++      /* Choose new parameters for the hash functions */
++      set_hash_parameters(hashtab);
++
++      /*
++       * Multiply old_table_length by 2 as the length refers to each
++       * table, and there are two of them.  This assumes that they
++       * are arranged sequentially in memory, so assert it 
++       */
++      BUG_ON(((char *)old_hashtab.table1) != 
++             ((char *)old_hashtab.table0 + old_hashtab.length
++              * sizeof(cuckoo_hash_entry)));
++      rc = cuckoo_hash_transfer_entries(hashtab, old_hashtab.table0, 
++                                        old_hashtab.length * 2);
++      if (rc < 0) {
++              /* Problem */
++              if (rc == -ENOSPC) {
++                      ++rehash_count;
++                      if (rehash_count < CUCKOO_HASH_MAX_LOOP) {
++                              /*
++                               * Wanted to rehash, but rather than
++                               * recurse we can just do it here
++                               */
++                              goto again;
++                      } else {
++                              /*
++                               * Didn't manage to rehash, so let's
++                               * go up a size (if we haven't already
++                               * and there's space)
++                               */
++                              if (!resize && hashtab->length_bits < 32) {
++                                      resize = 1;
++                                      kfree(new_table);
++                                      goto resize;
++                              }
++                              else
++                                      goto err;
++                      }
++              }
++              else
++                      goto err;
++      }
++
++      /* Success, I think.  Free up the old table */
++      kfree(old_hashtab.table0);
++  
++      /* We should have put all the entries from old table in the new one */
++      BUG_ON(hashtab->entries != old_hashtab.entries);
++
++      return 0;
++ err:
++      EPRINTK("%s: Rehash failed, giving up\n", __FUNCTION__);
++      /* Some other error, give up, at least restore table to how it was */
++      memcpy(hashtab, &old_hashtab, sizeof(cuckoo_hash_table));
++      /* new_table is NULL here when the failure was the allocation itself */
++      if (new_table)
++              kfree(new_table);
++      return rc;
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_rehash);
++
++
++/*
++ * Stores key/value in table[hash].
++ *
++ * If the slot was vacant it is marked occupied and 1 is returned.
++ * Otherwise the slot's previous key and value are first copied out to
++ * *displaced_key and *displaced_value, the slot is overwritten with the
++ * new pair (its state is left as it was), and 0 is returned so that the
++ * caller can re-home the displaced pair.
++ */
++static int 
++cuckoo_hash_insert_or_displace(cuckoo_hash_entry *table, unsigned hash,
++                             cuckoo_hash_key *key, 
++                             cuckoo_hash_value value,
++                             cuckoo_hash_key *displaced_key, 
++                             cuckoo_hash_value *displaced_value)
++{
++      cuckoo_hash_entry *slot = &table[hash];
++      int was_vacant = (slot->state == CUCKOO_HASH_STATE_VACANT);
++
++      /* Save the current occupant before it gets overwritten */
++      if (!was_vacant) {
++              cuckoo_hash_key_set(displaced_key, &slot->key);
++              *displaced_value = slot->value;
++      }
++
++      cuckoo_hash_key_set(&slot->key, key);
++      slot->value = value;
++
++      if (was_vacant)
++              slot->state = CUCKOO_HASH_STATE_OCCUPIED;
++
++      return was_vacant;
++}
++
++
++/*
++ * Inserts key/value into the hash table.  No check is made for the key
++ * already being present.
++ *
++ * The pair is placed in table0; whatever it displaces is placed in
++ * table1; whatever that displaces goes back to table0, and so on for up
++ * to CUCKOO_HASH_MAX_LOOP rounds.  If a vacant slot still has not been
++ * found and can_rehash is non-zero, the table is rehashed and insertion
++ * of the currently displaced pair is retried.
++ *
++ * Returns 0 on success, the error from cuckoo_hash_rehash() if rehashing
++ * fails, or -ENOSPC if can_rehash is zero and no slot was found.  In
++ * both failure cases the pair displaced last is lost from the table.
++ */
++int cuckoo_hash_add(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
++                   cuckoo_hash_value value, int can_rehash)
++{
++      cuckoo_hash hash0, hash1;
++      int i, rc;
++      cuckoo_hash_key key1, key2;
++
++      /* Work on a local copy: key1/key2 alternately hold the homeless key */
++      cuckoo_hash_key_set(&key1, key);
++
++ again:
++      i = 0;
++      do {
++              /*
++               * Place (key1, value) in table0.  On displacement the
++               * evicted pair comes back in key2 and in value itself.
++               */
++              hash0 = cuckoo_compute_hash(hashtab, &key1, &hashtab->a0);
++              if (cuckoo_hash_insert_or_displace(hashtab->table0, hash0, 
++                                                 &key1, value, &key2,
++                                                 &value)) {
++                      /* Success */
++                      hashtab->entries++;
++                      return 0;
++              }
++      
++              /*
++               * Place the evicted (key2, value) in table1; anything it
++               * evicts in turn comes back in key1 and value.
++               */
++              hash1 = cuckoo_compute_hash(hashtab, &key2, &hashtab->a1);
++              if (cuckoo_hash_insert_or_displace(hashtab->table1, hash1,
++                                                 &key2, value, &key1,
++                                                 &value)) {
++                      /* Success */
++                      hashtab->entries++;
++                      return 0;
++              }
++      } while (++i < CUCKOO_HASH_MAX_LOOP);
++
++      if (can_rehash) {
++              if ((rc = cuckoo_hash_rehash(hashtab)) < 0) {
++                      /*
++                       * Give up - this will drop whichever
++                       * key/value pair we have currently displaced
++                       * on the floor
++                       */
++                      return rc;
++              }
++              /* key1/value still hold the homeless pair; try again */
++              goto again;
++      }
++  
++      EPRINTK("%s: failed hash add\n", __FUNCTION__);
++      /*
++       * Couldn't do it - bad as we've now removed some random thing
++       * from the table, and will just drop it on the floor.  Better
++       * would be to somehow revert the table to the state it was in
++       * at the start
++       */
++      return -ENOSPC;
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_add);
++
++
++/*
++ * Inserts key/value only if key is not already in the table.  Returns
++ * -EBUSY when the key is already present, otherwise the result of
++ * cuckoo_hash_add().
++ */
++int cuckoo_hash_add_check(cuckoo_hash_table *hashtab,
++                        cuckoo_hash_key *key, cuckoo_hash_value value,
++                        int can_rehash)
++{
++      int existing;   /* receives the stored value; not used further */
++
++      if (!cuckoo_hash_lookup(hashtab, key, &existing))
++              return cuckoo_hash_add(hashtab, key, value, can_rehash);
++
++      return -EBUSY;
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_add_check);
++
++
++/*
++ * Removes key from the hash table by marking its slot vacant and
++ * decrementing the entry count.  table0 is searched first, then table1.
++ * Returns 0 if the key was found and removed, -EINVAL if it is in
++ * neither table.
++ */
++int cuckoo_hash_remove(cuckoo_hash_table *hashtab, cuckoo_hash_key *key)
++{
++      cuckoo_hash_entry *slot;
++      cuckoo_hash idx;
++
++      idx = cuckoo_compute_hash(hashtab, key, &hashtab->a0);
++      slot = &hashtab->table0[idx];
++      if (slot->state != CUCKOO_HASH_STATE_OCCUPIED ||
++          !cuckoo_hash_key_compare(hashtab, &slot->key, key)) {
++              /* Not in table0; the only other candidate is in table1 */
++              idx = cuckoo_compute_hash(hashtab, key, &hashtab->a1);
++              slot = &hashtab->table1[idx];
++              if (slot->state != CUCKOO_HASH_STATE_OCCUPIED ||
++                  !cuckoo_hash_key_compare(hashtab, &slot->key, key))
++                      return -EINVAL;
++      }
++
++      slot->state = CUCKOO_HASH_STATE_VACANT;
++      hashtab->entries--;
++      return 0;
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_remove);
++
++
++/*
++ * Replaces the value stored for an existing key.  table0 is searched
++ * first, then table1.  Returns 0 if the key was found and its value
++ * updated, -EINVAL if the key is in neither table.
++ */
++int cuckoo_hash_update(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
++                     cuckoo_hash_value value)
++{
++      cuckoo_hash_entry *slot;
++      cuckoo_hash idx;
++
++      idx = cuckoo_compute_hash(hashtab, key, &hashtab->a0);
++      slot = &hashtab->table0[idx];
++      if (slot->state != CUCKOO_HASH_STATE_OCCUPIED ||
++          !cuckoo_hash_key_compare(hashtab, &slot->key, key)) {
++              /* Not in table0; the only other candidate is in table1 */
++              idx = cuckoo_compute_hash(hashtab, key, &hashtab->a1);
++              slot = &hashtab->table1[idx];
++              if (slot->state != CUCKOO_HASH_STATE_OCCUPIED ||
++                  !cuckoo_hash_key_compare(hashtab, &slot->key, key))
++                      return -EINVAL;
++      }
++
++      slot->value = value;
++      return 0;
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_update);
++
++
++/*
++ * Rewinds the table's iteration cursor to the first slot, so that the
++ * next cuckoo_hash_iterate() call starts from the beginning of table0.
++ */
++void cuckoo_hash_iterate_reset(cuckoo_hash_table *hashtab)
++{
++      hashtab->iterate_index = 0;
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_iterate_reset);
++
++
++/*
++ * Returns the next occupied entry after the table's iteration cursor.
++ *
++ * Cursor values [0, length) address table0 and [length, 2 * length)
++ * address table1.  The cursor is advanced past every slot examined.  On
++ * success the entry's key and value are copied to *key and *value and 0
++ * is returned; once both tables are exhausted -ENOSPC is returned.
++ */
++int cuckoo_hash_iterate(cuckoo_hash_table *hashtab,
++                      cuckoo_hash_key *key, cuckoo_hash_value *value)
++{
++      cuckoo_hash_entry *slot;
++      unsigned pos;
++
++      /* First half of the cursor range: table0 */
++      while (hashtab->iterate_index < hashtab->length) {
++              pos = hashtab->iterate_index;
++              ++hashtab->iterate_index;
++
++              slot = &hashtab->table0[pos];
++              if (slot->state != CUCKOO_HASH_STATE_OCCUPIED)
++                      continue;
++
++              *key = slot->key;
++              *value = slot->value;
++              return 0;
++      }
++
++      /* Second half of the cursor range: table1 */
++      while (hashtab->iterate_index >= hashtab->length &&
++             hashtab->iterate_index < hashtab->length * 2) {
++              pos = hashtab->iterate_index - hashtab->length;
++              ++hashtab->iterate_index;
++
++              slot = &hashtab->table1[pos];
++              if (slot->state != CUCKOO_HASH_STATE_OCCUPIED)
++                      continue;
++
++              *key = slot->key;
++              *value = slot->value;
++              return 0;
++      }
++
++      return -ENOSPC;
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_iterate);
++
++
++#if 0
++/*
++ * Debug-only consistency check (this code sits inside "#if 0", so it is
++ * not compiled).  Verifies that every slot state is either vacant or
++ * occupied, that the number of occupied slots matches hashtab->entries,
++ * and that every occupied slot sits at the index its key hashes to in
++ * that sub-table.  The first count or placement mismatch is reported,
++ * the table is dumped, and the check stops.
++ *
++ * NOTE(review): cuckoo_hash_dump() is defined after this function in the
++ * file, so a prototype must be in scope if this is ever re-enabled --
++ * confirm the header declares it.
++ */
++void cuckoo_hash_valid(cuckoo_hash_table *hashtab)
++{
++      int i, entry_count = 0;
++
++      /* Pass 1: validate slot states and count occupied slots */
++      for (i=0; i < hashtab->length; i++) {
++              EPRINTK_ON(hashtab->table0[i].state != CUCKOO_HASH_STATE_VACANT &&
++                         hashtab->table0[i].state != CUCKOO_HASH_STATE_OCCUPIED);
++              if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
++                      entry_count++;
++              EPRINTK_ON(hashtab->table1[i].state != CUCKOO_HASH_STATE_VACANT &&
++                         hashtab->table1[i].state != CUCKOO_HASH_STATE_OCCUPIED);
++              if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
++                      entry_count++;  
++      }
++      
++      if (entry_count != hashtab->entries) {
++              EPRINTK("%s: bad count\n", __FUNCTION__);
++              cuckoo_hash_dump(hashtab);
++              return;
++      }
++
++      /* Pass 2: each occupied slot must be where its key hashes to */
++      for (i=0; i< hashtab->length; i++) {
++              if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
++                      if (i != cuckoo_compute_hash(hashtab, 
++                                                   &hashtab->table0[i].key, 
++                                                   &hashtab->a0)) {
++                              EPRINTK("%s: Bad key table 0 index %d\n",
++                                      __FUNCTION__, i);
++                              cuckoo_hash_dump(hashtab);
++                              return;
++                      }
++              if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
++                      if (i != cuckoo_compute_hash(hashtab, 
++                                                   &hashtab->table1[i].key, 
++                                                   &hashtab->a1)) {
++                              EPRINTK("%s: Bad key table 1 index %d\n",
++                                      __FUNCTION__, i);
++                              cuckoo_hash_dump(hashtab);
++                              return;
++                      }
++      }
++
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_valid);
++
++
++/*
++ * Debug-only dump of the whole hash table via EPRINTK (this code sits
++ * inside "#if 0", so it is not compiled).  Prints the table geometry, the
++ * recorded and the counted number of entries, the two hash multipliers,
++ * and then one line per slot of table0 followed by one line per slot of
++ * table1.  For occupied slots the line shows the key, the value and the
++ * index the key hashes to under each of the two hash functions.
++ */
++void cuckoo_hash_dump(cuckoo_hash_table *hashtab)
++{
++      int i, entry_count;
++
++      /* Count occupied slots, flagging any slot with an unknown state */
++      entry_count = 0;
++      for (i=0; i < hashtab->length; i++) {
++              EPRINTK_ON(hashtab->table0[i].state != CUCKOO_HASH_STATE_VACANT &&
++                         hashtab->table0[i].state != CUCKOO_HASH_STATE_OCCUPIED);
++              if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
++                      entry_count++;
++              EPRINTK_ON(hashtab->table1[i].state != CUCKOO_HASH_STATE_VACANT &&
++                         hashtab->table1[i].state != CUCKOO_HASH_STATE_OCCUPIED);
++              if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
++                      entry_count++;  
++      }
++
++      EPRINTK("======================\n");
++      EPRINTK("Cuckoo hash table dump\n");
++      EPRINTK("======================\n");
++      EPRINTK("length: %d; length_bits: %d; key_length: %d\n", hashtab->length,
++              hashtab->length_bits, hashtab->key_length);
++      EPRINTK("Recorded entries: %d\n", hashtab->entries);
++      EPRINTK("Counted entries: %d\n", entry_count);
++      EPRINTK("a0: %llx; a1: %llx\n", hashtab->a0, hashtab->a1);
++      /* Contents of table0 */
++      EPRINTK("-----------------------------------------\n");
++      EPRINTK("Index  Occupied  Key  Value Index0 Index1\n");
++      EPRINTK("-----------------------------------------\n");         
++      for (i=0; i< hashtab->length; i++) {
++              if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
++              EPRINTK("%d %d %llx %d %d %d\n", i,
++                      hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED,
++                      hashtab->table0[i].key, hashtab->table0[i].value,
++                      cuckoo_compute_hash(hashtab, &hashtab->table0[i].key, 
++                                          &hashtab->a0),
++                      cuckoo_compute_hash(hashtab, &hashtab->table0[i].key, 
++                                          &hashtab->a1));
++              else
++              EPRINTK("%d %d - - - -\n", i,
++                      hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED);
++                      
++      }
++      /* Contents of table1 */
++      EPRINTK("-----------------------------------------\n");
++      EPRINTK("Index  Occupied  Key  Value Index0 Index1\n");
++      EPRINTK("-----------------------------------------\n");
++      for (i=0; i< hashtab->length; i++) {
++              if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
++              EPRINTK("%d %d %llx %d %d %d\n", i,
++                      hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED,
++                      hashtab->table1[i].key, hashtab->table1[i].value,
++                      cuckoo_compute_hash(hashtab, &hashtab->table1[i].key, 
++                                          &hashtab->a0),
++                      cuckoo_compute_hash(hashtab, &hashtab->table1[i].key, 
++                                          &hashtab->a1));
++              else
++              EPRINTK("%d %d - - - -\n", i,
++                      hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED);
++      } 
++      EPRINTK("======================\n");
++}
++EXPORT_SYMBOL_GPL(cuckoo_hash_dump);
++#endif
diff --cc drivers/xen/sfc_netutil/accel_cuckoo_hash.h

index 0000000,0000000..83518f9

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_cuckoo_hash.h
@@@ -1,0 -1,0 +1,227 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++/*
++ * A cuckoo hash table consists of two sub tables.  Each entry can
++ * hash to a position in each table.  If, on entry, its position is
++ * found to be occupied, the existing element is moved to its other
++ * location.  This recurses until success or a loop is found.  If a
++ * loop is found the table is rehashed.
++ *
++ *  See http://www.it-c.dk/people/pagh/papers/cuckoo-jour.pdf
++ */
++
++#ifndef NET_ACCEL_CUCKOO_HASH_H
++#define NET_ACCEL_CUCKOO_HASH_H
++
++/*! Type used for hash table keys of ip pairs */
++typedef struct {
++      u32 local_ip;
++      //u32 remote_ip;
++      u16 local_port;
++      //u16 remote_port;
++      /* Technically only 1 bit, but use 16 to make key a round
++         number size */
++      u16 proto;
++} cuckoo_hash_ip_key;
++
++/*! Type used for hash table keys of mac addresses */
++typedef u64 cuckoo_hash_mac_key;
++
++/*! This type is designed to be large enough to hold all supported key
++ *  sizes to avoid having to malloc storage for them.
++ */
++typedef u64 cuckoo_hash_key;
++
++/*! Type used for the values stored in the hash table */
++typedef int cuckoo_hash_value;
++
++/*! Type used for the hash used to index the table */
++typedef u32 cuckoo_hash;
++
++/*! How long to spend displacing values when adding before giving up
++ *  and rehashing */
++#define CUCKOO_HASH_MAX_LOOP (hashtab->length)
++
++/*! State of hash table entry */
++typedef enum {
++      CUCKOO_HASH_STATE_VACANT = 0,
++      CUCKOO_HASH_STATE_OCCUPIED 
++} cuckoo_hash_state;
++
++/*! An entry in the hash table */
++typedef struct {
++      cuckoo_hash_state state;
++      cuckoo_hash_key key;
++      cuckoo_hash_value value;
++} cuckoo_hash_entry;
++
++/*! A cuckoo hash table */
++typedef struct {
++      /*! The length of each table (NB. there are two tables of this
++       *  length) */
++      unsigned length; 
++      /*! The length of each table in bits */
++      unsigned length_bits;
++      /*! The length of the key in bytes */ 
++      unsigned key_length; 
++      /*! The number of entries currently stored in the table */
++      unsigned entries;
++      /*! Index into table used by cuckoo_hash_iterate */
++      unsigned iterate_index; 
++
++      /* parameter of hash functions */
++      /*! The "a" parameter of the first hash function */
++      cuckoo_hash_key a0; 
++      /*! The "a" parameter of the second hash function */
++      cuckoo_hash_key a1; 
++
++      /*! The first table */
++      cuckoo_hash_entry *table0; 
++      /*! The second table */
++      cuckoo_hash_entry *table1; 
++} cuckoo_hash_table;
++
++/*! Initialise the cuckoo hash table
++ *
++ * \param hashtab A pointer to an uninitialised hash table structure
++ * \param length_bits The number of elements in each table equals
++ * 2**length_bits
++ * \param key_length The length of the key in bytes
++ *
++ * \return 0 on success, -ENOMEM if it couldn't allocate the tables
++ */
++extern
++int cuckoo_hash_init(cuckoo_hash_table *hashtab, unsigned length_bits,
++                   unsigned key_length);
++
++
++/*! Destroy a hash table
++ *
++ * \param hashtab A hash table that has previously been passed to a
++ * successful call of cuckoo_hash_init()
++ */
++extern
++void cuckoo_hash_destroy(cuckoo_hash_table *hashtab);
++
++
++/*! Lookup an entry in the hash table 
++ *
++ * \param hashtab The hash table in which to look.
++ * \param key Pointer to a mac address to use as the key
++ * \param value On exit set to the value stored if key was present
++ *
++ * \return 0 if not present in the table, non-zero if it is (and value
++ * is set accordingly)
++ */
++extern
++int cuckoo_hash_lookup(cuckoo_hash_table *hashtab,
++                     cuckoo_hash_key *key,
++                     cuckoo_hash_value *value);
++
++/*! Add an entry to the hash table.  Key must not be a duplicate of
++ * anything already in the table.  If this is a risk, see
++ * cuckoo_hash_add_check
++ *
++ * \param hashtab The hash table to add the entry to
++ * \param key Pointer to a mac address to use as a key
++ * \param value The value to store 
++ * \param can_rehash Flag to allow the add function to rehash the
++ * table if necessary
++ *
++ * \return 0 on success, non-zero on failure.  -ENOSPC means it just
++ * couldn't find anywhere to put it - this is bad and probably means
++ * an entry has been dropped on the floor (but the entry you just
++ * tried to add may now be included)
++ */
++extern
++int cuckoo_hash_add(cuckoo_hash_table *hashtab,
++                  cuckoo_hash_key *key, 
++                  cuckoo_hash_value value,
++                  int can_rehash);
++
++/*! Same as cuckoo_hash_add but first checks to ensure entry is not
++ * already there
++ * \return -EBUSY if already there
++ */
++
++extern
++int cuckoo_hash_add_check(cuckoo_hash_table *hashtab,
++                        cuckoo_hash_key *key, 
++                        cuckoo_hash_value value,
++                        int can_rehash);
++/*! Remove an entry from the table 
++ *
++ * \param hashtab The hash table to remove the entry from
++ * \param key The key that was used to previously add the entry
++ *
++ * \return 0 on success, -EINVAL if the entry couldn't be found 
++ */
++extern
++int cuckoo_hash_remove(cuckoo_hash_table *hashtab, cuckoo_hash_key *key);
++
++
++/*! Helper for those using mac addresses to convert to a key for the
++ *  hash table
++ */
++static inline cuckoo_hash_mac_key cuckoo_mac_to_key(const u8 *mac)
++{
++      return (cuckoo_hash_mac_key)(mac[0])
++              | (cuckoo_hash_mac_key)(mac[1]) << 8
++              | (cuckoo_hash_mac_key)(mac[2]) << 16
++              | (cuckoo_hash_mac_key)(mac[3]) << 24
++              | (cuckoo_hash_mac_key)(mac[4]) << 32
++              | (cuckoo_hash_mac_key)(mac[5]) << 40;
++}
++
++
++/*! Update an entry already in the hash table to take a new value 
++ *
++ * \param hashtab The hash table to add the entry to
++ * \param key Pointer to a mac address to use as a key
++ * \param value The value to store 
++ *
++ * \return 0 on success, non-zero on failure. 
++ */
++int cuckoo_hash_update(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
++                     cuckoo_hash_value value);
++
++
++/*! Go through the hash table and return all used entries (one per call)
++ *
++ * \param hashtab The hash table to iterate over 
++ * \param key Pointer to a key to take the returned key
++ * \param value Pointer to a value to take the returned value
++ *
++ * \return 0 on success (key, value set), non-zero on failure.
++ */
++int cuckoo_hash_iterate(cuckoo_hash_table *hashtab,
++                      cuckoo_hash_key *key, cuckoo_hash_value *value);
++void cuckoo_hash_iterate_reset(cuckoo_hash_table *hashtab);
++
++/* debug, not compiled by default */
++void cuckoo_hash_valid(cuckoo_hash_table *hashtab);
++void cuckoo_hash_dump(cuckoo_hash_table *hashtab);
++
++#endif /* NET_ACCEL_CUCKOO_HASH_H */
diff --cc drivers/xen/sfc_netutil/accel_msg_iface.c

index 0000000,0000000..e52de14

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_msg_iface.c
@@@ -1,0 -1,0 +1,301 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <xen/evtchn.h>
++
++#include "accel_util.h"
++#include "accel_msg_iface.h"
++
++#define NET_ACCEL_MSG_Q_SIZE (1024)
++#define NET_ACCEL_MSG_Q_MASK (NET_ACCEL_MSG_Q_SIZE - 1)
++
++#ifdef NDEBUG /* release build: both checks compile away to nothing */
++#define NET_ACCEL_CHECK_MAGIC(_p, _errval)
++#define NET_ACCEL_SHOW_QUEUE(_t, _q, _id)
++#else /* debug build: return _errval from the caller on a bad magic */
++#define NET_ACCEL_CHECK_MAGIC(_p, _errval)                            \
++      do { if ((_p)->magic != NET_ACCEL_MSG_MAGIC) {                  \
++              pr_err("%s: passed invalid shared page %p!\n",          \
++                     __FUNCTION__, (_p));                             \
++              return _errval;                                         \
++      } } while (0)
++#define NET_ACCEL_SHOW_QUEUE(_t, _q, _id)                             \
++      printk(_t ": queue %d write %x read %x base %x limit %x\n",     \
++             (_id), (_q)->write, (_q)->read, (_q)->base, (_q)->limit)
++#endif /* NDEBUG */
++
++/*
++ * We've been passed at least 2 pages. 1 control page and 1 or more
++ * data pages.
++ */
++int net_accel_msg_init_page(void *mem, int len, int up)
++{
++      struct net_accel_shared_page *shared_page = 
++              (struct net_accel_shared_page*)mem;
++
++      if ((unsigned long)shared_page & NET_ACCEL_MSG_Q_MASK)
++              return -EINVAL;
++
++      shared_page->magic = NET_ACCEL_MSG_MAGIC;
++
++      shared_page->aflags = 0;
++
++      shared_page->net_dev_up = up;
++
++      return 0;
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_init_page);
++
++
++void net_accel_msg_init_queue(sh_msg_fifo2 *queue,
++                            struct net_accel_msg_queue *indices,
++                            struct net_accel_msg *base, int size)
++{
++      queue->fifo = base;
++      spin_lock_init(&queue->lock);
++      sh_fifo2_init(queue, size-1, &indices->read, &indices->write);
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_init_queue);
++
++
++static inline int _net_accel_msg_send(struct net_accel_shared_page *sp,
++                                    sh_msg_fifo2 *queue,
++                                    struct net_accel_msg *msg,
++                                    int is_reply)
++{
++      int rc = 0;
++      NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
++      rmb();
++      if (is_reply) {
++              EPRINTK_ON(sh_fifo2_is_full(queue));
++              sh_fifo2_put(queue, *msg);
++      } else {
++              if (sh_fifo2_not_half_full(queue)) {
++                      sh_fifo2_put(queue, *msg);
++              } else {
++                      rc = -ENOSPC;
++              }
++      }
++      wmb();
++      return rc;
++}
++
++/* Notify after a batch of messages have been sent */
++void net_accel_msg_notify(int irq)
++{
++      notify_remote_via_irq(irq);
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_notify);
++
++/* 
++ * Send a message on the specified FIFO. Returns 0 on success, -errno
++ * on failure. The message in msg is copied to the current slot of the
++ * FIFO.
++ */
++int net_accel_msg_send(struct net_accel_shared_page *sp, sh_msg_fifo2 *q, 
++                     struct net_accel_msg *msg)
++{
++      unsigned long flags;
++      int rc;
++      net_accel_msg_lock_queue(q, &flags);
++      rc = _net_accel_msg_send(sp, q, msg, 0);
++      net_accel_msg_unlock_queue(q, &flags);
++      return rc;
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_send);
++
++
++/* As net_accel_msg_send but also posts a notification to the far end. */
++int net_accel_msg_send_notify(struct net_accel_shared_page *sp, int irq, 
++                            sh_msg_fifo2 *q, struct net_accel_msg *msg)
++{
++      unsigned long flags;
++      int rc;
++      net_accel_msg_lock_queue(q, &flags);
++      rc = _net_accel_msg_send(sp, q, msg, 0);
++      net_accel_msg_unlock_queue(q, &flags);
++      if (rc >= 0)
++              notify_remote_via_irq(irq);
++      return rc;
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_send_notify);
++
++
++int net_accel_msg_reply(struct net_accel_shared_page *sp, sh_msg_fifo2 *q, 
++                     struct net_accel_msg *msg)
++{
++      unsigned long flags;
++      int rc;
++      net_accel_msg_lock_queue(q, &flags);
++      rc = _net_accel_msg_send(sp, q, msg, 1);
++      net_accel_msg_unlock_queue(q, &flags);
++      return rc;
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_reply);
++
++
++/* As net_accel_msg_reply but also posts a notification to the far end. */
++int net_accel_msg_reply_notify(struct net_accel_shared_page *sp, int irq, 
++                            sh_msg_fifo2 *q, struct net_accel_msg *msg)
++{
++      unsigned long flags;
++      int rc;
++      net_accel_msg_lock_queue(q, &flags);
++      rc = _net_accel_msg_send(sp, q, msg, 1);
++      net_accel_msg_unlock_queue(q, &flags);
++      if (rc >= 0)
++              notify_remote_via_irq(irq);
++      return rc;
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_reply_notify);
++
++
++/*
++ * Look at a received message, if any, so a decision can be made about
++ * whether to read it now or not.  Cookie is a bit of debug which is
++ * set here and checked when passed to net_accel_msg_recv_next()
++ */
++int net_accel_msg_peek(struct net_accel_shared_page *sp, 
++                     sh_msg_fifo2 *queue, 
++                     struct net_accel_msg *msg, int *cookie)
++{
++      unsigned long flags;
++      int rc = 0;
++      NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
++      net_accel_msg_lock_queue(queue, &flags);
++      rmb();
++      if (sh_fifo2_is_empty(queue)) {
++              rc = -ENOENT;
++      } else {
++              *msg = sh_fifo2_peek(queue);
++              *cookie = *(queue->fifo_rd_i);
++      }
++      net_accel_msg_unlock_queue(queue, &flags);
++      return rc;
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_peek);
++
++
++/*
++ * Move the queue onto the next element, used after finished with a
++ * peeked msg 
++ */
++int net_accel_msg_recv_next(struct net_accel_shared_page *sp, 
++                          sh_msg_fifo2 *queue, int cookie)
++{
++      unsigned long flags;
++      NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
++      net_accel_msg_lock_queue(queue, &flags);
++      rmb();
++      /* Mustn't be empty */
++      BUG_ON(sh_fifo2_is_empty(queue));
++      /* 
++       * Check cookie matches, i.e. we're advancing over the same message
++       * as was got using peek 
++       */
++      BUG_ON(cookie != *(queue->fifo_rd_i));
++      sh_fifo2_rd_next(queue);
++      wmb();
++      net_accel_msg_unlock_queue(queue, &flags);
++      return 0;
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_recv_next);
++
++
++/* 
++ * Receive a message on the specified FIFO. Returns 0 on success,
++ * -errno on failure.
++ */
++int net_accel_msg_recv(struct net_accel_shared_page *sp, sh_msg_fifo2 *queue, 
++                     struct net_accel_msg *msg)
++{
++      unsigned long flags;
++      int rc = 0;
++      NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
++      net_accel_msg_lock_queue(queue, &flags);
++      rmb();
++      if (sh_fifo2_is_empty(queue)) {
++              rc = -ENOENT;
++      } else {
++              sh_fifo2_get(queue, msg);
++      }
++      wmb();
++      net_accel_msg_unlock_queue(queue, &flags);
++      return rc;
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_recv);
++
++
++/* 
++ * Start sending a message without copying. returns a pointer to a message
++ * that will be filled out in place. The queue is locked until the message 
++ * is sent.
++ */
++struct net_accel_msg *net_accel_msg_start_send(struct net_accel_shared_page *sp,
++                                             sh_msg_fifo2 *queue, unsigned long *flags)
++{
++      struct net_accel_msg *msg;
++      NET_ACCEL_CHECK_MAGIC(sp, NULL);
++      net_accel_msg_lock_queue(queue, flags);
++      rmb();
++      if (sh_fifo2_not_half_full(queue)) {
++              msg = sh_fifo2_pokep(queue);
++      } else {
++              net_accel_msg_unlock_queue(queue, flags);
++              msg = NULL;
++      }
++      return msg;
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_start_send);
++
++
++static inline void _msg_complete(struct net_accel_shared_page *sp,
++                               sh_msg_fifo2 *queue,
++                               unsigned long *flags)
++{
++      sh_fifo2_wr_next(queue);
++      net_accel_msg_unlock_queue(queue, flags);
++}
++
++/*
++ * Complete the sending of a message started with net_accel_msg_start_send. The 
++ * message is implicit since the queue was locked by _start
++ */
++void net_accel_msg_complete_send(struct net_accel_shared_page *sp,
++                               sh_msg_fifo2 *queue,
++                               unsigned long *flags)
++{
++      _msg_complete(sp, queue, flags);
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_complete_send);
++
++/* As net_accel_msg_complete_send but does the notify. */
++void net_accel_msg_complete_send_notify(struct net_accel_shared_page *sp, 
++                                      sh_msg_fifo2 *queue, 
++                                      unsigned long *flags, int irq)
++{
++      _msg_complete(sp, queue, flags);
++      notify_remote_via_irq(irq);
++}
++EXPORT_SYMBOL_GPL(net_accel_msg_complete_send_notify);
diff --cc drivers/xen/sfc_netutil/accel_msg_iface.h

index 0000000,0000000..0483a56

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_msg_iface.h
@@@ -1,0 -1,0 +1,415 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#ifndef NET_ACCEL_MSG_IFACE_H
++#define NET_ACCEL_MSG_IFACE_H
++
++#include <linux/ip.h>
++#include <linux/tcp.h>
++#include <linux/udp.h>
++#include <linux/in.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++
++#include "accel_shared_fifo.h"
++
++#define NET_ACCEL_MSG_MAGIC (0x85465479)
++
++/*! We talk version 0.010 of the interdomain protocol */
++#define NET_ACCEL_MSG_VERSION (0x00001000)
++
++/*! Shared memory portion of inter-domain FIFO */
++struct net_accel_msg_queue {
++      u32 read;
++      u32 write;
++};
++
++
++/*
++ * The aflags in the following structure is used as follows:
++ *
++ *  - each bit is set when one of the corresponding variables is
++ *  changed by either end.
++ *
++ *  - the end that has made the change then forwards an IRQ to the
++ *  other
++ *
++ *  - the IRQ handler deals with these bits either on the fast path, or
++ *  for less common changes, by jumping onto the slow path.
++ *
++ *  - once it has seen a change, it clears the relevant bit.
++ *
++ * aflags is accessed atomically using clear_bit, test_bit,
++ * test_and_set_bit etc
++ */
++
++/*
++ * The following are used to signify to the other domain when the queue
++ * they want to use is full, and when it is no longer full.  Could be
++ * compressed to use fewer bits but done this way for simplicity and
++ * clarity
++ */
++
++/* "dom0->domU queue" is full */
++#define NET_ACCEL_MSG_AFLAGS_QUEUE0FULL      0x1 
++#define NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B    0
++/* "dom0->domU queue" is not full */
++#define NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL   0x2 
++#define NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B 1
++/* "domU->dom0 queue" is full */
++#define NET_ACCEL_MSG_AFLAGS_QUEUEUFULL      0x4 
++#define NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B    2
++/* "domU->dom0 queue" is not full */
++#define NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL   0x8
++#define NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B 3
++/* dom0 -> domU net_dev up/down events */
++#define NET_ACCEL_MSG_AFLAGS_NETUPDOWN         0x10
++#define NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B       4
++
++/*
++ * Masks used to test if there are any messages for domU and dom0
++ * respectively
++ */
++#define NET_ACCEL_MSG_AFLAGS_TO_DOMU_MASK     \
++      (NET_ACCEL_MSG_AFLAGS_QUEUE0FULL    |   \
++       NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL |   \
++       NET_ACCEL_MSG_AFLAGS_NETUPDOWN)
++#define NET_ACCEL_MSG_AFLAGS_TO_DOM0_MASK     \
++      (NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL |   \
++       NET_ACCEL_MSG_AFLAGS_QUEUEUFULL)
++
++/*! The shared data structure used for inter-VM communication. */
++struct net_accel_shared_page {
++      /*! Sanity check */
++      u32 magic;          
++      /*! Used by host/Dom0 */
++      struct net_accel_msg_queue queue0;
++      /*! Used by guest/DomU */
++      struct net_accel_msg_queue queue1;
++      /*! Atomic flags, used to communicate simple state changes */
++      u32 aflags;     
++      /*! State of net_dev used for acceleration */     
++      u32 net_dev_up; 
++};
++
++
++enum net_accel_hw_type {
++      /*! Not a virtualisable NIC: use slow path. */
++      NET_ACCEL_MSG_HWTYPE_NONE = 0,
++      /*! NIC is Falcon-based */
++      NET_ACCEL_MSG_HWTYPE_FALCON_A = 1,
++      NET_ACCEL_MSG_HWTYPE_FALCON_B = 2,
++      NET_ACCEL_MSG_HWTYPE_SIENA_A = 3,
++};
++
++/*! The maximum number of pages used by an event queue. */
++#define EF_HW_FALCON_EVQ_PAGES 8
++
++struct net_accel_hw_falcon_b {
++      /* VI */
++      /*! Grant for Tx DMA Q */
++      u32 txdmaq_gnt;   
++      /*! Grant for Rx DMA Q */
++      u32 rxdmaq_gnt;   
++      /*! Machine frame number for Tx/Rx doorbell page */
++      u32 doorbell_mfn; 
++      /*! Grant for Tx/Rx doorbell page */
++      u32 doorbell_gnt;
++
++      /* Event Q */
++      /*! Grants for the pages of the EVQ */
++      u32 evq_mem_gnts[EF_HW_FALCON_EVQ_PAGES]; 
++      u32 evq_offs;
++      /*! log2(pages in event Q) */
++      u32 evq_order;    
++      /*! Capacity in events */
++      u32 evq_capacity; 
++      /*! Eventq pointer register physical address */
++      u32 evq_rptr; 
++      /*! Interface instance */
++      u32 instance; 
++      /*! Capacity of RX queue */
++      u32 rx_capacity;
++      /*! Capacity of TX queue */
++      u32 tx_capacity;
++
++      /* NIC */
++      s32 nic_arch;
++      s32 nic_revision;
++      u8 nic_variant;
++};
++
++struct net_accel_hw_falcon_a {
++      struct net_accel_hw_falcon_b common;
++      u32 evq_rptr_gnt;
++};
++
++
++/*! Description of the hardware that the DomU is being given. */
++struct net_accel_msg_hw {
++      u32 type;               /*!< Hardware type */
++      union {
++              struct net_accel_hw_falcon_a falcon_a;
++              struct net_accel_hw_falcon_b falcon_b;
++      } resources;
++};
++
++/*! Start-of-day handshake message. Dom0 fills in its version and
++ * sends, DomU checks, inserts its version and replies
++ */
++struct net_accel_msg_hello {
++      /*! Sender's version (set by each side in turn) */
++      u32 version;    
++      /*! max pages allocated/allowed for buffers */
++      u32 max_pages;      
++};
++
++/*! Maximum number of page requests that can fit in a message. */
++#define NET_ACCEL_MSG_MAX_PAGE_REQ (8)
++
++/*! Request for NIC buffers. DomU fills out pages and grants (and
++ *  optionally reqid), dom0 fills out buf and sends reply
++ */
++struct net_accel_msg_map_buffers {
++      u32 reqid;      /*!< Optional request ID */
++      u32 pages;      /*!< Number of pages to map */
++      u32 grants[NET_ACCEL_MSG_MAX_PAGE_REQ];  /*!< Grant ids to map */ 
++      u32 buf;          /*!< NIC buffer address of pages obtained */
++};
++
++/*! Notification of a change to local mac address, used to filter
++  locally destined packets off the fast path */
++struct net_accel_msg_localmac {
++      u32 flags;      /*!< Should this be added or removed? */
++      u8 mac[ETH_ALEN]; /*!< The mac address to filter onto slow path */
++};
++
++struct net_accel_msg_fastpath {
++      u32 flags;      /*!< Should this be added or removed? */
++      u8  mac[ETH_ALEN];/*!< The mac address to filter onto fast path */
++      u16 port;        /*!< The port of the connection */
++      u32 ip;    /*!< The IP address of the connection */
++      u8  proto;      /*!< The protocol of connection (TCP/UDP) */
++};
++
++/*! Values for struct net_accel_msg_localmac/net_accel_msg_fastpath.flags */
++#define NET_ACCEL_MSG_ADD    0x1
++#define NET_ACCEL_MSG_REMOVE 0x2
++
++/*! Overall message structure */
++struct net_accel_msg {
++      /*! ID specifying type of message */
++      u32 id;              
++      union {
++              /*! handshake */
++              struct net_accel_msg_hello hello;  
++              /*! hardware description */
++              struct net_accel_msg_hw hw;     
++              /*! buffer map request */
++              struct net_accel_msg_map_buffers mapbufs; 
++              /*! mac address of a local interface */
++              struct net_accel_msg_localmac localmac; 
++              /*! address of a new fastpath connection */
++              struct net_accel_msg_fastpath fastpath; 
++              /*! make the message a fixed size */
++              u8 pad[128 - sizeof(u32)]; 
++      }  u;
++};
++
++
++#define NET_ACCEL_MSG_HW_TO_MSG(_u) container_of(_u, struct net_accel_msg, u.hw)
++
++/*! Inter-domain message FIFO */
++typedef struct {
++      struct net_accel_msg *fifo;
++      u32 fifo_mask;
++      u32 *fifo_rd_i;
++      u32 *fifo_wr_i;
++      spinlock_t lock;
++      u32 is_locked; /* Debug flag */
++} sh_msg_fifo2;
++
++
++#define NET_ACCEL_MSG_OFFSET_MASK PAGE_MASK
++
++/* Modifiers */
++#define NET_ACCEL_MSG_REPLY    (0x80000000)
++#define NET_ACCEL_MSG_ERROR    (0x40000000)
++
++/* Dom0 -> DomU and reply. Handshake/version check. */
++#define NET_ACCEL_MSG_HELLO    (0x00000001)
++/* Dom0 -> DomU : hardware setup (VI info.) */
++#define NET_ACCEL_MSG_SETHW    (0x00000002)
++/*
++ * Dom0 -> DomU. Notification of a local mac to add/remove from slow
++ * path filter
++ */
++#define NET_ACCEL_MSG_LOCALMAC (0x00000003)
++/* 
++ * DomU -> Dom0 and reply. Request for buffer table entries for
++ * preallocated pages.
++ */
++#define NET_ACCEL_MSG_MAPBUF   (0x00000004)
++/* 
++ * Dom0 -> DomU. Notification of a local mac to add/remove from fast
++ * path filter
++ */
++#define NET_ACCEL_MSG_FASTPATH (0x00000005)
++
++/*! Initialise a message and set the type
++ * \param msg : the message
++ * \param code : the message type 
++ */
++static inline void net_accel_msg_init(struct net_accel_msg *msg, int code) {
++      msg->id = (u32)code;
++}
++
++/*! initialise a shared page structure
++ * \param shared_page : mapped memory in which the structure resides
++ * \param len : size of the message FIFO area that follows
++ * \param up : initial up/down state of netdev 
++ * \return 0 or an error code
++ */
++extern int net_accel_msg_init_page(void *shared_page, int len, int up);
++
++/*! initialise a message queue 
++ * \param queue : the message FIFO to initialise 
++ * \param indices : the read and write indices in shared memory
++ * \param base : the start of the memory area for the FIFO
++ * \param size : the size of the FIFO in bytes
++ */
++extern void net_accel_msg_init_queue(sh_msg_fifo2 *queue,
++                                   struct net_accel_msg_queue *indices,
++                                   struct net_accel_msg *base, int size);
++
++/* Notify after a batch of messages have been sent */
++extern void net_accel_msg_notify(int irq);
++
++/*! Send a message on the specified FIFO. The message is copied to the 
++ *  current slot of the FIFO.
++ * \param sp : pointer to shared page
++ * \param q : pointer to message FIFO to use
++ * \param msg : pointer to message 
++ * \return 0 on success, -errno on failure
++ */ 
++extern int net_accel_msg_send(struct net_accel_shared_page *sp,
++                            sh_msg_fifo2 *q, 
++                            struct net_accel_msg *msg);
++extern int net_accel_msg_reply(struct net_accel_shared_page *sp,
++                            sh_msg_fifo2 *q, 
++                            struct net_accel_msg *msg);
++
++/*! As net_accel_msg_send but also posts a notification to the far end. */
++extern int net_accel_msg_send_notify(struct net_accel_shared_page *sp, 
++                                   int irq, sh_msg_fifo2 *q, 
++                                   struct net_accel_msg *msg);
++/*! As net_accel_msg_reply but also posts a notification to the far end. */
++extern int net_accel_msg_reply_notify(struct net_accel_shared_page *sp, 
++                                    int irq, sh_msg_fifo2 *q, 
++                                    struct net_accel_msg *msg);
++
++/*! Receive a message on the specified FIFO. Returns 0 on success,
++ *  -errno on failure.
++ */
++extern int net_accel_msg_recv(struct net_accel_shared_page *sp,
++                            sh_msg_fifo2 *q,
++                            struct net_accel_msg *msg);
++
++/*! Look at a received message, if any, so a decision can be made
++ *  about whether to read it now or not.  Cookie is a bit of debug
++ *  which is set here and checked when passed to
++ *  net_accel_msg_recv_next()
++ */
++extern int net_accel_msg_peek(struct net_accel_shared_page *sp,
++                            sh_msg_fifo2 *queue, 
++                            struct net_accel_msg *msg, int *cookie);
++/*! Move the queue onto the next element, used after finished with a
++ *  peeked msg 
++ */
++extern int net_accel_msg_recv_next(struct net_accel_shared_page *sp,
++                                 sh_msg_fifo2 *queue, int cookie);
++
++/*! Start sending a message without copying. returns a pointer to a
++ *  message that will be filled out in place. The queue is locked
++ *  until the message is sent.
++ */
++extern 
++struct net_accel_msg *net_accel_msg_start_send(struct net_accel_shared_page *sp,
++                                             sh_msg_fifo2 *queue,
++                                             unsigned long *flags);
++
++
++/*! Complete the sending of a message started with
++ *  net_accel_msg_start_send. The message is implicit since the queue
++ *  was locked by _start 
++ */
++extern void net_accel_msg_complete_send(struct net_accel_shared_page *sp,
++                                      sh_msg_fifo2 *queue,
++                                      unsigned long *flags);
++
++/*! As net_accel_msg_complete_send but does the notify. */
++extern void net_accel_msg_complete_send_notify(struct net_accel_shared_page *sp, 
++                                             sh_msg_fifo2 *queue,
++                                             unsigned long *flags, int irq);
++
++/*! Lock the queue so that multiple "_locked" functions can be called
++ *  without the queue being modified by others 
++ */
++static inline
++void net_accel_msg_lock_queue(sh_msg_fifo2 *queue, unsigned long *flags)
++{
++      spin_lock_irqsave(&queue->lock, (*flags));
++      rmb();
++      BUG_ON(queue->is_locked);
++      queue->is_locked = 1;
++}
++
++/*! Unlock the queue */
++static inline
++void net_accel_msg_unlock_queue(sh_msg_fifo2 *queue, unsigned long *flags)
++{
++      BUG_ON(!queue->is_locked);
++      queue->is_locked = 0;
++      wmb();
++      spin_unlock_irqrestore(&queue->lock, (*flags));
++}
++
++/*! Give up without sending a message that was started with
++ *  net_accel_msg_start_send() 
++ */
++static inline 
++void net_accel_msg_abort_send(struct net_accel_shared_page *sp,
++                            sh_msg_fifo2 *queue, unsigned long *flags)
++{
++      net_accel_msg_unlock_queue(queue, flags);
++}
++
++/*! Test the queue to ensure there is sufficient space */
++static inline
++int net_accel_msg_check_space(sh_msg_fifo2 *queue, unsigned space)
++{
++      return sh_fifo2_space(queue) >= space;
++}
++
++#endif /* NET_ACCEL_MSG_IFACE_H */
diff --cc drivers/xen/sfc_netutil/accel_shared_fifo.h

index 0000000,0000000..a55608a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_shared_fifo.h
@@@ -1,0 -1,0 +1,127 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#ifndef NET_ACCEL_SHARED_FIFO_H
++#define NET_ACCEL_SHARED_FIFO_H
++
++/*
++ * This is based on fifo.h, but handles sharing between address spaces
++ * that don't trust each other, by splitting out the read and write
++ * indices. This costs at least one pointer indirection more than the
++ * vanilla version per access.
++ */
++
++/*
++ * Byte FIFO descriptor.  The read and write indices are reached through
++ * pointers rather than stored inline.
++ */
++typedef struct {
++      char*    fifo;                /* backing buffer */
++      unsigned      fifo_mask;      /* buffer size minus one; wraps indices */
++      unsigned      *fifo_rd_i;     /* location of the read index */
++      unsigned      *fifo_wr_i;     /* location of the write index */
++} sh_byte_fifo2;
++
++#define SH_FIFO2_M(f, x)     ((x) & ((f)->fifo_mask))
++
++/*
++ * Return the smallest order >= min_order for which (1ul << order) >= n.
++ *
++ * The search never goes past the top bit of unsigned long: shifting by
++ * the full width of the type is undefined, so when n is larger than the
++ * biggest power of two that fits, the highest valid order is returned
++ * instead (and pow2() of the result will then be smaller than n).
++ * A min_order beyond the top bit is returned unchanged.
++ */
++static inline unsigned log2_ge(unsigned long n, unsigned min_order) {
++      /* index of the most significant bit of unsigned long (8-bit bytes) */
++      const unsigned max_order = 8u * sizeof(unsigned long) - 1u;
++      unsigned order = min_order;
++      while (order < max_order && (1ul << order) < n)
++              ++order;
++      return order;
++}
++
++/* Return 2 raised to the power 'order' as an unsigned long. */
++static inline unsigned long pow2(unsigned order)
++{
++      return 1ul << order;
++}
++
++#define is_pow2(x)  (pow2(log2_ge((x), 0)) == (x))
++
++#define sh_fifo2_valid(f)  ((f) && (f)->fifo && (f)->fifo_mask > 0 &&   \
++                          is_pow2((f)->fifo_mask+1u))
++
++#define sh_fifo2_init(f, cap, _rptr, _wptr)           \
++      do {                                            \
++              BUG_ON(!is_pow2((cap) + 1));            \
++              (f)->fifo_rd_i = _rptr;                 \
++              (f)->fifo_wr_i = _wptr;                 \
++              *(f)->fifo_rd_i = *(f)->fifo_wr_i = 0u; \
++              (f)->fifo_mask = (cap);                 \
++      } while(0)
++
++#define sh_fifo2_num(f)      SH_FIFO2_M((f),*(f)->fifo_wr_i - *(f)->fifo_rd_i)
++#define sh_fifo2_space(f)    SH_FIFO2_M((f),*(f)->fifo_rd_i - *(f)->fifo_wr_i-1u)
++#define sh_fifo2_is_empty(f)  (sh_fifo2_num(f)==0)
++#define sh_fifo2_not_empty(f) (sh_fifo2_num(f)!=0)
++#define sh_fifo2_is_full(f)   (sh_fifo2_space(f)==0u)
++#define sh_fifo2_not_full(f)  (sh_fifo2_space(f)!=0u)
++#define sh_fifo2_buf_size(f) ((f)->fifo_mask + 1u)
++#define sh_fifo2_capacity(f) ((f)->fifo_mask)
++#define sh_fifo2_end(f)      ((f)->fifo + sh_fifo2_buf_size(f))
++#define sh_fifo2_not_half_full(f) (sh_fifo2_space(f) > (sh_fifo2_capacity(f) >> 1))
++
++#define sh_fifo2_peek(f)     ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_rd_i)])
++#define sh_fifo2_peekp(f)    ((f)->fifo + SH_FIFO2_M((f), *(f)->fifo_rd_i))
++#define sh_fifo2_poke(f)     ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_wr_i)])
++#define sh_fifo2_pokep(f)    ((f)->fifo + SH_FIFO2_M((f), *(f)->fifo_wr_i))
++#define sh_fifo2_peek_i(f,i) ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_rd_i+(i))])
++#define sh_fifo2_poke_i(f,i) ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_wr_i+(i))])
++
++#define sh_fifo2_rd_next(f)                                   \
++      do {*(f)->fifo_rd_i = *(f)->fifo_rd_i + 1u;} while(0)
++#define sh_fifo2_wr_next(f)                                   \
++      do {*(f)->fifo_wr_i = *(f)->fifo_wr_i + 1u;} while(0)
++#define sh_fifo2_rd_adv(f, n)                                 \
++      do {*(f)->fifo_rd_i = *(f)->fifo_rd_i + (n);} while(0)
++#define sh_fifo2_wr_adv(f, n)                                 \
++      do {*(f)->fifo_wr_i = *(f)->fifo_wr_i + (n);} while(0)
++
++#define sh_fifo2_put(f, v)                                            \
++      do {sh_fifo2_poke(f) = (v); wmb(); sh_fifo2_wr_next(f);} while(0)
++
++#define sh_fifo2_get(f, pv)                                           \
++      do {*(pv) = sh_fifo2_peek(f); mb(); sh_fifo2_rd_next(f);} while(0)
++
++/*
++ * Number of bytes that can be read from the FIFO in one contiguous run,
++ * i.e. without wrapping past the end of the buffer.
++ *
++ * The stored indices are free-running (sh_fifo2_rd_next() and
++ * sh_fifo2_wr_next() increment them without masking), so both are
++ * reduced with SH_FIFO2_M() once and only the masked copies are used
++ * below.  Using the raw read index for the distance to the end of the
++ * buffer would yield a bogus, huge count once it has passed the buffer
++ * size.
++ */
++static inline unsigned sh_fifo2_contig_num(sh_byte_fifo2 *f)
++{
++      unsigned fifo_wr_i = SH_FIFO2_M(f, *f->fifo_wr_i);
++      unsigned fifo_rd_i = SH_FIFO2_M(f, *f->fifo_rd_i);
++
++      return (fifo_wr_i >= fifo_rd_i)
++              ? fifo_wr_i - fifo_rd_i
++              : f->fifo_mask + 1u - fifo_rd_i;
++}
++
++/*
++ * Number of bytes that can be written to the FIFO in one contiguous run,
++ * i.e. without wrapping past the end of the buffer.
++ */
++static inline unsigned sh_fifo2_contig_space(sh_byte_fifo2 *f)
++{
++      unsigned wr = SH_FIFO2_M(f, *f->fifo_wr_i);
++      unsigned rd = SH_FIFO2_M(f, *f->fifo_rd_i);
++
++      /* Reader is ahead in the buffer: free run ends just before it */
++      if (rd > wr)
++              return rd - wr - 1;
++
++      /*
++       * Free run extends to the end of the buffer, except that the
++       * last byte can't be used if the read pointer is at zero.
++       */
++      return f->fifo_mask + 1u - wr - (rd == 0);
++}
++
++
++#endif /* NET_ACCEL_SHARED_FIFO_H */
diff --cc drivers/xen/sfc_netutil/accel_util.c

index 0000000,0000000..d66609b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_util.c
@@@ -1,0 -1,0 +1,336 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#include <linux/slab.h>
++#include <linux/if_ether.h>
++#include <linux/delay.h>
++#include <asm/io.h>
++#include <asm/pgtable.h>
++#include <asm/hypercall.h>
++#include <xen/xenbus.h>
++#include <xen/gnttab.h>
++
++#include "accel_util.h"
++
++#ifdef EFX_GCOV
++#include "gcov.h"
++
++/*
++ * Module init hook, only built when EFX_GCOV is defined: registers this
++ * module with the gcov provider.  Always returns 0.
++ */
++static int __init net_accel_init(void)
++{
++      gcov_provider_init(THIS_MODULE);
++      return 0;
++}
++module_init(net_accel_init);
++
++/*
++ * Module exit hook, only built when EFX_GCOV is defined: undoes the
++ * gcov provider registration made in net_accel_init().
++ */
++static void __exit net_accel_exit(void)
++{
++      gcov_provider_fini(THIS_MODULE);
++}
++module_exit(net_accel_exit);
++#endif
++
++/*
++ * Shut down a remote domain that is misbehaving.
++ *
++ * Logs the request and issues a SCHEDOP_remote_shutdown hypercall with
++ * reason SHUTDOWN_crash for the given domain.  Returns the hypercall's
++ * result.
++ */
++int net_accel_shutdown_remote(int domain)
++{
++      int rc;
++      struct sched_remote_shutdown request = {
++              .reason = SHUTDOWN_crash,
++              .domain_id = domain,
++      };
++
++      EPRINTK("Crashing domain %d\n", domain);
++
++      rc = HYPERVISOR_sched_op(SCHEDOP_remote_shutdown, &request);
++      return rc;
++}
++EXPORT_SYMBOL(net_accel_shutdown_remote);
++
++
++/*
++ * Map a single grant reference offered by the other end of 'dev'.
++ * Based on xenbus_backend_client.c:xenbus_map_ring().
++ *
++ * On success the mapping handle is stored in *handle and, if
++ * dev_bus_addr is non-NULL, the bus address reported by the map
++ * operation is stored in *dev_bus_addr.
++ *
++ * Returns 0 on success.  On failure the status is reported through
++ * xenbus_dev_error() and -EINVAL is returned.
++ */
++static int net_accel_map_grant(struct xenbus_device *dev, int gnt_ref,
++                             grant_handle_t *handle, void *vaddr,
++                             u64 *dev_bus_addr, unsigned flags)
++{
++      struct gnttab_map_grant_ref op;
++
++      gnttab_set_map_op(&op, (unsigned long)vaddr, flags,
++                        gnt_ref, dev->otherend_id);
++
++      gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &op);
++
++      if (op.status != GNTST_okay) {
++              xenbus_dev_error(dev, op.status,
++                               "failed mapping in shared page %d from domain %d\n",
++                               gnt_ref, dev->otherend_id);
++              return -EINVAL;
++      }
++
++      *handle = op.handle;
++      if (dev_bus_addr)
++              *dev_bus_addr = op.dev_bus_addr;
++
++      return 0;
++}
++
++
++/*
++ * Undo a mapping made by net_accel_map_grant().
++ * Based on xenbus_backend_client.c:xenbus_unmap_ring().
++ *
++ * A non-zero dev_bus_addr is copied into the unmap request.  The
++ * hypercall itself must succeed (BUG otherwise); a bad status in the
++ * reply is reported through xenbus_dev_error().
++ *
++ * Returns 0 if the reply status is GNTST_okay, -EINVAL otherwise.
++ */
++static int net_accel_unmap_grant(struct xenbus_device *dev,
++                               grant_handle_t handle,
++                               void *vaddr, u64 dev_bus_addr,
++                               unsigned flags)
++{
++      struct gnttab_unmap_grant_ref op;
++
++      gnttab_set_unmap_op(&op, (unsigned long)vaddr, flags, handle);
++      if (dev_bus_addr)
++              op.dev_bus_addr = dev_bus_addr;
++
++      BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
++
++      if (op.status == GNTST_okay)
++              return 0;
++
++      xenbus_dev_error(dev, op.status,
++                       "failed unmapping page at handle %d error %d\n",
++                       handle, op.status);
++      return -EINVAL;
++}
++
++
++/*
++ * Map a granted page for device access (GNTMAP_device_map).  No virtual
++ * address is supplied; the handle and bus address come back through
++ * 'handle' and 'dev_bus_addr'.  Returns 0 or -EINVAL, as
++ * net_accel_map_grant() does.
++ */
++int net_accel_map_device_page(struct xenbus_device *dev,
++                            int gnt_ref, grant_handle_t *handle,
++                            u64 *dev_bus_addr)
++{
++      return net_accel_map_grant(dev, gnt_ref, handle, NULL,
++                                 dev_bus_addr, GNTMAP_device_map);
++}
++EXPORT_SYMBOL_GPL(net_accel_map_device_page);
++
++ 
++/*
++ * Undo net_accel_map_device_page() for the given handle and bus
++ * address.  Returns 0 or -EINVAL, as net_accel_unmap_grant() does.
++ */
++int net_accel_unmap_device_page(struct xenbus_device *dev,
++                              grant_handle_t handle, u64 dev_bus_addr)
++{
++      return net_accel_unmap_grant(dev, handle, NULL, dev_bus_addr,
++                                   GNTMAP_device_map);
++}
++EXPORT_SYMBOL_GPL(net_accel_unmap_device_page);
++
++
++/*
++ * Bookkeeping for one contiguous grant mapping; holds what is needed to
++ * undo it again.  Allocated with room for 'pages' trailing handles.
++ */
++struct net_accel_valloc_grant_mapping {
++      struct vm_struct *vm;           /* the reserved virtual area */
++      int pages;                      /* number of pages mapped into it */
++      grant_handle_t grant_handles[0];/* one handle per mapped page */
++};
++
++/*
++ * Map a series of grants into a contiguous virtual area.
++ *
++ * Reserves npages pages of virtual address space with alloc_vm_area()
++ * and maps grants[0..npages-1] into consecutive pages of it, using the
++ * given map flags.
++ *
++ * If priv is non-NULL, *priv receives the bookkeeping record that
++ * net_accel_unmap_grants_vfree() needs to undo the mapping.
++ * NOTE(review): when priv is NULL the record -- and with it the grant
++ * handles -- is freed straight away, so the mapping apparently cannot be
++ * undone through net_accel_unmap_grants_vfree(); confirm that no caller
++ * relies on passing NULL.
++ *
++ * Returns the start of the mapped area, or NULL if an allocation or any
++ * single mapping fails (pages mapped so far are unmapped again first).
++ */
++static void *net_accel_map_grants_valloc(struct xenbus_device *dev, 
++                                       unsigned *grants, int npages, 
++                                       unsigned flags, void **priv)
++{
++      struct net_accel_valloc_grant_mapping *map;
++      struct vm_struct *vm;
++      void *addr;
++      int i, j, rc;
++
++      vm  = alloc_vm_area(PAGE_SIZE * npages);
++      if (vm == NULL) {
++              EPRINTK("No memory from alloc_vm_area.\n");
++              return NULL;
++      }
++      /* 
++       * Get a structure in which we will record all the info needed
++       * to undo the mapping.
++       */
++      map = kzalloc(sizeof(struct net_accel_valloc_grant_mapping)  + 
++                    npages * sizeof(grant_handle_t), GFP_KERNEL);
++      if (map == NULL) {
++              EPRINTK("No memory for net_accel_valloc_grant_mapping\n");
++              free_vm_area(vm);
++              return NULL;
++      }
++      map->vm = vm;
++      map->pages = npages;
++
++      /* Do the actual mapping */
++      addr = vm->addr;
++
++      for (i = 0; i < npages; i++) {
++              rc = net_accel_map_grant(dev, grants[i], map->grant_handles + i, 
++                                       addr, NULL, flags);
++              if (rc < 0)
++                      goto undo;
++              addr = (void*)((unsigned long)addr + PAGE_SIZE);
++      }
++
++      /* Hand the record to the caller, or drop it if it is not wanted */
++      if (priv)
++              *priv = (void *)map;
++      else
++              kfree(map);
++
++      return vm->addr;
++
++ undo:
++      EPRINTK("Aborting contig map due to single map failure %d (%d of %d)\n",
++              rc, i+1, npages);
++      /* Pages 0..i-1 were mapped successfully; page i is the one that failed */
++      for (j = 0; j < i; j++) {
++              addr = (void*)((unsigned long)vm->addr + (j * PAGE_SIZE));
++              net_accel_unmap_grant(dev, map->grant_handles[j], addr, 0,
++                                    flags);
++      }
++      free_vm_area(vm);
++      kfree(map);
++      return NULL;
++}
++
++/*
++ * Undo a mapping made by net_accel_map_grants_valloc().
++ *
++ * 'priv' is the bookkeeping record that the mapping function handed
++ * back; every page it describes is unmapped, then the virtual area and
++ * the record itself are freed.
++ */
++static void net_accel_unmap_grants_vfree(struct xenbus_device *dev,
++                                       unsigned flags, void *priv)
++{
++      struct net_accel_valloc_grant_mapping *map = priv;
++      unsigned long va = (unsigned long)map->vm->addr;
++      int count = map->pages;
++      int n;
++
++      for (n = 0; n < count; n++, va += PAGE_SIZE)
++              net_accel_unmap_grant(dev, map->grant_handles[n],
++                                    (void *)va, 0, flags);
++
++      free_vm_area(map->vm);
++      kfree(map);
++}
++
++
++/*
++ * Map 'npages' grant references into one contiguous virtual range using
++ * host mappings (GNTMAP_host_map).  Thin wrapper around
++ * net_accel_map_grants_valloc(); returns whatever that returns.
++ */
++void *net_accel_map_grants_contig(struct xenbus_device *dev,
++                              unsigned *grants, int npages,
++                              void **priv)
++{
++      void *base;
++
++      base = net_accel_map_grants_valloc(dev, grants, npages,
++                                         GNTMAP_host_map, priv);
++      return base;
++}
++EXPORT_SYMBOL(net_accel_map_grants_contig);
++
++
++/*
++ * Release a host mapping created by net_accel_map_grants_contig();
++ * 'priv' is the private data that call handed back.
++ */
++void net_accel_unmap_grants_contig(struct xenbus_device *dev, void *priv)
++{
++      net_accel_unmap_grants_vfree(dev, GNTMAP_host_map, priv);
++}
++EXPORT_SYMBOL(net_accel_unmap_grants_contig);
++
++
++/*
++ * Map a single granted page with a host mapping (GNTMAP_host_map).
++ *
++ * Returns the mapped address, or NULL on failure.  If priv is non-NULL,
++ * *priv receives the private data to pass to
++ * net_accel_unmap_iomem_page().
++ */
++void *net_accel_map_iomem_page(struct xenbus_device *dev, int gnt_ref,
++                           void **priv)
++{
++      /*
++       * The mapping helper takes an array of unsigned grant references;
++       * use a correctly typed one-element copy instead of passing a
++       * pointer to the int parameter.
++       */
++      unsigned grant = gnt_ref;
++
++      return net_accel_map_grants_valloc(dev, &grant, 1, GNTMAP_host_map, priv);
++}
++EXPORT_SYMBOL(net_accel_map_iomem_page);
++
++
++/*
++ * Release a host mapping created by net_accel_map_iomem_page(); 'priv'
++ * is the private data that call handed back.
++ */
++void
++net_accel_unmap_iomem_page(struct xenbus_device *dev, void *priv)
++{
++      net_accel_unmap_grants_vfree(dev, GNTMAP_host_map, priv);
++}
++EXPORT_SYMBOL(net_accel_unmap_iomem_page);
++
++
++/*
++ * Grant the other end of 'dev' access to the page at 'mfn'.  The
++ * GTF_PCD flag is passed when 'is_iomem' is non-zero.
++ *
++ * Returns the result of gnttab_grant_foreign_access(); a negative
++ * result is also reported through xenbus_dev_error().
++ */
++int net_accel_grant_page(struct xenbus_device *dev, unsigned long mfn,
++                       int is_iomem)
++{
++      int ref;
++
++      ref = gnttab_grant_foreign_access(dev->otherend_id, mfn,
++                                        is_iomem ? GTF_PCD : 0);
++      if (ref >= 0)
++              return ref;
++
++      xenbus_dev_error(dev, ref, "failed granting access to page\n");
++      return ref;
++}
++EXPORT_SYMBOL_GPL(net_accel_grant_page);
++
++
++/*
++ * Undo a net_accel_grant_page().
++ *
++ * Returns -EBUSY (after logging) if gnttab_query_foreign_access()
++ * reports the grant as still in use; otherwise ends the foreign access
++ * and returns 0.
++ */
++int net_accel_ungrant_page(grant_ref_t gntref)
++{
++      if (likely(gnttab_query_foreign_access(gntref) == 0)) {
++              gnttab_end_foreign_access(gntref, 0);
++              return 0;
++      }
++
++      EPRINTK("%s: remote domain still using grant %d\n", __FUNCTION__,
++              gntref);
++      return -EBUSY;
++}
++EXPORT_SYMBOL_GPL(net_accel_ungrant_page);
++
++
++/*
++ * Read the "mac" node of the device from xenstore and parse it into
++ * mac[0..ETH_ALEN-1].
++ *
++ * The string must consist of ETH_ALEN hexadecimal fields separated by
++ * ':' and ending exactly at the end of the string.
++ *
++ * Returns 0 on success, the xenbus_read() error if the node cannot be
++ * read, or -ENOENT if the string is malformed (mac[] may then be
++ * partially written).
++ */
++int net_accel_xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
++{
++      char *macstr, *pos, *end;
++      int rc = 0;
++      int i;
++
++      macstr = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
++      if (IS_ERR(macstr))
++              return PTR_ERR(macstr);
++
++      pos = macstr;
++      for (i = 0; i < ETH_ALEN; i++) {
++              /* every field is followed by ':' except the last one */
++              char expected = (i == ETH_ALEN-1) ? '\0' : ':';
++
++              mac[i] = simple_strtoul(pos, &end, 16);
++              if (pos == end || *end != expected) {
++                      rc = -ENOENT;
++                      break;
++              }
++              pos = end + 1;
++      }
++
++      kfree(macstr);
++      return rc;
++}
++EXPORT_SYMBOL_GPL(net_accel_xen_net_read_mac);
++
++
++/*
++ * Publish 'state' as the "accelstate" node under the device's xenstore
++ * directory.
++ *
++ * Nothing is written if the device's node does not exist.  The write is
++ * done inside a xenbus transaction, which is retried for as long as
++ * committing it returns -EAGAIN.  Any other failure is dropped: the
++ * function has no way of reporting it to the caller.
++ */
++void net_accel_update_state(struct xenbus_device *dev, int state)
++{
++      struct xenbus_transaction tr;
++      int err;
++
++      DPRINTK("%s: setting accelstate to %s\n", __FUNCTION__,
++              xenbus_strstate(state));
++
++      if (!xenbus_exists(XBT_NIL, dev->nodename, ""))
++              return;
++
++      VPRINTK("%s: nodename %s\n", __FUNCTION__, dev->nodename);
++
++      do {
++              err = xenbus_transaction_start(&tr);
++              if (err != 0) {
++                      /*
++                       * No transaction was opened, so 'tr' is not valid
++                       * and there is nothing to abort.
++                       */
++                      return;
++              }
++
++              err = xenbus_printf(tr, dev->nodename, "accelstate",
++                                  "%d", state);
++              if (err != 0) {
++                      /* Abort the transaction we did open */
++                      xenbus_transaction_end(tr, 1);
++                      return;
++              }
++
++              err = xenbus_transaction_end(tr, 0);
++      } while (err == -EAGAIN);
++}
++EXPORT_SYMBOL_GPL(net_accel_update_state);
++
++MODULE_LICENSE("GPL");
diff --cc drivers/xen/sfc_netutil/accel_util.h

index 0000000,0000000..66f96d8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_util.h
@@@ -1,0 -1,0 +1,124 @@@
++/****************************************************************************
++ * Solarflare driver for Xen network acceleration
++ *
++ * Copyright 2006-2008: Solarflare Communications Inc,
++ *                      9501 Jeronimo Road, Suite 250,
++ *                      Irvine, CA 92618, USA
++ *
++ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 as published
++ * by the Free Software Foundation, incorporated herein by reference.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
++ ****************************************************************************
++ */
++
++#ifndef NETBACK_ACCEL_UTIL_H
++#define NETBACK_ACCEL_UTIL_H
++
++#ifdef DPRINTK
++#undef DPRINTK
++#endif
++
++/*
++ * Last path component of __FILE__.  Parenthesized so the conditional
++ * expression stays intact wherever the macro is expanded.
++ */
++#define FILE_LEAF (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
++
++#if 1
++#define VPRINTK(_f, _a...) 
++#else
++#define VPRINTK(_f, _a...)                    \
++      printk("(file=%s, line=%d) " _f,        \
++             FILE_LEAF , __LINE__ , ## _a )
++#endif
++
++#if 1
++#define DPRINTK(_f, _a...) 
++#else
++#define DPRINTK(_f, _a...)                    \
++      printk("(file=%s, line=%d) " _f,        \
++             FILE_LEAF , __LINE__ , ## _a )
++#endif
++
++#define EPRINTK(_f, _a...)                    \
++      printk("(file=%s, line=%d) " _f,        \
++             FILE_LEAF , __LINE__ , ## _a )
++
++#define EPRINTK_ON(exp)                                                       \
++      do {                                                            \
++              if (exp)                                                \
++                      EPRINTK("%s at %s:%d\n", #exp, __FILE__, __LINE__); \
++      } while(0)
++
++#define DPRINTK_ON(exp)                                                       \
++      do {                                                            \
++              if (exp)                                                \
++                      DPRINTK("%s at %s:%d\n", #exp, __FILE__, __LINE__); \
++      } while(0)
++
++#include <xen/xenbus.h>
++
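++/*! Map a set of pages from another domain
++ * \param dev The xenbus device context
++ * \param grants The grant references to map
++ * \param npages The number of grant references in \p grants
++ * \param priv Where to store the private data returned by the mapping
++ *             function, for use when unmapping
++ */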
++extern 
++void *net_accel_map_grants_contig(struct xenbus_device *dev, 
++                                unsigned *grants, int npages, 
++                                void **priv);
++
++/*! Unmap a set of pages mapped using net_accel_map_grants_contig.
++ * \param dev The xenbus device context
++ * \param priv The private data returned by the mapping function 
++ */
++extern 
++void net_accel_unmap_grants_contig(struct xenbus_device *dev, void *priv);
++
++/*! Read the MAC address of a device from xenstore */
++extern
++int net_accel_xen_net_read_mac(struct xenbus_device *dev, u8 mac[]);
++
++/*! Update the accelstate field for a device in xenstore */
++extern
++void net_accel_update_state(struct xenbus_device *dev, int state);
++
++/* These four map/unmap functions are based on
++ * xenbus_backend_client.c:xenbus_map_ring().  However, they are not
++ * used for ring buffers, instead just to map pages between domains,
++ * or to map a page so that it is accessible by a device
++ */
++extern
++int net_accel_map_device_page(struct xenbus_device *dev,  
++                            int gnt_ref, grant_handle_t *handle,
++                            u64 *dev_bus_addr);
++extern
++int net_accel_unmap_device_page(struct xenbus_device *dev,
++                              grant_handle_t handle, u64 dev_bus_addr);
++extern
++void *net_accel_map_iomem_page(struct xenbus_device *dev, int gnt_ref,
++                           void **priv);
++extern
++void net_accel_unmap_iomem_page(struct xenbus_device *dev, void *priv);
++
++/*! Grant a page to remote domain */
++extern
++int net_accel_grant_page(struct xenbus_device *dev, unsigned long mfn, 
++                       int is_iomem);
++/*! Undo a net_accel_grant_page */
++extern
++int net_accel_ungrant_page(grant_ref_t gntref);
++
++
++/*! Shutdown remote domain that is misbehaving */
++extern
++int net_accel_shutdown_remote(int domain);
++
++
++#endif
diff --cc drivers/xen/sys-hypervisor.c

index 1e0fe01,60f1827..af529fe
--- 1/drivers/xen/sys-hypervisor.c
--- 2/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@@ -20,6 -20,6 +20,8 @@@
   #include <xen/interface/xen.h>
   #include <xen/interface/version.h>
   
++#include "xenbus/xenbus_comms.h"
++
   #define HYPERVISOR_ATTR_RO(_name) \
   static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
   
@@@ -118,9 -118,9 +120,8 @@@ static ssize_t uuid_show(struct hyp_sys
   {
         char *vm, *val;
         int ret;
--      extern int xenstored_ready;
   
--      if (!xenstored_ready)
++      if (!is_xenstored_ready())
                 return -EBUSY;
   
         vm = xenbus_read(XBT_NIL, "vm", "", NULL);
diff --cc drivers/xen/tpmback/Makefile

index 0000000,0000000..d5865c4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/tpmback/Makefile
@@@ -1,0 -1,0 +1,4 @@@
++
++obj-$(CONFIG_XEN_TPMDEV_BACKEND)      += tpmbk.o
++
++tpmbk-y += tpmback.o interface.o xenbus.o
diff --cc drivers/xen/tpmback/common.h

index 0000000,0000000..2ab8711

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/tpmback/common.h
@@@ -1,0 -1,0 +1,94 @@@
++/******************************************************************************
++ * drivers/xen/tpmback/common.h
++ */
++
++#ifndef __TPM__BACKEND__COMMON_H__
++#define __TPM__BACKEND__COMMON_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/mm.h>
++#include <linux/slab.h>
++#include <xen/xenbus.h>
++#include <xen/interface/event_channel.h>
++#include <xen/interface/io/tpmif.h>
++
++#define DPRINTK(_f, _a...)                    \
++      pr_debug("(file=%s, line=%d) " _f,      \
++               __FILE__ , __LINE__ , ## _a )
++
++struct backend_info
++{
++      struct xenbus_device *dev;
++
++      /* our communications channel */
++      struct tpmif_st *tpmif;
++
++      long int frontend_id;
++      long int instance; // instance of TPM
++      u8 is_instance_set;// whether instance number has been set
++
++      /* watch front end for changes */
++      struct xenbus_watch backend_watch;
++};
++
++typedef struct tpmif_st {
++      struct list_head tpmif_list;
++      /* Unique identifier for this interface. */
++      domid_t domid;
++      unsigned int handle;
++
++      /* Physical parameters of the comms window. */
++      unsigned int irq;
++
++      /* The shared rings and indexes. */
++      tpmif_tx_interface_t *tx;
++      struct vm_struct *tx_area;
++
++      /* Miscellaneous private stuff. */
++      enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
++      int active;
++
++      struct tpmif_st *hash_next;
++      struct list_head list;  /* scheduling list */
++      atomic_t refcnt;
++
++      struct backend_info *bi;
++
++      struct page **mmap_pages;
++
++      char devname[20];
++} tpmif_t;
++
++void tpmif_disconnect_complete(tpmif_t * tpmif);
++tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi);
++int tpmif_interface_init(void);
++void tpmif_interface_exit(void);
++void tpmif_schedule_work(tpmif_t * tpmif);
++void tpmif_deschedule_work(tpmif_t * tpmif);
++int tpmif_xenbus_init(void);
++void tpmif_xenbus_exit(void);
++int tpmif_map(tpmif_t *, grant_ref_t, evtchn_port_t);
++irqreturn_t tpmif_be_int(int irq, void *dev_id);
++
++long int tpmback_get_instance(struct backend_info *bi);
++
++int vtpm_release_packets(tpmif_t * tpmif, int send_msgs);
++
++
++#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt))
++#define tpmif_put(_b)                                 \
++      do {                                            \
++              if (atomic_dec_and_test(&(_b)->refcnt)) \
++                      tpmif_disconnect_complete(_b);  \
++      } while (0)
++
++extern int num_frontends;
++
++/*
++ * Kernel address, as an unsigned long, of the page at position 'idx' in
++ * the interface's mmap_pages array.
++ */
++static inline unsigned long idx_to_kaddr(tpmif_t *t, unsigned int idx)
++{
++      struct page *pg = t->mmap_pages[idx];
++
++      return (unsigned long)pfn_to_kaddr(page_to_pfn(pg));
++}
++
++#endif /* __TPM__BACKEND__COMMON_H__ */
diff --cc drivers/xen/tpmback/interface.c

index 0000000,0000000..37850c8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/tpmback/interface.c
@@@ -1,0 -1,0 +1,133 @@@
++ /*****************************************************************************
++ * drivers/xen/tpmback/interface.c
++ *
++ * Virtual TPM interface management.
++ *
++ * Copyright (c) 2005, IBM Corporation
++ *
++ * Author: Stefan Berger, stefanb@us.ibm.com
++ *
++ * This code has been derived from drivers/xen/netback/interface.c
++ * Copyright (c) 2004, Keir Fraser
++ */
++
++#include "common.h"
++#include <linux/delay.h>
++#include <linux/err.h>
++#include <linux/vmalloc.h>
++#include <xen/balloon.h>
++#include <xen/evtchn.h>
++#include <xen/gnttab.h>
++
++static struct kmem_cache *tpmif_cachep;
++int num_frontends = 0;
++
++LIST_HEAD(tpmif_list);
++
++/*
++ * Allocate and initialise a new interface for 'domid' / 'bi'.
++ *
++ * The new interface starts DISCONNECTED with a reference count of one,
++ * is added to tpmif_list and counted in num_frontends.
++ *
++ * Returns the interface, or ERR_PTR(-ENOMEM) if either the interface or
++ * its page vector cannot be allocated.
++ */
++static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi)
++{
++      tpmif_t *tpmif = kmem_cache_zalloc(tpmif_cachep, GFP_KERNEL);
++
++      if (!tpmif)
++              goto fail;
++
++      tpmif->domid = domid;
++      tpmif->status = DISCONNECTED;
++      tpmif->bi = bi;
++      snprintf(tpmif->devname, sizeof(tpmif->devname), "tpmif%d", domid);
++      atomic_set(&tpmif->refcnt, 1);
++
++      tpmif->mmap_pages = alloc_empty_pages_and_pagevec(TPMIF_TX_RING_SIZE);
++      if (!tpmif->mmap_pages)
++              goto fail_free;
++
++      list_add(&tpmif->tpmif_list, &tpmif_list);
++      num_frontends++;
++
++      return tpmif;
++
++ fail_free:
++      kmem_cache_free(tpmif_cachep, tpmif);
++ fail:
++      pr_err("%s: out of memory\n", __FUNCTION__);
++      return ERR_PTR(-ENOMEM);
++}
++
++/*
++ * Release an interface created by alloc_tpmif(): drop it from the
++ * frontend count and from tpmif_list, free its page vector and return
++ * the structure to the cache.
++ */
++static void free_tpmif(tpmif_t * tpmif)
++{
++      num_frontends--;
++      list_del(&tpmif->tpmif_list);
++      free_empty_pages_and_pagevec(tpmif->mmap_pages, TPMIF_TX_RING_SIZE);
++      kmem_cache_free(tpmif_cachep, tpmif);
++}
++
++/*
++ * Look up the interface belonging to backend 'bi', creating one if none
++ * exists yet.
++ *
++ * If 'bi' already has an interface for 'domid', a reference is taken
++ * and that interface is returned.  If 'bi' already has an interface for
++ * a different domain, ERR_PTR(-EEXIST) is returned.  Otherwise the
++ * result of alloc_tpmif() is returned.
++ */
++tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi)
++{
++      tpmif_t *tpmif;
++
++      list_for_each_entry(tpmif, &tpmif_list, tpmif_list) {
++              if (tpmif->bi != bi)
++                      continue;
++
++              if (tpmif->domid != domid)
++                      return ERR_PTR(-EEXIST);
++
++              tpmif_get(tpmif);
++              return tpmif;
++      }
++
++      return alloc_tpmif(domid, bi);
++}
++
++/*
++ * Connect an interface to its frontend: map the shared ring page and
++ * bind the event channel to tpmif_be_int().
++ *
++ * Does nothing and returns 0 if an irq is already bound.  On success
++ * the ring is cleared, the interface is marked active and 0 is
++ * returned.  On failure a negative error is returned and the interface
++ * is left without a ring mapping.
++ */
++int tpmif_map(tpmif_t *tpmif, grant_ref_t ring_ref, evtchn_port_t evtchn)
++{
++      struct vm_struct *area;
++      int err;
++
++      if (tpmif->irq)
++              return 0;
++
++      area = xenbus_map_ring_valloc(tpmif->bi->dev, ring_ref);
++      if (IS_ERR(area))
++              return PTR_ERR(area);
++      tpmif->tx_area = area;
++
++      tpmif->tx = (tpmif_tx_interface_t *)area->addr;
++      clear_page(tpmif->tx);
++
++      err = bind_interdomain_evtchn_to_irqhandler(
++              tpmif->domid, evtchn, tpmif_be_int, 0, tpmif->devname, tpmif);
++      if (err < 0) {
++              xenbus_unmap_ring_vfree(tpmif->bi->dev, area);
++              /*
++               * Forget the ring: tpmif_disconnect_complete() unmaps
++               * whenever tx is set, and would otherwise unmap the
++               * already released area a second time.
++               */
++              tpmif->tx = NULL;
++              tpmif->tx_area = NULL;
++              return err;
++      }
++      tpmif->irq = err;
++
++      tpmif->active = 1;
++
++      return 0;
++}
++
++/*
++ * Final teardown of an interface; invoked by tpmif_put() when the last
++ * reference is dropped.  Unbinds the irq handler if one is bound, unmaps
++ * the shared ring if one is mapped, then frees the interface.
++ */
++void tpmif_disconnect_complete(tpmif_t *tpmif)
++{
++      if (tpmif->irq)
++              unbind_from_irqhandler(tpmif->irq, tpmif);
++
++      if (tpmif->tx)
++              xenbus_unmap_ring_vfree(tpmif->bi->dev, tpmif->tx_area);
++
++      free_tpmif(tpmif);
++}
++
++/*
++ * Create the slab cache that tpmif_t objects are allocated from.
++ * Returns 0 on success or -ENOMEM if the cache cannot be created.
++ */
++int __init tpmif_interface_init(void)
++{
++      tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t),
++                                       0, 0, NULL);
++      if (!tpmif_cachep)
++              return -ENOMEM;
++
++      return 0;
++}
++
++/* Destroy the slab cache created by tpmif_interface_init(). */
++void tpmif_interface_exit(void)
++{
++      kmem_cache_destroy(tpmif_cachep);
++}
diff --cc drivers/xen/tpmback/tpmback.c

index 0000000,0000000..ca96eee

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/tpmback/tpmback.c
@@@ -1,0 -1,0 +1,946 @@@
++/******************************************************************************
++ * drivers/xen/tpmback/tpmback.c
++ *
++ * Copyright (c) 2005, IBM Corporation
++ *
++ * Author: Stefan Berger, stefanb@us.ibm.com
++ * Grant table support: Mahadevan Gomathisankaran
++ *
++ * This code has been derived from drivers/xen/netback/netback.c
++ * Copyright (c) 2002-2004, K A Fraser
++ *
++ */
++
++#include "common.h"
++#include <xen/evtchn.h>
++
++#include <linux/types.h>
++#include <linux/list.h>
++#include <linux/miscdevice.h>
++#include <linux/poll.h>
++#include <linux/delay.h>
++#include <asm/uaccess.h>
++#include <xen/xenbus.h>
++#include <xen/interface/grant_table.h>
++#include <xen/gnttab.h>
++
++/* local data structures */
++/*
++ * State shared between the front-end receive path and the /dev/vtpm
++ * file operations.
++ */
++struct data_exchange {
++      /* packets queued by vtpm_queue_packet(), waiting to be read */
++      struct list_head pending_pak;
++      /* packets fully read by the application, waiting for a response */
++      struct list_head current_pak;
++      /* bytes of the first pending packet already returned by read() */
++      unsigned int copied_so_far;
++      /* set while the device is open (vtpm_op_open/vtpm_op_release) */
++      u8 has_opener:1;
++      /*
++       * set by vtpm_release_packets() when the first pending packet was
++       * released; makes the next read() return -EIO
++       */
++      u8 aborted:1;
++      rwlock_t pak_lock;      // protects all of the previous fields
++      /* readers and pollers wait here for pending packets */
++      wait_queue_head_t wait_queue;
++};
++
++/*
++ * Header expected at the start of every buffer written to the device
++ * (see vtpm_op_write()).  instance_no and len_no are converted with
++ * ntohl() there, i.e. they are in network byte order; the other fields
++ * presumably are as well - TODO confirm.
++ */
++struct vtpm_resp_hdr {
++      uint32_t instance_no;   /* TPM instance the response belongs to */
++      uint16_t tag_no;        /* response tag */
++      uint32_t len_no;        /* length of the response after instance_no */
++      uint32_t ordinal_no;    /* 4 bytes "for the ordinal" per vtpm_op_write() */
++} __attribute__ ((packed));
++
++/* One request travelling from a front end to the vTPM application. */
++struct packet {
++      /* links the packet into dataex.pending_pak or dataex.current_pak */
++      struct list_head next;
++      /* size given to packet_alloc() */
++      unsigned int data_len;
++      /*
++       * optional payload buffer; when NULL packet_read() reads from the
++       * shared memory pages instead.  Freed by packet_free().
++       */
++      u8 *data_buffer;
++      /* originating interface; a reference is held while set */
++      tpmif_t *tpmif;
++      /* result of tpmback_get_instance() at allocation time */
++      u32 tpm_instance;
++      /* request tag; fail messages answer with req_tag + 3 */
++      u8 req_tag;
++      /* bytes handed out so far, including the 4-byte instance prefix */
++      u32 last_read;
++      /* PACKET_FLAG_* bits */
++      u8 flags;
++      /* fires processing_timeout() when the packet is not handled in time */
++      struct timer_list processing_timer;
++};
++
++/* Bits for struct packet.flags. */
++enum {
++      /* packet_write() acknowledges the data but does not send it */
++      PACKET_FLAG_DISCARD_RESPONSE = 1,
++};
++
++/* local variables */
++static struct data_exchange dataex;
++
++/* local function prototypes */
++static int _packet_write(struct packet *pak,
++                       const char *data, size_t size, int userbuffer);
++static void processing_timeout(unsigned long ptr);
++static int packet_read_shmem(struct packet *pak,
++                           tpmif_t * tpmif,
++                           u32 offset,
++                           char *buffer, int isuserbuffer, u32 left);
++static int vtpm_queue_packet(struct packet *pak);
++
++/***************************************************************
++ Buffer copying for user and kernel space buffers.
++***************************************************************/
++/*
++ * Copy 'size' bytes from 'from' to the kernel buffer 'to'.
++ * 'from' is treated as a user-space pointer when 'isuserbuffer' is
++ * non-zero and as a kernel pointer otherwise.
++ * Returns 0 on success or -EFAULT if the user copy failed.
++ */
++static inline int copy_from_buffer(void *to,
++                                 const void *from, unsigned long size,
++                                 int isuserbuffer)
++{
++      if (!isuserbuffer) {
++              memcpy(to, from, size);
++              return 0;
++      }
++
++      return copy_from_user(to, (void __user *)from, size) ? -EFAULT : 0;
++}
++
++/*
++ * Copy 'size' bytes from the kernel buffer 'from' to 'to'.
++ * 'to' is treated as a user-space pointer when 'isuserbuffer' is
++ * non-zero and as a kernel pointer otherwise.
++ * Returns 0 on success or -EFAULT if the user copy failed.
++ */
++static inline int copy_to_buffer(void *to,
++                               const void *from, unsigned long size,
++                               int isuserbuffer)
++{
++      if (!isuserbuffer) {
++              memcpy(to, from, size);
++              return 0;
++      }
++
++      return copy_to_user((void __user *)to, from, size) ? -EFAULT : 0;
++}
++
++
++/*
++ * Bring a data_exchange structure into its idle state: empty packet
++ * lists, no opener, no abort pending, nothing copied yet.
++ */
++static void dataex_init(struct data_exchange *dataex)
++{
++      INIT_LIST_HEAD(&dataex->pending_pak);
++      INIT_LIST_HEAD(&dataex->current_pak);
++      /*
++       * Initialise every field explicitly instead of relying on the
++       * structure having been zeroed by its storage class.
++       */
++      dataex->copied_so_far = 0;
++      dataex->has_opener = 0;
++      dataex->aborted = 0;
++      rwlock_init(&dataex->pak_lock);
++      init_waitqueue_head(&dataex->wait_queue);
++}
++
++/***************************************************************
++ Packet-related functions
++***************************************************************/
++
++/*
++ * Return the first packet on 'head' whose tpm_instance equals
++ * 'tpm_instance', or NULL if there is none.
++ */
++static struct packet *packet_find_instance(struct list_head *head,
++                                         u32 tpm_instance)
++{
++      struct packet *cur;
++
++      list_for_each_entry(cur, head, next) {
++              if (cur->tpm_instance == tpm_instance)
++                      return cur;
++      }
++
++      return NULL;
++}
++
++/*
++ * Check whether 'packet' is linked into the list 'head'.
++ * Returns the packet if it is found on the list, NULL otherwise.
++ * Only the pointer value is compared; 'packet' is never dereferenced.
++ */
++static struct packet *packet_find_packet(struct list_head *head, void *packet)
++{
++      struct packet *cur;
++
++      list_for_each_entry(cur, head, next) {
++              if (cur == packet)
++                      return cur;
++      }
++
++      return NULL;
++}
++
++/*
++ * Allocate and initialise a packet (GFP_ATOMIC).
++ *
++ * If 'tpmif' is given, the packet records it together with its TPM
++ * instance number and takes a reference on it; packet_free() drops that
++ * reference again.  The processing timer is set up but not started.
++ *
++ * Returns the new packet or NULL if the allocation failed.
++ */
++static struct packet *packet_alloc(tpmif_t *tpmif,
++                                 u32 size, u8 req_tag, u8 flags)
++{
++      struct packet *pak = kzalloc(sizeof(*pak), GFP_ATOMIC);
++
++      if (!pak)
++              return NULL;
++
++      if (tpmif) {
++              pak->tpmif = tpmif;
++              pak->tpm_instance = tpmback_get_instance(tpmif->bi);
++              tpmif_get(tpmif);
++      }
++      pak->data_len = size;
++      pak->req_tag = req_tag;
++      pak->last_read = 0;
++      pak->flags = flags;
++
++      /* The timer hands the packet itself to processing_timeout(). */
++      init_timer(&pak->processing_timer);
++      pak->processing_timer.function = processing_timeout;
++      pak->processing_timer.data = (unsigned long)pak;
++
++      return pak;
++}
++
++/* Rewind the read position so the packet can be read from the start again. */
++static inline void packet_reset(struct packet *pak)
++{
++      pak->last_read = 0;
++}
++
++/*
++ * Release a packet: drop the interface reference taken by packet_alloc()
++ * (if any) and free the payload buffer and the packet itself.
++ * The processing timer must no longer be pending; this is a BUG otherwise.
++ */
++static void packet_free(struct packet *pak)
++{
++      BUG_ON(timer_pending(&pak->processing_timer));
++
++      if (pak->tpmif)
++              tpmif_put(pak->tpmif);
++
++      kfree(pak->data_buffer);
++      kfree(pak);
++}
++
++
++/*
++ * Write response data for 'pak' to the shared memory and send it to the
++ * front end, unless the packet is flagged to discard its response; in
++ * that case the data is only acknowledged.
++ *
++ * Returns 'size' for a discarded response, otherwise the result of
++ * _packet_write().
++ */
++static int packet_write(struct packet *pak,
++                      const char *data, size_t size, int isuserbuffer)
++{
++      /* Don't send a response to this packet. Just acknowledge it. */
++      if (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)
++              return size;
++
++      return _packet_write(pak, data, size, isuserbuffer);
++}
++
++/*
++ * Write 'size' bytes of 'data' into the front end's granted pages, one
++ * ring slot per page, and notify the front end.
++ *
++ * @pak:          packet whose interface receives the data
++ * @data:         source buffer (user or kernel pointer)
++ * @size:         number of bytes to send
++ * @isuserbuffer: non-zero if 'data' is a user-space pointer
++ *
++ * Returns the number of bytes written, 'size' if the interface is
++ * disconnected, 0 if a slot has no buffer or cannot be mapped, -EFAULT if
++ * the packet has no interface or copying from 'data' failed.
++ */
++static int _packet_write(struct packet *pak,
++                       const char *data, size_t size, int isuserbuffer)
++{
++      tpmif_t *tpmif = pak->tpmif;
++      unsigned int i = 0;
++      unsigned int offset = 0;
++
++      if (tpmif == NULL)
++              return -EFAULT;
++
++      /* Nobody is listening any more; pretend everything was sent. */
++      if (tpmif->status == DISCONNECTED)
++              return size;
++
++      while (offset < size && i < TPMIF_TX_RING_SIZE) {
++              unsigned int tocopy;
++              int copy_failed;
++              struct gnttab_map_grant_ref map_op;
++              struct gnttab_unmap_grant_ref unmap_op;
++              tpmif_tx_request_t *tx = &tpmif->tx->ring[i].req;
++
++              if (0 == tx->addr) {
++                      DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i);
++                      return 0;
++              }
++
++              gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
++                                GNTMAP_host_map, tx->ref, tpmif->domid);
++
++              gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &map_op);
++
++              if (map_op.status != GNTST_okay) {
++                      DPRINTK(" Grant table operation failure !\n");
++                      return 0;
++              }
++
++              tocopy = min_t(size_t, size - offset, PAGE_SIZE);
++
++              copy_failed = copy_from_buffer(
++                      (void *)(idx_to_kaddr(tpmif, i) |
++                               (tx->addr & ~PAGE_MASK)),
++                      &data[offset], tocopy, isuserbuffer);
++              if (!copy_failed)
++                      tx->size = tocopy;
++
++              /*
++               * Drop the grant mapping on success and on failure alike,
++               * so that a faulting source buffer does not leave the
++               * page mapped.
++               */
++              gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
++                                  GNTMAP_host_map, map_op.handle);
++
++              if (unlikely
++                  (HYPERVISOR_grant_table_op
++                   (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
++                      BUG();
++              }
++
++              /*
++               * No tpmif_put() here: this function never took a
++               * reference.  The packet's reference is dropped by
++               * packet_free(), so an extra put would unbalance the count.
++               */
++              if (copy_failed)
++                      return -EFAULT;
++
++              offset += tocopy;
++              i++;
++      }
++
++      DPRINTK("Notifying frontend via irq %d\n", tpmif->irq);
++      notify_remote_via_irq(tpmif->irq);
++
++      return offset;
++}
++
++/*
++ * Copy packet contents into 'buffer', continuing where the previous read
++ * stopped (pak->last_read), and advance pak->last_read accordingly.
++ *
++ * The data stream starts with the 4-byte TPM instance number in network
++ * byte order; at most 'numbytes' bytes of that prefix are copied.  The
++ * payload follows: it is taken from pak->data_buffer if one is attached
++ * and from the shared memory pages otherwise, limited by what is left of
++ * 'buffersize' after the prefix.
++ *
++ * Returns the number of bytes placed in 'buffer' or a negative errno.
++ */
++static int packet_read(struct packet *pak, size_t numbytes,
++                     char *buffer, size_t buffersize, int isuserbuffer)
++{
++      u32 done = 0;
++      u32 space = buffersize;
++      u32 chunk;
++      u32 payload_pos;
++
++      if (pak->last_read < 4) {
++              /* (Rest of) the instance number goes out first. */
++              u32 prefix = htonl(pak->tpm_instance);
++              const u8 *prefix_bytes = (const u8 *)&prefix;
++              u32 prefix_done = pak->last_read;
++
++              chunk = min_t(size_t, 4 - prefix_done, numbytes);
++
++              if (copy_to_buffer(buffer, prefix_bytes + prefix_done,
++                                 chunk, isuserbuffer))
++                      return -EFAULT;
++
++              pak->last_read += chunk;
++              done += chunk;
++              space -= chunk;
++      }
++
++      if (space == 0)
++              return done;
++
++      /* Without an attached buffer the payload lives in shared memory. */
++      if (!pak->data_buffer)
++              return packet_read_shmem(pak, pak->tpmif, done, buffer,
++                                       isuserbuffer, space);
++
++      chunk = min_t(u32, pak->data_len - done, space);
++      payload_pos = pak->last_read - 4;
++
++      if (copy_to_buffer(&buffer[done], &pak->data_buffer[payload_pos],
++                         chunk, isuserbuffer))
++              return -EFAULT;
++
++      pak->last_read += chunk;
++      done += chunk;
++
++      return done;
++}
++
++/*
++ * Copy payload bytes from the front end's granted pages into 'buffer'.
++ *
++ * @pak:          packet being read; pak->last_read (minus the 4-byte
++ *                instance prefix) selects the page and the offset within it
++ * @tpmif:        interface whose ring describes the pages
++ * @offset:       position in 'buffer' at which to start storing
++ * @buffer:       destination (user or kernel pointer)
++ * @isuserbuffer: non-zero if 'buffer' is a user-space pointer
++ * @room_left:    maximum number of bytes to copy
++ *
++ * Returns the new position in 'buffer' (i.e. 'offset' plus the bytes
++ * copied) or -EFAULT if a page could not be mapped or the copy failed.
++ */
++static int packet_read_shmem(struct packet *pak,
++                           tpmif_t * tpmif,
++                           u32 offset, char *buffer, int isuserbuffer,
++                           u32 room_left)
++{
++      u32 last_read = pak->last_read - 4;
++      u32 i = (last_read / PAGE_SIZE);
++      u32 pg_offset = last_read & (PAGE_SIZE - 1);
++      u32 to_copy;
++
++      /*
++       * Start copying data at the page with index 'i'
++       * and within that page at offset 'pg_offset'.
++       * Copy a maximum of 'room_left' bytes.
++       *
++       * The loop also stops at the end of the ring, as _packet_write()
++       * does, so a large 'room_left' cannot index past the last slot.
++       */
++      to_copy = min_t(u32, PAGE_SIZE - pg_offset, room_left);
++      while (to_copy > 0 && i < TPMIF_TX_RING_SIZE) {
++              void *src;
++              int copy_failed;
++              struct gnttab_map_grant_ref map_op;
++              struct gnttab_unmap_grant_ref unmap_op;
++              tpmif_tx_request_t *tx = &tpmif->tx->ring[i].req;
++
++              gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
++                                GNTMAP_host_map, tx->ref, tpmif->domid);
++
++              gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &map_op);
++
++              if (map_op.status != GNTST_okay) {
++                      DPRINTK(" Grant table operation failure !\n");
++                      return -EFAULT;
++              }
++
++              /* Never hand out more than the slot says it holds. */
++              to_copy = min_t(u32, tx->size, to_copy);
++
++              DPRINTK("Copying from mapped memory at %08lx\n",
++                      (unsigned long)(idx_to_kaddr(tpmif, i) |
++                                      (tx->addr & ~PAGE_MASK)));
++
++              src = (void *)(idx_to_kaddr(tpmif, i) |
++                             ((tx->addr & ~PAGE_MASK) + pg_offset));
++              copy_failed = copy_to_buffer(&buffer[offset],
++                                           src, to_copy, isuserbuffer);
++
++              if (!copy_failed) {
++                      DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n",
++                              tpmif->domid, buffer[offset], buffer[offset + 1],
++                              buffer[offset + 2], buffer[offset + 3]);
++              }
++
++              /*
++               * Unmap before looking at the copy result so that a
++               * faulting destination does not leave the grant mapped.
++               */
++              gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
++                                  GNTMAP_host_map, map_op.handle);
++
++              if (unlikely
++                  (HYPERVISOR_grant_table_op
++                   (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
++                      BUG();
++              }
++
++              if (copy_failed)
++                      return -EFAULT;
++
++              offset += to_copy;
++              pg_offset = 0;
++              last_read += to_copy;
++              room_left -= to_copy;
++
++              to_copy = min_t(u32, PAGE_SIZE, room_left);
++              i++;
++      }
++
++      /* Account for the instance prefix again in the stored position. */
++      pak->last_read = last_read + 4;
++      return offset;
++}
++
++/* ============================================================
++ * The file layer for reading data from this device
++ * ============================================================
++ */
++/*
++ * open() handler.  Only one opener is allowed at a time: the first open
++ * claims the device, any further open fails with -EPERM until the device
++ * has been released.
++ */
++static int vtpm_op_open(struct inode *inode, struct file *f)
++{
++      unsigned long flags;
++      int rc = -EPERM;
++
++      write_lock_irqsave(&dataex.pak_lock, flags);
++      if (!dataex.has_opener) {
++              dataex.has_opener = 1;
++              rc = 0;
++      }
++      write_unlock_irqrestore(&dataex.pak_lock, flags);
++
++      return rc;
++}
++
++/*
++ * read() handler: hand (part of) the first pending packet to user space.
++ *
++ * Returns the number of bytes copied, -EIO (once) after pending packets
++ * were aborted, -ENODATA if no packet is pending after waiting, or the
++ * negative error returned by packet_read().
++ */
++static ssize_t vtpm_op_read(struct file *file,
++                          char __user * data, size_t size, loff_t * offset)
++{
++      int ret_size = -ENODATA;
++      struct packet *pak = NULL;
++      unsigned long flags;
++
++      write_lock_irqsave(&dataex.pak_lock, flags);
++      /* Report an abort exactly once and restart from a clean state. */
++      if (dataex.aborted) {
++              dataex.aborted = 0;
++              dataex.copied_so_far = 0;
++              write_unlock_irqrestore(&dataex.pak_lock, flags);
++              return -EIO;
++      }
++
++      if (list_empty(&dataex.pending_pak)) {
++              /*
++               * Sleep without the lock held.
++               * NOTE(review): the return value of
++               * wait_event_interruptible() is not checked; if the wait
++               * ends with the list still empty the function returns
++               * -ENODATA.
++               */
++              write_unlock_irqrestore(&dataex.pak_lock, flags);
++              wait_event_interruptible(dataex.wait_queue,
++                                       !list_empty(&dataex.pending_pak));
++              write_lock_irqsave(&dataex.pak_lock, flags);
++              dataex.copied_so_far = 0;
++      }
++
++      if (!list_empty(&dataex.pending_pak)) {
++              unsigned int left;
++
++              /*
++               * Take the first packet off the list while its data is
++               * copied, because the copy runs without the lock held.
++               */
++              pak = list_entry(dataex.pending_pak.next, struct packet, next);
++              left = pak->data_len - dataex.copied_so_far;
++              list_del(&pak->next);
++              write_unlock_irqrestore(&dataex.pak_lock, flags);
++
++              DPRINTK("size given by app: %zu, available: %u\n", size, left);
++
++              ret_size = min_t(size_t, size, left);
++
++              ret_size = packet_read(pak, ret_size, data, size, 1);
++
++              write_lock_irqsave(&dataex.pak_lock, flags);
++
++              /*
++               * NOTE(review): del_singleshot_timer_sync() is called below
++               * with pak_lock held, and processing_timeout() takes the
++               * same lock - confirm this cannot deadlock.
++               */
++              if (ret_size < 0) {
++                      /* The copy failed: drop the packet. */
++                      del_singleshot_timer_sync(&pak->processing_timer);
++                      packet_free(pak);
++                      dataex.copied_so_far = 0;
++              } else {
++                      DPRINTK("Copied %d bytes to user buffer\n", ret_size);
++
++                      dataex.copied_so_far += ret_size;
++                      /* "+ 4" accounts for the instance number prefix. */
++                      if (dataex.copied_so_far >= pak->data_len + 4) {
++                              DPRINTK("All data from this packet given to app.\n");
++                              /* All data given to app */
++
++                              del_singleshot_timer_sync(&pak->
++                                                        processing_timer);
++                              list_add_tail(&pak->next, &dataex.current_pak);
++                              /*
++                               * The more frontends that are handled at the same time,
++                               * the more time we give the TPM to process the request.
++                               */
++                              mod_timer(&pak->processing_timer,
++                                        jiffies + (num_frontends * 60 * HZ));
++                              dataex.copied_so_far = 0;
++                      } else {
++                              /* Partially read: put it back at the head. */
++                              list_add(&pak->next, &dataex.pending_pak);
++                      }
++              }
++      }
++      write_unlock_irqrestore(&dataex.pak_lock, flags);
++
++      DPRINTK("Returning result from read to app: %d\n", ret_size);
++
++      return ret_size;
++}
++
++/*
++ * write() handler - only works after a previous read operation!
++ *
++ * The buffer must start with a struct vtpm_resp_hdr (at least 14 bytes:
++ * 4 for the instance number, 2 for the tag, 4 for paramSize and 4 for the
++ * ordinal), and its length field plus the 4-byte instance number must
++ * equal 'size'.  The response is matched by instance number against the
++ * packets that were completely read before and is forwarded to the front
++ * end without the instance number.
++ *
++ * Returns the number of bytes consumed (including the instance number),
++ * the non-positive result of packet_write(), or -EFAULT for a short or
++ * malformed buffer or when no packet is waiting for this instance.
++ */
++static ssize_t vtpm_op_write(struct file *file,
++                           const char __user * data, size_t size,
++                           loff_t * offset)
++{
++      const unsigned int prefix_len = 4;
++      struct vtpm_resp_hdr hdr;
++      struct packet *pak;
++      unsigned long flags;
++      int rc;
++
++      if (size < sizeof(hdr))
++              return -EFAULT;
++
++      if (copy_from_user(&hdr, data, sizeof(hdr)))
++              return -EFAULT;
++
++      /* malformed packet? */
++      if ((prefix_len + ntohl(hdr.len_no)) != size)
++              return -EFAULT;
++
++      write_lock_irqsave(&dataex.pak_lock, flags);
++      pak = packet_find_instance(&dataex.current_pak,
++                                 ntohl(hdr.instance_no));
++      if (!pak) {
++              write_unlock_irqrestore(&dataex.pak_lock, flags);
++              DPRINTK(KERN_ALERT "No associated packet! (inst=%d)\n",
++                      ntohl(hdr.instance_no));
++              return -EFAULT;
++      }
++
++      /* The request is being answered: stop its timer and unqueue it. */
++      del_singleshot_timer_sync(&pak->processing_timer);
++      list_del(&pak->next);
++      write_unlock_irqrestore(&dataex.pak_lock, flags);
++
++      /* The instance number is not part of the response; skip it. */
++      rc = packet_write(pak, data + prefix_len, size - prefix_len, 1);
++      if (rc > 0)
++              rc += prefix_len;
++
++      packet_free(pak);
++      return rc;
++}
++
++/*
++ * release() handler: release all queued packets of all interfaces
++ * (sending fail messages where possible) and allow the device to be
++ * opened again.
++ */
++static int vtpm_op_release(struct inode *inode, struct file *file)
++{
++      unsigned long flags;
++
++      /* NULL selects the packets of every interface. */
++      vtpm_release_packets(NULL, 1);
++      write_lock_irqsave(&dataex.pak_lock, flags);
++      dataex.has_opener = 0;
++      write_unlock_irqrestore(&dataex.pak_lock, flags);
++      return 0;
++}
++
++/*
++ * poll() handler: the device is always reported writable, and readable
++ * whenever at least one packet is pending.
++ */
++static unsigned int vtpm_op_poll(struct file *file,
++                               struct poll_table_struct *pts)
++{
++      unsigned int mask = POLLOUT | POLLWRNORM;
++
++      poll_wait(file, &dataex.wait_queue, pts);
++
++      if (list_empty(&dataex.pending_pak))
++              return mask;
++
++      return mask | POLLIN | POLLRDNORM;
++}
++
++/* File operations of the vtpm misc device; seeking is not supported. */
++static const struct file_operations vtpm_ops = {
++      .owner = THIS_MODULE,
++      .llseek = no_llseek,
++      .open = vtpm_op_open,
++      .read = vtpm_op_read,
++      .write = vtpm_op_write,
++      .release = vtpm_op_release,
++      .poll = vtpm_op_poll,
++};
++
++/* Misc device "vtpm" (fixed minor 225) through which packets are exchanged. */
++static struct miscdevice vtpms_miscdevice = {
++      .minor = 225,
++      .name = "vtpm",
++      .fops = &vtpm_ops,
++};
++
++/***************************************************************
++ Utility functions
++***************************************************************/
++
++/*
++ * Send a 10-byte TPM_FAIL response for 'pak' to the front end.
++ *
++ * The response tag is derived from the request tag: all response tags
++ * are '+3' to the request tag.
++ *
++ * Returns the result of packet_write().
++ */
++static int tpm_send_fail_message(struct packet *pak, u8 req_tag)
++{
++      unsigned char reply[] = {
++              0x00, (unsigned char)(req_tag + 3),     /* response tag */
++              0x00, 0x00, 0x00, 0x0a,                 /* total length: 10 */
++              0x00, 0x00, 0x00, 0x09                  /* TPM_FAIL */
++      };
++
++      /* Write the data to shared memory and notify the front-end. */
++      return packet_write(pak, reply, sizeof(reply), 0);
++}
++
++/*
++ * Remove and free the packets on 'head' that belong to 'tpmif', or all
++ * packets if 'tpmif' is NULL.  If 'send_msgs' is set, a fail message is
++ * sent for every released packet whose interface is still CONNECTED.
++ *
++ * Returns 1 if the first packet on the list was among those released,
++ * 0 otherwise.
++ */
++static int _vtpm_release_packets(struct list_head *head,
++                               tpmif_t * tpmif, int send_msgs)
++{
++      struct packet *pak, *next_pak;
++      int position = 0;
++      int aborted = 0;
++
++      list_for_each_entry_safe(pak, next_pak, head, next) {
++              position++;
++
++              if (tpmif != NULL && pak->tpmif != tpmif)
++                      continue;
++
++              del_singleshot_timer_sync(&pak->processing_timer);
++              list_del(&pak->next);
++
++              if (send_msgs &&
++                  pak->tpmif && pak->tpmif->status == CONNECTED)
++                      tpm_send_fail_message(pak, pak->req_tag);
++
++              packet_free(pak);
++
++              /* Remember that the packet at the head of the list is gone. */
++              if (position == 1)
++                      aborted = 1;
++      }
++
++      return aborted;
++}
++
++/*
++ * Release the pending and the current packets of 'tpmif' (of all
++ * interfaces if 'tpmif' is NULL), optionally sending fail messages.
++ * dataex.aborted records whether the first pending packet was released.
++ * Always returns 0.
++ */
++int vtpm_release_packets(tpmif_t * tpmif, int send_msgs)
++{
++      unsigned long flags;
++      int head_released;
++
++      write_lock_irqsave(&dataex.pak_lock, flags);
++
++      head_released = _vtpm_release_packets(&dataex.pending_pak,
++                                            tpmif, send_msgs);
++      dataex.aborted = head_released;
++      _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs);
++
++      write_unlock_irqrestore(&dataex.pak_lock, flags);
++
++      return 0;
++}
++
++/*
++ * Append 'pak' to the list of pending packets, start its processing
++ * timer and wake up readers.
++ * Returns 0 on success or -EFAULT if the device is not open.
++ */
++static int vtpm_queue_packet(struct packet *pak)
++{
++      unsigned long flags;
++
++      if (!dataex.has_opener)
++              return -EFAULT;
++
++      write_lock_irqsave(&dataex.pak_lock, flags);
++      list_add_tail(&pak->next, &dataex.pending_pak);
++      /* give the TPM some time to pick up the request */
++      mod_timer(&pak->processing_timer, jiffies + (30 * HZ));
++      write_unlock_irqrestore(&dataex.pak_lock, flags);
++
++      wake_up_interruptible(&dataex.wait_queue);
++
++      return 0;
++}
++
++/*
++ * Accept a request of 'size' bytes from the front end behind 'tpmif',
++ * validate it and queue it for the application.
++ *
++ * On any failure after the packet was allocated a TPM fail message is
++ * sent back to the front end and the packet is freed.
++ *
++ * Returns 0 if the packet was queued, -ENOMEM if no packet could be
++ * allocated, the error from packet_read() or vtpm_queue_packet(), or
++ * -EINVAL if the request is short, inconsistent or cannot be handled.
++ */
++static int vtpm_receive(tpmif_t * tpmif, u32 size)
++{
++      int rc;
++      int nread;
++      /* Zeroed so the failure path never reads indeterminate bytes. */
++      unsigned char buffer[10] = { 0 };
++      __be32 *native_size;
++      struct packet *pak = packet_alloc(tpmif, size, 0, 0);
++
++      if (!pak)
++              return -ENOMEM;
++      /*
++       * Read 10 bytes to test the request for validity: the 4-byte
++       * instance number that packet_read() prepends, followed by the
++       * first 6 bytes of the request (tag and size indicator).
++       */
++      nread = packet_read(pak, sizeof (buffer), buffer, sizeof (buffer), 0);
++      if (nread != (int)sizeof (buffer)) {
++              /* Report the failure instead of returning 0. */
++              rc = (nread < 0) ? nread : -EINVAL;
++              goto failexit;
++      }
++      /*
++       * Remember the request tag so that fail messages sent later for
++       * this packet (timeout, release) carry the matching response tag.
++       */
++      pak->req_tag = buffer[4 + 1];
++      /*
++       * Reset the packet read pointer so we can read all its
++       * contents again.
++       */
++      packet_reset(pak);
++
++      native_size = (__force __be32 *) (&buffer[4 + 2]);
++      /*
++       * Verify that the size of the packet is correct
++       * as indicated and that there's actually someone reading packets.
++       * The minimum size of the packet is '10' for tag, size indicator
++       * and ordinal.
++       */
++      if (size < 10 ||
++          be32_to_cpu(*native_size) != size ||
++          0 == dataex.has_opener || tpmif->status != CONNECTED) {
++              rc = -EINVAL;
++              goto failexit;
++      }
++
++      rc = vtpm_queue_packet(pak);
++      if (rc < 0)
++              goto failexit;
++
++      return 0;
++
++      failexit:
++      tpm_send_fail_message(pak, buffer[4 + 1]);
++      packet_free(pak);
++      return rc;
++}
++
++/*
++ * Timer callback, invoked when a packet has not been processed within
++ * its timeout period.
++ *
++ * If the packet is still queued on the pending or the current list, a
++ * fail message is sent to the front end (unless responses are already
++ * being discarded) and the packet is flagged so that any later response
++ * is discarded.  A packet that is on neither list is left untouched;
++ * once a packet is taken off a list, its timer must be stopped as well.
++ */
++static void processing_timeout(unsigned long ptr)
++{
++      struct packet *pak = (struct packet *)ptr;
++      unsigned long flags;
++      int queued;
++
++      write_lock_irqsave(&dataex.pak_lock, flags);
++
++      /* The packet only counts if it is still on one of the lists. */
++      queued = pak == packet_find_packet(&dataex.pending_pak, pak) ||
++               pak == packet_find_packet(&dataex.current_pak, pak);
++      if (queued) {
++              if (!(pak->flags & PACKET_FLAG_DISCARD_RESPONSE))
++                      tpm_send_fail_message(pak, pak->req_tag);
++
++              /* discard future responses */
++              pak->flags |= PACKET_FLAG_DISCARD_RESPONSE;
++      }
++
++      write_unlock_irqrestore(&dataex.pak_lock, flags);
++}
++
++static void tpm_tx_action(unsigned long unused);
++static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
++
++static struct list_head tpm_schedule_list;
++static spinlock_t tpm_schedule_list_lock;
++
++/* Schedule the tasklet that runs tpm_tx_action(), after a full memory barrier. */
++static inline void maybe_schedule_tx_action(void)
++{
++      smp_mb();
++      tasklet_schedule(&tpm_tx_tasklet);
++}
++
++/*
++ * Non-zero if 'tpmif' is on the schedule list.  A NULL list.next marks
++ * "not queued": remove_from_tpm_schedule_list() resets it to NULL after
++ * unlinking (presumably it also starts out NULL - TODO confirm).
++ */
++static inline int __on_tpm_schedule_list(tpmif_t * tpmif)
++{
++      return tpmif->list.next != NULL;
++}
++
++/*
++ * Take 'tpmif' off the schedule list, if it is queued, and drop the
++ * reference that was taken when it was added.
++ */
++static void remove_from_tpm_schedule_list(tpmif_t * tpmif)
++{
++      spin_lock_irq(&tpm_schedule_list_lock);
++      if (likely(__on_tpm_schedule_list(tpmif))) {
++              list_del(&tpmif->list);
++              /* mark as "not queued" for __on_tpm_schedule_list() */
++              tpmif->list.next = NULL;
++              tpmif_put(tpmif);
++      }
++      spin_unlock_irq(&tpm_schedule_list_lock);
++}
++
++/*
++ * Queue 'tpmif' at the tail of the schedule list, taking a reference on
++ * it, unless it is already queued or not active.
++ *
++ * This is called from the interrupt handler tpmif_be_int() as well as
++ * from tpmif_schedule_work(), so the interrupt state is saved and
++ * restored rather than unconditionally re-enabled on unlock.
++ */
++static void add_to_tpm_schedule_list_tail(tpmif_t * tpmif)
++{
++      unsigned long flags;
++
++      /* Cheap unlocked check; repeated below with the lock held. */
++      if (__on_tpm_schedule_list(tpmif))
++              return;
++
++      spin_lock_irqsave(&tpm_schedule_list_lock, flags);
++      if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
++              list_add_tail(&tpmif->list, &tpm_schedule_list);
++              tpmif_get(tpmif);
++      }
++      spin_unlock_irqrestore(&tpm_schedule_list_lock, flags);
++}
++
++/* Queue 'tpmif' for processing and kick the tx tasklet. */
++void tpmif_schedule_work(tpmif_t * tpmif)
++{
++      add_to_tpm_schedule_list_tail(tpmif);
++      maybe_schedule_tx_action();
++}
++
++/* Remove 'tpmif' from the list of interfaces with work to do. */
++void tpmif_deschedule_work(tpmif_t * tpmif)
++{
++      remove_from_tpm_schedule_list(tpmif);
++}
++
++/*
++ * Tasklet body: for every interface on the schedule list, take it off
++ * the list and pass the request announced in ring slot 0 to
++ * vtpm_receive().
++ */
++static void tpm_tx_action(unsigned long unused)
++{
++      DPRINTK("%s: Getting data from front-end(s)!\n", __FUNCTION__);
++
++      while (!list_empty(&tpm_schedule_list)) {
++              /* Get a tpmif from the list with work to do. */
++              tpmif_t *tpmif = list_entry(tpm_schedule_list.next,
++                                          tpmif_t, list);
++
++              /* Hold our own reference; unlinking drops the list's. */
++              tpmif_get(tpmif);
++              remove_from_tpm_schedule_list(tpmif);
++
++              /* pass it up */
++              vtpm_receive(tpmif, tpmif->tx->ring[0].req.size);
++
++              tpmif_put(tpmif);
++      }
++}
++
++/*
++ * Interrupt handler for the front end's event channel: queue the
++ * interface (passed as 'dev_id') for processing and kick the tx tasklet.
++ */
++irqreturn_t tpmif_be_int(int irq, void *dev_id)
++{
++      tpmif_t *tpmif = dev_id;
++
++      /* Same two steps as the original: enqueue, then schedule. */
++      tpmif_schedule_work(tpmif);
++
++      return IRQ_HANDLED;
++}
++
++/*
++ * Module initialisation: set up the driver state, register the vtpm misc
++ * device, create the interface cache and register with xenbus.
++ * Returns 0 on success or the error of the step that failed; everything
++ * registered up to that point is undone again.
++ */
++static int __init tpmback_init(void)
++{
++      int rc;
++
++      /*
++       * The data exchange state and the schedule list must be ready
++       * before the misc device is registered: from then on user space
++       * can open it and the file operations take dataex.pak_lock.
++       */
++      dataex_init(&dataex);
++
++      spin_lock_init(&tpm_schedule_list_lock);
++      INIT_LIST_HEAD(&tpm_schedule_list);
++
++      rc = misc_register(&vtpms_miscdevice);
++      if (rc != 0) {
++              pr_alert("Could not register misc device for TPM BE\n");
++              return rc;
++      }
++
++      rc = tpmif_interface_init();
++      if (!rc) {
++              rc = tpmif_xenbus_init();
++              if (rc)
++                      tpmif_interface_exit();
++      }
++      if (rc) {
++              misc_deregister(&vtpms_miscdevice);
++              return rc;
++      }
++
++      pr_alert("Successfully initialized TPM backend driver\n");
++
++      return 0;
++}
++module_init(tpmback_init);
++
++/*
++ * Module exit: release all queued packets without sending fail messages,
++ * then unregister from xenbus, destroy the interface cache and remove
++ * the misc device.
++ */
++static void __exit tpmback_exit(void)
++{
++      vtpm_release_packets(NULL, 0);
++      tpmif_xenbus_exit();
++      tpmif_interface_exit();
++      misc_deregister(&vtpms_miscdevice);
++}
++module_exit(tpmback_exit)
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/tpmback/xenbus.c

index 0000000,0000000..4533f0f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/tpmback/xenbus.c
@@@ -1,0 -1,0 +1,273 @@@
++/*  Xenbus code for tpmif backend
++    Copyright (C) 2005 IBM Corporation
++    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
++
++    This program is free software; you can redistribute it and/or modify
++    it under the terms of the GNU General Public License as published by
++    the Free Software Foundation; either version 2 of the License, or
++    (at your option) any later version.
++
++    This program is distributed in the hope that it will be useful,
++    but WITHOUT ANY WARRANTY; without even the implied warranty of
++    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++    GNU General Public License for more details.
++
++    You should have received a copy of the GNU General Public License
++    along with this program; if not, write to the Free Software
++    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++*/
++#include <stdarg.h>
++#include <linux/module.h>
++#include <xen/xenbus.h>
++#include "common.h"
++
++static void maybe_connect(struct backend_info *be);
++static void connect(struct backend_info *be);
++static int connect_ring(struct backend_info *be);
++static void backend_changed(struct xenbus_watch *watch,
++                          const char **vec, unsigned int len);
++static void frontend_changed(struct xenbus_device *dev,
++                           enum xenbus_state frontend_state);
++
++/*
++ * Return the instance number stored in @bi, or -1 when @bi is NULL or no
++ * instance has been recorded yet.
++ */
++long int tpmback_get_instance(struct backend_info *bi)
++{
++      if (!bi || !bi->is_instance_set)
++              return -1;
++
++      return bi->instance;
++}
++
++/*
++ * Tear down the backend state attached to @dev: drop the "instance" watch,
++ * detach and release the vTPM interface, free the backend_info and clear
++ * the driver data pointer.  A device without backend state is a no-op.
++ *
++ * Always returns 0.
++ */
++static int tpmback_remove(struct xenbus_device *dev)
++{
++      struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++      if (be == NULL)
++              return 0;
++
++      if (be->backend_watch.node != NULL) {
++              unregister_xenbus_watch(&be->backend_watch);
++              kfree(be->backend_watch.node);
++              be->backend_watch.node = NULL;
++      }
++
++      if (be->tpmif != NULL) {
++              /* Break the back-pointer before the interface is released. */
++              be->tpmif->bi = NULL;
++              vtpm_release_packets(be->tpmif, 0);
++              tpmif_put(be->tpmif);
++              be->tpmif = NULL;
++      }
++
++      kfree(be);
++      dev_set_drvdata(&dev->dev, NULL);
++
++      return 0;
++}
++
++/*
++ * Probe callback: allocate the per-device backend_info, attach it to @dev,
++ * install a watch on the device's "instance" node and move the device to
++ * XenbusStateInitWait.
++ *
++ * Returns 0 on success.  On failure the partially built state is released
++ * through tpmback_remove() and the error code is returned.
++ */
++static int tpmback_probe(struct xenbus_device *dev,
++                       const struct xenbus_device_id *id)
++{
++      struct backend_info *be;
++      int err;
++
++      be = kzalloc(sizeof(struct backend_info), GFP_KERNEL);
++      if (be == NULL) {
++              xenbus_dev_fatal(dev, -ENOMEM,
++                               "allocating backend structure");
++              return -ENOMEM;
++      }
++
++      be->is_instance_set = 0;
++      be->dev = dev;
++      dev_set_drvdata(&dev->dev, be);
++
++      err = xenbus_watch_path2(dev, dev->nodename, "instance",
++                               &be->backend_watch, backend_changed);
++      if (!err)
++              err = xenbus_switch_state(dev, XenbusStateInitWait);
++      if (!err)
++              return 0;
++
++      tpmback_remove(dev);
++      return err;
++}
++
++
++/*
++ * Watch callback for the backend's "instance" node.  Reads the instance
++ * number and stores it in the backend_info the first time it is seen;
++ * later changes are ignored.  A node that does not exist yet is silently
++ * skipped, any other read failure is reported as fatal.
++ */
++static void backend_changed(struct xenbus_watch *watch,
++                          const char **vec, unsigned int len)
++{
++      struct backend_info *be =
++              container_of(watch, struct backend_info, backend_watch);
++      struct xenbus_device *dev = be->dev;
++      long instance;
++      int rc;
++
++      rc = xenbus_scanf(XBT_NIL, dev->nodename,
++                        "instance","%li", &instance);
++      if (XENBUS_EXIST_ERR(rc))
++              return;
++
++      if (rc != 1) {
++              xenbus_dev_fatal(dev, rc, "reading instance");
++              return;
++      }
++
++      if (!be->is_instance_set) {
++              be->instance = instance;
++              be->is_instance_set = 1;
++      }
++}
++
++
++/*
++ * xenbus callback invoked when the frontend changes state.
++ *
++ *  - Initialising / Initialised: nothing to do.
++ *  - Connected: read the ring details and map them (connect_ring()), then
++ *    finish the connection if it is not already established.
++ *  - Closing: invalidate the instance number and follow into Closing.
++ *  - Unknown / Closed: switch to Closed, unregister the device and release
++ *    the backend state.
++ *  - anything else is reported as a fatal error.
++ */
++static void frontend_changed(struct xenbus_device *dev,
++                           enum xenbus_state frontend_state)
++{
++      struct backend_info *be = dev_get_drvdata(&dev->dev);
++      int err;
++
++      switch (frontend_state) {
++      case XenbusStateInitialising:
++      case XenbusStateInitialised:
++              break;
++
++      case XenbusStateConnected:
++              err = connect_ring(be);
++              if (err) {
++                      /* connect_ring() has already reported the error. */
++                      return;
++              }
++              maybe_connect(be);
++              break;
++
++      case XenbusStateClosing:
++              be->instance = -1;
++              xenbus_switch_state(dev, XenbusStateClosing);
++              break;
++
++      case XenbusStateUnknown: /* keep it here */
++      case XenbusStateClosed:
++              xenbus_switch_state(dev, XenbusStateClosed);
++              /*
++               * NOTE(review): if device_unregister() already runs the
++               * driver's remove callback, the tpmback_remove() below
++               * operates on a device whose state was just released (it
++               * tolerates NULL drvdata, but @dev itself may be gone) --
++               * confirm the intended lifetime here.
++               */
++              device_unregister(&be->dev->dev);
++              tpmback_remove(dev);
++              break;
++
++      default:
++              xenbus_dev_fatal(dev, -EINVAL,
++                               "saw state %d at frontend",
++                               frontend_state);
++              break;
++      }
++}
++
++
++
++/*
++ * Finish the connection only when an interface exists and it is not
++ * already marked CONNECTED.
++ */
++static void maybe_connect(struct backend_info *be)
++{
++      if (be->tpmif != NULL && be->tpmif->status != CONNECTED)
++              connect(be);
++}
++
++
++/*
++ * Announce the backend as ready and move the device to Connected.
++ *
++ * Writes "ready" = 1 under the backend's xenstore node inside a
++ * transaction (restarted when the commit returns -EAGAIN), then switches
++ * the device to XenbusStateConnected and, if that succeeds, marks the
++ * interface CONNECTED.
++ *
++ * Any xenstore failure is reported with xenbus_dev_fatal() and the
++ * function returns without switching state.  This includes a failed
++ * commit: the device must not be advertised as Connected right after a
++ * fatal error has been raised for it.
++ */
++static void connect(struct backend_info *be)
++{
++      struct xenbus_transaction xbt;
++      struct xenbus_device *dev = be->dev;
++      unsigned long ready = 1;
++      int err;
++
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "starting transaction");
++              return;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename,
++                          "ready", "%lu", ready);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "writing 'ready'");
++              goto abort;
++      }
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err == -EAGAIN)
++              goto again;
++      if (err) {
++              /* The write was not committed: do not claim to be connected. */
++              xenbus_dev_fatal(dev, err, "end of transaction");
++              return;
++      }
++
++      err = xenbus_switch_state(dev, XenbusStateConnected);
++      if (!err)
++              be->tpmif->status = CONNECTED;
++      return;
++
++abort:
++      /* Discard the transaction; the error has already been reported. */
++      xenbus_transaction_end(xbt, 1);
++}
++
++
++/*
++ * Read "ring-ref" and "event-channel" from the frontend's xenstore
++ * directory, create the vTPM interface for this backend if there is none
++ * yet, and map the shared ring and event channel into it.
++ *
++ * Returns 0 on success or a negative error code; every failure is also
++ * reported on the xenbus device.
++ */
++static int connect_ring(struct backend_info *be)
++{
++      struct xenbus_device *dev = be->dev;
++      unsigned long ring_ref;
++      unsigned int evtchn;
++      int err;
++
++      err = xenbus_gather(XBT_NIL, dev->otherend,
++                          "ring-ref", "%lu", &ring_ref,
++                          "event-channel", "%u", &evtchn, NULL);
++      if (err) {
++              xenbus_dev_error(dev, err,
++                               "reading %s/ring-ref and event-channel",
++                               dev->otherend);
++              return err;
++      }
++
++      if (be->tpmif == NULL) {
++              be->tpmif = tpmif_find(dev->otherend_id, be);
++              if (IS_ERR(be->tpmif)) {
++                      /* Never leave an error pointer behind in be->tpmif. */
++                      err = PTR_ERR(be->tpmif);
++                      be->tpmif = NULL;
++                      xenbus_dev_fatal(dev,err,"creating vtpm interface");
++                      return err;
++              }
++      }
++
++      if (be->tpmif == NULL)
++              return 0;
++
++      err = tpmif_map(be->tpmif, ring_ref, evtchn);
++      if (err) {
++              xenbus_dev_error(dev, err,
++                               "mapping shared-frame %lu port %u",
++                               ring_ref, evtchn);
++              return err;
++      }
++
++      return 0;
++}
++
++
++/* Device types handled by this driver; the empty entry ends the table. */
++static const struct xenbus_device_id tpmback_ids[] = {
++      { "vtpm" },
++      { "" }
++};
++
++
++/*
++ * xenbus driver description for the vTPM backend: probe/remove manage the
++ * per-device backend_info, frontend_changed() follows the state of the
++ * other end.
++ */
++static struct xenbus_driver tpmback = {
++      .name = "vtpm",
++      .ids = tpmback_ids,
++      .probe = tpmback_probe,
++      .remove = tpmback_remove,
++      .otherend_changed = frontend_changed,
++};
++
++
++/*
++ * Register the vTPM backend driver with xenbus.  Returns the result of
++ * xenbus_register_backend().
++ */
++int tpmif_xenbus_init(void)
++{
++      return xenbus_register_backend(&tpmback);
++}
++
++/* Unregister the vTPM backend driver from xenbus. */
++void tpmif_xenbus_exit(void)
++{
++      xenbus_unregister_driver(&tpmback);
++}
diff --cc drivers/xen/usbback/Makefile

index 0000000,0000000..a7548cb

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbback/Makefile
@@@ -1,0 -1,0 +1,4 @@@
++obj-$(CONFIG_XEN_USB_BACKEND) := usbbk.o
++
++usbbk-y   := usbstub.o xenbus.o interface.o usbback.o
++
diff --cc drivers/xen/usbback/interface.c

index 0000000,0000000..bd22277

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbback/interface.c
@@@ -1,0 -1,0 +1,190 @@@
++/*
++ * interface.c
++ *
++ * Xen USB backend interface management.
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#include <linux/vmalloc.h>
++#include "usbback.h"
++#include <xen/evtchn.h>
++
++static LIST_HEAD(usbif_list);
++static DEFINE_SPINLOCK(usbif_list_lock);
++
++/*
++ * Look up the interface registered for (@domid, @handle) on the global
++ * interface list.
++ *
++ * Returns the matching usbif_t, or NULL if there is none.  The list lock is
++ * only held during the search; no reference is taken on the result.
++ */
++usbif_t *find_usbif(domid_t domid, unsigned int handle)
++{
++      usbif_t *pos;
++      usbif_t *match = NULL;
++      unsigned long flags;
++
++      spin_lock_irqsave(&usbif_list_lock, flags);
++      list_for_each_entry(pos, &usbif_list, usbif_list) {
++              if (pos->domid != domid || pos->handle != handle)
++                      continue;
++
++              match = pos;
++              break;
++      }
++      spin_unlock_irqrestore(&usbif_list_lock, flags);
++
++      return match;
++}
++
++/*
++ * Allocate and initialise an interface for (@domid, @handle) and add it to
++ * the global interface list.  The reference count starts at zero.
++ *
++ * Returns the new interface, or NULL if the allocation fails.
++ */
++usbif_t *usbif_alloc(domid_t domid, unsigned int handle)
++{
++      usbif_t *usbif = kzalloc(sizeof(usbif_t), GFP_KERNEL);
++      unsigned long flags;
++      int slot;
++
++      if (usbif == NULL)
++              return NULL;
++
++      usbif->domid = domid;
++      usbif->handle = handle;
++      atomic_set(&usbif->refcnt, 0);
++
++      /* Locks. */
++      spin_lock_init(&usbif->urb_ring_lock);
++      spin_lock_init(&usbif->conn_ring_lock);
++      spin_lock_init(&usbif->stub_lock);
++      spin_lock_init(&usbif->addr_lock);
++
++      /* Wait queues. */
++      init_waitqueue_head(&usbif->wq);
++      init_waitqueue_head(&usbif->waiting_to_free);
++
++      /* No stubs attached and no device addresses assigned yet. */
++      INIT_LIST_HEAD(&usbif->stub_list);
++      for (slot = 0; slot < USB_DEV_ADDR_SIZE; slot++)
++              usbif->addr_table[slot] = NULL;
++
++      spin_lock_irqsave(&usbif_list_lock, flags);
++      list_add(&usbif->usbif_list, &usbif_list);
++      spin_unlock_irqrestore(&usbif_list_lock, flags);
++
++      return usbif;
++}
++
++/*
++ * Map the frontend's URB ring and connection ring into the backend and
++ * bind the inter-domain event channel to usbbk_be_int().
++ *
++ * @urb_ring_ref / @conn_ring_ref: grant references of the two shared rings.
++ * @evtchn: event channel port offered by the frontend.
++ *
++ * Returns 0 on success (or immediately if an IRQ is already bound), or a
++ * negative error code after unmapping whatever had been mapped so far.
++ */
++int usbif_map(usbif_t *usbif, grant_ref_t urb_ring_ref,
++            grant_ref_t conn_ring_ref, evtchn_port_t evtchn)
++{
++      int err = -ENOMEM;
++      struct vm_struct *area;
++      usbif_urb_sring_t *urb_sring;
++      usbif_conn_sring_t *conn_sring;
++
++      /* A non-zero irq means the interface is already mapped. */
++      if (usbif->irq)
++              return 0;
++
++      /*
++       * The temporary keeps error pointers out of the usbif fields: they
++       * are only assigned once the mapping succeeded.
++       */
++      area = xenbus_map_ring_valloc(usbif->xbdev, urb_ring_ref);
++      if (IS_ERR(area))
++              return PTR_ERR(area);
++      usbif->urb_ring_area = area;
++      area = xenbus_map_ring_valloc(usbif->xbdev, conn_ring_ref);
++      if (IS_ERR(area)) {
++              err = PTR_ERR(area);
++              goto fail_alloc;
++      }
++      usbif->conn_ring_area = area;
++
++      /* On success the return value is the bound IRQ number. */
++      err = bind_interdomain_evtchn_to_irqhandler(
++                      usbif->domid, evtchn, usbbk_be_int, 0,
++                      "usbif-backend", usbif);
++      if (err < 0)
++              goto fail_evtchn;
++      usbif->irq = err;
++
++      urb_sring = (usbif_urb_sring_t *) usbif->urb_ring_area->addr;
++      BACK_RING_INIT(&usbif->urb_ring, urb_sring, PAGE_SIZE);
++
++      conn_sring = (usbif_conn_sring_t *) usbif->conn_ring_area->addr;
++      BACK_RING_INIT(&usbif->conn_ring, conn_sring, PAGE_SIZE);
++
++      return 0;
++
++      /* Unwind in reverse order of mapping. */
++fail_evtchn:
++      xenbus_unmap_ring_vfree(usbif->xbdev, usbif->conn_ring_area);
++fail_alloc:
++      xenbus_unmap_ring_vfree(usbif->xbdev, usbif->urb_ring_area);
++
++      return err;
++}
++
++/*
++ * Disconnect an interface from its frontend: stop the worker thread,
++ * unlink outstanding URBs and detach every stub, wait until the reference
++ * count has dropped to zero, then unbind the IRQ and unmap both rings.
++ */
++void usbif_disconnect(usbif_t *usbif)
++{
++      struct usbstub *stub, *tmp;
++      unsigned long flags;
++
++      if (usbif->xenusbd) {
++              kthread_stop(usbif->xenusbd);
++              usbif->xenusbd = NULL;
++      }
++
++      /* _safe iteration: the stub is detached from the list as we go. */
++      spin_lock_irqsave(&usbif->stub_lock, flags);
++      list_for_each_entry_safe(stub, tmp, &usbif->stub_list, dev_list) {
++              usbbk_unlink_urbs(stub);
++              detach_device_without_lock(usbif, stub);
++      }
++      spin_unlock_irqrestore(&usbif->stub_lock, flags);
++
++      /* Sleep until the last reference to the interface has been dropped. */
++      wait_event(usbif->waiting_to_free, atomic_read(&usbif->refcnt) == 0);
++
++      if (usbif->irq) {
++              unbind_from_irqhandler(usbif->irq, usbif);
++              usbif->irq = 0;
++      }
++
++      /* A non-NULL sring is used as the "rings are mapped" indicator. */
++      if (usbif->urb_ring.sring) {
++              xenbus_unmap_ring_vfree(usbif->xbdev, usbif->urb_ring_area);
++              xenbus_unmap_ring_vfree(usbif->xbdev, usbif->conn_ring_area);
++              usbif->urb_ring.sring = NULL;
++              usbif->conn_ring.sring = NULL;
++      }
++}
++
++/*
++ * Remove the interface from the global interface list and free it.
++ * NOTE(review): nothing here disconnects the interface -- presumably the
++ * caller runs usbif_disconnect() first; confirm.
++ */
++void usbif_free(usbif_t *usbif)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&usbif_list_lock, flags);
++      list_del(&usbif->usbif_list);
++      spin_unlock_irqrestore(&usbif_list_lock, flags);
++      kfree(usbif);
++}
diff --cc drivers/xen/usbback/usbback.c

index 0000000,0000000..b5eb834

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbback/usbback.c
@@@ -1,0 -1,0 +1,1197 @@@
++/*
++ * usbback.c
++ *
++ * Xen USB backend driver
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#include <linux/mm.h>
++#include <xen/balloon.h>
++#include <xen/evtchn.h>
++#include <xen/gnttab.h>
++#include "usbback.h"
++
++#if 0
++#include "../../usb/core/hub.h"
++#endif
++
++int usbif_reqs = USBIF_BACK_MAX_PENDING_REQS;
++module_param_named(reqs, usbif_reqs, int, 0);
++MODULE_PARM_DESC(reqs, "Number of usbback requests to allocate");
++
++struct pending_req_segment {
++      uint16_t offset;
++      uint16_t length;
++};
++
++/*
++ * Backend bookkeeping for one in-flight URB request.  Entries live in the
++ * pending_reqs array and are handed out and returned through the
++ * pending_free list (alloc_req() / free_req()).
++ */
++typedef struct {
++      usbif_t *usbif; /* interface the request arrived on */
++
++      uint16_t id; /* request id, echoed back in the response */
++
++      struct usbstub *stub;
++      struct list_head urb_list; /* entry in stub->submitting_list */
++
++      /* urb and the buffers allocated for it by usbbk_alloc_urb() */
++      struct urb *urb;
++      void *buffer;
++      dma_addr_t transfer_dma;
++      struct usb_ctrlrequest *setup; /* only allocated for control pipes */
++
++      /* request segments */
++      uint16_t nr_buffer_segs; /* number of urb->transfer_buffer segments */
++      uint16_t nr_extra_segs; /* number of iso_frame_desc segments (ISO) */
++      /* per-segment offset/length; allocated in usbbk_gnttab_map() */
++      struct pending_req_segment *seg;
++
++      struct list_head free_list; /* entry in pending_free */
++} pending_req_t;
++
++static pending_req_t *pending_reqs;
++static struct list_head pending_free;
++static DEFINE_SPINLOCK(pending_free_lock);
++static LIST_HEAD(pending_urb_free);
++static DEFINE_SPINLOCK(urb_free_lock);
++static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
++
++#define USBBACK_INVALID_HANDLE (~0)
++
++static struct page **pending_pages;
++static grant_handle_t *pending_grant_handles;
++
++/*
++ * Index into pending_pages / pending_grant_handles for segment @seg of
++ * @req: each request owns USBIF_MAX_SEGMENTS_PER_REQUEST consecutive slots.
++ */
++static inline int vaddr_pagenr(pending_req_t *req, int seg)
++{
++      ptrdiff_t slot = req - pending_reqs;
++
++      return slot * USBIF_MAX_SEGMENTS_PER_REQUEST + seg;
++}
++
++/* Kernel virtual address of the page backing segment @seg of @req. */
++static inline unsigned long vaddr(pending_req_t *req, int seg)
++{
++      struct page *page = pending_pages[vaddr_pagenr(req, seg)];
++      unsigned long pfn = page_to_pfn(page);
++
++      return (unsigned long)pfn_to_kaddr(pfn);
++}
++
++#define pending_handle(_req, _seg) \
++      (pending_grant_handles[vaddr_pagenr(_req, _seg)])
++
++/*
++ * Take one request off the free list.  Returns NULL when the list is
++ * empty.
++ */
++static pending_req_t *alloc_req(void)
++{
++      pending_req_t *req;
++      unsigned long flags;
++
++      spin_lock_irqsave(&pending_free_lock, flags);
++      if (list_empty(&pending_free)) {
++              req = NULL;
++      } else {
++              req = list_first_entry(&pending_free, pending_req_t,
++                                     free_list);
++              list_del(&req->free_list);
++      }
++      spin_unlock_irqrestore(&pending_free_lock, flags);
++
++      return req;
++}
++
++/*
++ * Return @req to the free list.  Waiters on pending_free_wq are woken only
++ * when the list was empty before, i.e. when this is the first request to
++ * become available again.
++ */
++static void free_req(pending_req_t *req)
++{
++      unsigned long flags;
++      int first_free;
++
++      spin_lock_irqsave(&pending_free_lock, flags);
++      first_free = list_empty(&pending_free);
++      list_add(&req->free_list, &pending_free);
++      spin_unlock_irqrestore(&pending_free_lock, flags);
++
++      if (!first_free)
++              return;
++
++      wake_up(&pending_free_wq);
++}
++
++/* Append @pending_req to @stub's submitting list, under the list's lock. */
++static inline void add_req_to_submitting_list(struct usbstub *stub, pending_req_t *pending_req)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&stub->submitting_lock, flags);
++      list_add_tail(&pending_req->urb_list, &stub->submitting_list);
++      spin_unlock_irqrestore(&stub->submitting_lock, flags);
++}
++
++/*
++ * Take @pending_req off @stub's submitting list, under the list's lock.
++ * list_del_init() leaves the entry as an empty list head afterwards.
++ */
++static inline void remove_req_from_submitting_list(struct usbstub *stub, pending_req_t *pending_req)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&stub->submitting_lock, flags);
++      list_del_init(&pending_req->urb_list);
++      spin_unlock_irqrestore(&stub->submitting_lock, flags);
++}
++
++/*
++ * Ask the USB core to unlink every URB currently on @stub's submitting
++ * list.  Requests are not removed from the list here; that happens in the
++ * completion handler (usbbk_urb_complete()).
++ *
++ * NOTE(review): usb_unlink_urb() is called with submitting_lock held, and
++ * the completion handler takes the same lock -- confirm the completion can
++ * never run synchronously from within usb_unlink_urb().
++ */
++void usbbk_unlink_urbs(struct usbstub *stub)
++{
++      pending_req_t *req, *tmp;
++      unsigned long flags;
++
++      spin_lock_irqsave(&stub->submitting_lock, flags);
++      list_for_each_entry_safe(req, tmp, &stub->submitting_list, urb_list) {
++              usb_unlink_urb(req->urb);
++      }
++      spin_unlock_irqrestore(&stub->submitting_lock, flags);
++}
++
++/*
++ * Unmap every grant that is still mapped for @pending_req and release its
++ * segment table.
++ *
++ * Segments whose handle is USBBACK_INVALID_HANDLE are skipped; every handle
++ * that is unmapped is reset to USBBACK_INVALID_HANDLE so that a second call
++ * does not unmap it again.  A request without segments is a no-op.
++ *
++ * pending_req->seg is cleared after it has been freed: requests are pooled
++ * and reused, so a stale pointer would otherwise survive into the next
++ * user of this slot.
++ */
++static void fast_flush_area(pending_req_t *pending_req)
++{
++      struct gnttab_unmap_grant_ref unmap[USBIF_MAX_SEGMENTS_PER_REQUEST];
++      unsigned int i, nr_segs, invcount = 0;
++      grant_handle_t handle;
++      int ret;
++
++      nr_segs = pending_req->nr_buffer_segs + pending_req->nr_extra_segs;
++      if (!nr_segs)
++              return;
++
++      for (i = 0; i < nr_segs; i++) {
++              handle = pending_handle(pending_req, i);
++              if (handle == USBBACK_INVALID_HANDLE)
++                      continue;
++              gnttab_set_unmap_op(&unmap[invcount], vaddr(pending_req, i),
++                                  GNTMAP_host_map, handle);
++              pending_handle(pending_req, i) = USBBACK_INVALID_HANDLE;
++              invcount++;
++      }
++
++      ret = HYPERVISOR_grant_table_op(
++              GNTTABOP_unmap_grant_ref, unmap, invcount);
++      BUG_ON(ret);
++
++      kfree(pending_req->seg);
++      pending_req->seg = NULL;
++}
++
++/*
++ * Scatter the contiguous buffer @buff into the mapped pages of
++ * @pending_req, segments [@start, @start + @nr_pages).  Each segment
++ * receives seg[i].length bytes at seg[i].offset within its page.
++ */
++static void copy_buff_to_pages(void *buff, pending_req_t *pending_req,
++              int start, int nr_pages)
++{
++      const int end = start + nr_pages;
++      void *src = buff;
++      int i;
++
++      for (i = start; i < end; i++) {
++              struct pending_req_segment *seg = &pending_req->seg[i];
++              void *dst = (void *) vaddr(pending_req, i) + seg->offset;
++
++              memcpy(dst, src, seg->length);
++              src += seg->length;
++      }
++}
++
++/*
++ * Gather segments [@start, @start + @nr_pages) of @pending_req from their
++ * mapped pages into the contiguous buffer @buff.  Each segment contributes
++ * seg[i].length bytes taken from seg[i].offset within its page.
++ */
++static void copy_pages_to_buff(void *buff, pending_req_t *pending_req,
++              int start, int nr_pages)
++{
++      const int end = start + nr_pages;
++      void *dst = buff;
++      int i;
++
++      for (i = start; i < end; i++) {
++              struct pending_req_segment *seg = &pending_req->seg[i];
++              void *src = (void *) vaddr(pending_req, i) + seg->offset;
++
++              memcpy(dst, src, seg->length);
++              dst += seg->length;
++      }
++}
++
++/*
++ * Allocate what is needed to submit @req: the URB itself (with room for
++ * the ISO packet descriptors on isochronous pipes), a coherent transfer
++ * buffer when the request carries data, and a setup packet for control
++ * pipes.  The results are stored in @pending_req.
++ *
++ * Returns 0 on success or -ENOMEM; on failure everything allocated by this
++ * call has been freed again.
++ */
++static int usbbk_alloc_urb(usbif_urb_request_t *req, pending_req_t *pending_req)
++{
++      int nr_packets = 0;
++
++      if (usb_pipeisoc(req->pipe))
++              nr_packets = req->u.isoc.number_of_packets;
++
++      pending_req->urb = usb_alloc_urb(nr_packets, GFP_KERNEL);
++      if (pending_req->urb == NULL) {
++              pr_err("usbback: can't alloc urb\n");
++              return -ENOMEM;
++      }
++
++      if (req->buffer_length) {
++              pending_req->buffer = usb_alloc_coherent(pending_req->stub->udev,
++                              req->buffer_length, GFP_KERNEL,
++                              &pending_req->transfer_dma);
++              if (pending_req->buffer == NULL) {
++                      pr_err("usbback: can't alloc urb buffer\n");
++                      goto out_free_urb;
++              }
++      }
++
++      if (usb_pipecontrol(req->pipe)) {
++              pending_req->setup = kmalloc(sizeof(struct usb_ctrlrequest),
++                                           GFP_KERNEL);
++              if (pending_req->setup == NULL) {
++                      pr_err("usbback: can't alloc usb_ctrlrequest\n");
++                      goto out_free_buffer;
++              }
++      }
++
++      return 0;
++
++out_free_buffer:
++      /* A buffer only exists when the request carried data. */
++      if (req->buffer_length)
++              usb_free_coherent(pending_req->stub->udev,
++                                req->buffer_length,
++                                pending_req->buffer,
++                                pending_req->transfer_dma);
++out_free_urb:
++      usb_free_urb(pending_req->urb);
++      return -ENOMEM;
++}
++
++/*
++ * Queue @urb on the pending_urb_free list instead of freeing it here; the
++ * actual release is done later by usbbk_free_urbs().
++ */
++static void usbbk_free_urb(struct urb *urb)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&urb_free_lock, flags);
++      list_add(&urb->urb_list, &pending_urb_free);
++      spin_unlock_irqrestore(&urb_free_lock, flags);
++}
++
++/*
++ * Release @urb together with the resources attached to it: the setup
++ * packet of a control URB and the coherent transfer buffer, if any.
++ */
++static void _usbbk_free_urb(struct urb *urb)
++{
++      if (usb_pipecontrol(urb->pipe))
++              kfree(urb->setup_packet);
++      if (urb->transfer_buffer_length)
++              usb_free_coherent(urb->dev, urb->transfer_buffer_length,
++                                urb->transfer_buffer, urb->transfer_dma);
++      /* Compiler barrier: keep the frees above ahead of usb_free_urb(). */
++      barrier();
++      usb_free_urb(urb);
++}
++
++/*
++ * Free every URB that usbbk_free_urb() has queued for release.  The queue
++ * is moved to a private list under the lock so that the URBs themselves
++ * are freed without holding it.
++ */
++static void usbbk_free_urbs(void)
++{
++      struct urb *urb, *next;
++      unsigned long flags;
++      LIST_HEAD(doomed);
++
++      /* Unlocked peek: nothing queued, nothing to do. */
++      if (list_empty(&pending_urb_free))
++              return;
++
++      spin_lock_irqsave(&urb_free_lock, flags);
++      list_splice_init(&pending_urb_free, &doomed);
++      spin_unlock_irqrestore(&urb_free_lock, flags);
++
++      list_for_each_entry_safe(urb, next, &doomed, urb_list) {
++              list_del(&urb->urb_list);
++              _usbbk_free_urb(urb);
++      }
++}
++
++/* Flag that @usbif has requests waiting and wake whoever sleeps on its wq. */
++static void usbbk_notify_work(usbif_t *usbif)
++{
++      usbif->waiting_reqs = 1;
++      wake_up(&usbif->wq);
++}
++
++/*
++ * Interrupt handler bound to the frontend's event channel (see
++ * usbif_map()); @dev_id is the usbif_t.  Only signals that work is
++ * pending.
++ */
++irqreturn_t usbbk_be_int(int irq, void *dev_id)
++{
++      usbbk_notify_work(dev_id);
++      return IRQ_HANDLED;
++}
++
++/*
++ * Put a response for @pending_req on the URB ring and notify the frontend
++ * if the ring macros say a notification is needed.
++ *
++ * @status, @actual_length, @error_count and @start_frame are copied into
++ * the response as given; the id is taken from @pending_req.
++ */
++static void usbbk_do_response(pending_req_t *pending_req, int32_t status,
++                                      int32_t actual_length, int32_t error_count, uint16_t start_frame)
++{
++      usbif_t *usbif = pending_req->usbif;
++      usbif_urb_response_t *res;
++      unsigned long flags;
++      int notify;
++
++      /* urb_ring_lock serialises producers of responses on this ring. */
++      spin_lock_irqsave(&usbif->urb_ring_lock, flags);
++      res = RING_GET_RESPONSE(&usbif->urb_ring, usbif->urb_ring.rsp_prod_pvt);
++      res->id = pending_req->id;
++      res->status = status;
++      res->actual_length = actual_length;
++      res->error_count = error_count;
++      res->start_frame = start_frame;
++      usbif->urb_ring.rsp_prod_pvt++;
++      /* Compiler barrier: fill in the response before it is pushed. */
++      barrier();
++      RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&usbif->urb_ring, notify);
++      spin_unlock_irqrestore(&usbif->urb_ring_lock, flags);
++
++      if (notify)
++              notify_remote_via_irq(usbif->irq);
++}
++
++/*
++ * URB completion handler; urb->context is the pending_req_t.
++ *
++ * Copies the results back into the granted pages, unmaps the grants, sends
++ * the response to the frontend and then releases the request: it is taken
++ * off the stub's submitting list, the URB is queued for freeing, the
++ * interface reference is dropped and the request goes back to the free
++ * pool.
++ */
++static void usbbk_urb_complete(struct urb *urb)
++{
++      pending_req_t *pending_req = (pending_req_t *)urb->context;
++
++      /* IN transfers: hand the received data to the frontend's buffers. */
++      if (usb_pipein(urb->pipe) && urb->status == 0 && urb->actual_length > 0)
++              copy_buff_to_pages(pending_req->buffer, pending_req,
++                                      0, pending_req->nr_buffer_segs);
++
++      /* ISO transfers: the frame descriptors go into the extra segments. */
++      if (usb_pipeisoc(urb->pipe))
++              copy_buff_to_pages(&urb->iso_frame_desc[0], pending_req,
++                                      pending_req->nr_buffer_segs, pending_req->nr_extra_segs);
++
++      barrier();
++
++      /* The pages must be unmapped only after the copies above. */
++      fast_flush_area(pending_req);
++
++      usbbk_do_response(pending_req, urb->status, urb->actual_length,
++                                      urb->error_count, urb->start_frame);
++
++      remove_req_from_submitting_list(pending_req->stub, pending_req);
++
++      barrier();
++      usbbk_free_urb(urb);
++      usbif_put(pending_req->usbif);
++      free_req(pending_req);
++}
++
++/*
++ * Map the grant references of @req into the pages reserved for
++ * @pending_req and record offset/length of every segment.
++ *
++ * The segment counts are taken from @pending_req only.  Every loop bound
++ * and the size of the hypercall batch are derived from the same two
++ * values, so each of the nr_segs entries of map[] is initialised exactly
++ * once before it is handed to the hypervisor.  (The extra-segment loop
++ * used to start at req->nr_buffer_segs; a differing value there left
++ * map[] entries uninitialised or set up with the wrong flags.)
++ *
++ * Transfer-buffer segments are mapped read-only for OUT pipes; the extra
++ * (ISO frame descriptor) segments are always mapped writable.
++ *
++ * Returns 0 on success (also when the request has no segments), -EINVAL
++ * when the segment count exceeds USBIF_MAX_SEGMENTS_PER_REQUEST, and
++ * -ENOMEM when the segment table cannot be allocated.  If a grant cannot
++ * be mapped or a segment does not fit inside its page, everything mapped
++ * so far is released again and -ENOMEM is returned as well.
++ */
++static int usbbk_gnttab_map(usbif_t *usbif,
++                      usbif_urb_request_t *req, pending_req_t *pending_req)
++{
++      int i, ret;
++      unsigned int nr_segs;
++      uint32_t flags;
++      struct gnttab_map_grant_ref map[USBIF_MAX_SEGMENTS_PER_REQUEST];
++
++      nr_segs = pending_req->nr_buffer_segs + pending_req->nr_extra_segs;
++
++      if (nr_segs > USBIF_MAX_SEGMENTS_PER_REQUEST) {
++              pr_err("Bad number of segments in request\n");
++              return -EINVAL;
++      }
++
++      if (!nr_segs)
++              return 0;
++
++      pending_req->seg = kmalloc(sizeof(struct pending_req_segment)
++                      * nr_segs, GFP_KERNEL);
++      if (!pending_req->seg)
++              return -ENOMEM;
++
++      /* Transfer buffer: the backend only reads it for OUT transfers. */
++      flags = GNTMAP_host_map;
++      if (usb_pipeout(req->pipe))
++              flags |= GNTMAP_readonly;
++      for (i = 0; i < pending_req->nr_buffer_segs; i++)
++              gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
++                                req->seg[i].gref, usbif->domid);
++
++      /* Extra segments are written back on completion: always writable. */
++      flags = GNTMAP_host_map;
++      for (i = pending_req->nr_buffer_segs; i < nr_segs; i++)
++              gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
++                                req->seg[i].gref, usbif->domid);
++
++      ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++                              map, nr_segs);
++      BUG_ON(ret);
++
++      for (i = 0; i < nr_segs; i++) {
++              /* Make sure than none of the map ops failed with GNTST_eagain */
++              if (unlikely(map[i].status == GNTST_eagain))
++                      gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
++
++              if (unlikely(map[i].status != GNTST_okay)) {
++                      pr_err("usbback: invalid buffer -- could not remap it\n");
++                      map[i].handle = USBBACK_INVALID_HANDLE;
++                      ret |= 1;
++              }
++
++              /*
++               * Record the handle even after a failure so that
++               * fast_flush_area() knows which segments to unmap.
++               */
++              pending_handle(pending_req, i) = map[i].handle;
++
++              if (ret)
++                      continue;
++
++              set_phys_to_machine(__pa(vaddr(
++                      pending_req, i)) >> PAGE_SHIFT,
++                      FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
++
++              /* Copy first, then validate the private copy. */
++              pending_req->seg[i].offset = req->seg[i].offset;
++              pending_req->seg[i].length = req->seg[i].length;
++
++              barrier();
++
++              if (pending_req->seg[i].offset >= PAGE_SIZE ||
++                              pending_req->seg[i].length > PAGE_SIZE ||
++                              pending_req->seg[i].offset + pending_req->seg[i].length > PAGE_SIZE)
++                      ret |= 1;
++      }
++
++      if (ret)
++              goto fail_flush;
++
++      return 0;
++
++fail_flush:
++      fast_flush_area(pending_req);
++      return -ENOMEM;
++}
++
++/*
++ * Populate pending_req->urb from the frontend request.
++ *
++ * The pipe is rebuilt for the backend-side usb_device from the transfer
++ * type, direction and endpoint number carried in req->pipe.  Control
++ * transfers always use endpoint 0 and take their setup packet from
++ * req->u.ctrl.  When the request carries a data buffer, the urb is pointed
++ * at pending_req->transfer_dma and flagged URB_NO_TRANSFER_DMA_MAP.
++ *
++ * An unrecognised pipe type leaves the type-specific fields untouched.
++ */
++static void usbbk_init_urb(usbif_urb_request_t *req, pending_req_t *pending_req)
++{
++      struct urb *urb = pending_req->urb;
++      struct usb_device *udev = pending_req->stub->udev;
++      unsigned int epnum = usb_pipeendpoint(req->pipe);
++      int dir_in = usb_pipein(req->pipe);
++      unsigned int pipe;
++
++      switch (usb_pipetype(req->pipe)) {
++      case PIPE_ISOCHRONOUS:
++              pipe = dir_in ? usb_rcvisocpipe(udev, epnum)
++                            : usb_sndisocpipe(udev, epnum);
++
++              /* No usb_fill_*() helper is used here: set each field by hand. */
++              urb->dev = udev;
++              urb->pipe = pipe;
++              urb->transfer_flags = req->transfer_flags | URB_ISO_ASAP;
++              urb->transfer_buffer = pending_req->buffer;
++              urb->transfer_buffer_length = req->buffer_length;
++              urb->complete = usbbk_urb_complete;
++              urb->context = pending_req;
++              urb->interval = req->u.isoc.interval;
++              urb->start_frame = req->u.isoc.start_frame;
++              urb->number_of_packets = req->u.isoc.number_of_packets;
++              break;
++
++      case PIPE_INTERRUPT:
++              pipe = dir_in ? usb_rcvintpipe(udev, epnum)
++                            : usb_sndintpipe(udev, epnum);
++
++              usb_fill_int_urb(urb, udev, pipe,
++                               pending_req->buffer, req->buffer_length,
++                               usbbk_urb_complete, pending_req,
++                               req->u.intr.interval);
++              /*
++               * High speed interrupt endpoints encode the interval
++               * logarithmically, and usb_fill_int_urb() applies that
++               * encoding.  The frontend has already encoded
++               * req->u.intr.interval, so the helper leaves a doubly
++               * encoded value behind; put the frontend's value back.
++               */
++              urb->interval = req->u.intr.interval;
++              urb->transfer_flags = req->transfer_flags;
++              break;
++
++      case PIPE_CONTROL:
++              pipe = dir_in ? usb_rcvctrlpipe(udev, 0)
++                            : usb_sndctrlpipe(udev, 0);
++
++              usb_fill_control_urb(urb, udev, pipe,
++                                   (unsigned char *) pending_req->setup,
++                                   pending_req->buffer, req->buffer_length,
++                                   usbbk_urb_complete, pending_req);
++              /* The 8-byte setup packet travels inside the request itself. */
++              memcpy(pending_req->setup, req->u.ctrl, 8);
++              urb->transfer_flags = req->transfer_flags;
++              break;
++
++      case PIPE_BULK:
++              pipe = dir_in ? usb_rcvbulkpipe(udev, epnum)
++                            : usb_sndbulkpipe(udev, epnum);
++
++              usb_fill_bulk_urb(urb, udev, pipe,
++                                pending_req->buffer, req->buffer_length,
++                                usbbk_urb_complete, pending_req);
++              urb->transfer_flags = req->transfer_flags;
++              break;
++
++      default:
++              break;
++      }
++
++      if (!req->buffer_length)
++              return;
++
++      /* The buffer already has a DMA address; tell the core not to map it. */
++      urb->transfer_dma = pending_req->transfer_dma;
++      urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
++}
++
++/*
++ * Deferred SET_INTERFACE request: allocated by usbbk_set_interface() and
++ * consumed (and freed) by usbbk_set_interface_work().
++ */
++struct set_interface_request {
++      pending_req_t *pending_req;     /* request to answer once done */
++      int interface;                  /* passed to usb_set_interface() */
++      int alternate;                  /* passed to usb_set_interface() */
++      struct work_struct work;        /* queued with schedule_work() */
++};
++
++/*
++ * Work handler for a deferred SET_INTERFACE request.
++ *
++ * Calls usb_set_interface() with the device lock held, drops the device
++ * reference taken by usbbk_set_interface(), sends the result back to the
++ * frontend and releases the pending request and the work item.
++ */
++static void usbbk_set_interface_work(struct work_struct *arg)
++{
++      struct set_interface_request *sreq;
++      pending_req_t *pending_req;
++      struct usb_device *udev;
++      int status;
++
++      sreq = container_of(arg, struct set_interface_request, work);
++      pending_req = sreq->pending_req;
++      udev = pending_req->stub->udev;
++
++      usb_lock_device(udev);
++      status = usb_set_interface(udev, sreq->interface, sreq->alternate);
++      usb_unlock_device(udev);
++
++      /* Balances the usb_get_dev() done when the work was queued. */
++      usb_put_dev(udev);
++
++      usbbk_do_response(pending_req, status, 0, 0, 0);
++      usbif_put(pending_req->usbif);
++      free_req(pending_req);
++      kfree(sreq);
++}
++
++/*
++ * Queue a SET_INTERFACE request for execution from a work item.
++ *
++ * Returns 0 once the work has been scheduled, or -ENOMEM if the work item
++ * could not be allocated (nothing is queued in that case).  A reference on
++ * the usb_device is taken here and dropped by the work handler.
++ */
++static int usbbk_set_interface(pending_req_t *pending_req, int interface, int alternate)
++{
++      struct usb_device *udev = pending_req->stub->udev;
++      struct set_interface_request *sreq = kmalloc(sizeof(*sreq), GFP_KERNEL);
++
++      if (!sreq)
++              return -ENOMEM;
++
++      sreq->pending_req = pending_req;
++      sreq->interface = interface;
++      sreq->alternate = alternate;
++      INIT_WORK(&sreq->work, usbbk_set_interface_work);
++
++      usb_get_dev(udev);
++      schedule_work(&sreq->work);
++
++      return 0;
++}
++
++/*
++ * Deferred clear-halt request: allocated by usbbk_clear_halt() and
++ * consumed (and freed) by usbbk_clear_halt_work().
++ */
++struct clear_halt_request {
++      pending_req_t *pending_req;     /* request to answer once done */
++      int pipe;                       /* passed to usb_clear_halt() */
++      struct work_struct work;        /* queued with schedule_work() */
++};
++
++/*
++ * Work handler for a deferred clear-halt request.
++ *
++ * Calls usb_clear_halt() with the device lock held, drops the device
++ * reference taken by usbbk_clear_halt(), sends the result back to the
++ * frontend and releases the pending request and the work item.
++ */
++static void usbbk_clear_halt_work(struct work_struct *arg)
++{
++      struct clear_halt_request *creq;
++      pending_req_t *pending_req;
++      struct usb_device *udev;
++      int status;
++
++      creq = container_of(arg, struct clear_halt_request, work);
++      pending_req = creq->pending_req;
++      udev = pending_req->stub->udev;
++
++      usb_lock_device(udev);
++      status = usb_clear_halt(udev, creq->pipe);
++      usb_unlock_device(udev);
++
++      /* Balances the usb_get_dev() done when the work was queued. */
++      usb_put_dev(udev);
++
++      usbbk_do_response(pending_req, status, 0, 0, 0);
++      usbif_put(pending_req->usbif);
++      free_req(pending_req);
++      kfree(creq);
++}
++
++/*
++ * Queue a clear-halt operation on @pipe for execution from a work item.
++ *
++ * Returns 0 once the work has been scheduled, or -ENOMEM if the work item
++ * could not be allocated (nothing is queued in that case).  A reference on
++ * the usb_device is taken here and dropped by the work handler.
++ */
++static int usbbk_clear_halt(pending_req_t *pending_req, int pipe)
++{
++      struct usb_device *udev = pending_req->stub->udev;
++      struct clear_halt_request *creq = kmalloc(sizeof(*creq), GFP_KERNEL);
++
++      if (!creq)
++              return -ENOMEM;
++
++      creq->pending_req = pending_req;
++      creq->pipe = pipe;
++      INIT_WORK(&creq->work, usbbk_clear_halt_work);
++
++      usb_get_dev(udev);
++      schedule_work(&creq->work);
++
++      return 0;
++}
++
++/*
++ * Port reset support.  Compiled out: the only caller, the
++ * USB_REQ_SET_FEATURE case in check_and_submit_special_ctrlreq(), is
++ * itself disabled and marked "not tested yet".
++ */
++#if 0
++struct port_reset_request {
++      pending_req_t *pending_req;
++      struct work_struct work;
++};
++
++/* Work handler: reset the device, then answer and release the request. */
++static void usbbk_port_reset_work(struct work_struct *arg)
++{
++      struct port_reset_request *req
++              = container_of(arg, struct port_reset_request, work);
++      pending_req_t *pending_req = req->pending_req;
++      struct usb_device *udev = pending_req->stub->udev;
++      int ret, ret_lock;
++
++      ret = ret_lock = usb_lock_device_for_reset(udev, NULL);
++      if (ret_lock >= 0) {
++              ret = usb_reset_device(udev);
++              if (ret_lock)
++                      usb_unlock_device(udev);
++      }
++      usb_put_dev(udev);
++
++      usbbk_do_response(pending_req, ret, 0, 0, 0);
++      usbif_put(pending_req->usbif);
++      free_req(pending_req);
++      kfree(req);
++}
++
++/* Queue a device reset; returns 0 on success or -ENOMEM. */
++static int usbbk_port_reset(pending_req_t *pending_req)
++{
++      struct port_reset_request *req;
++      struct usb_device *udev = pending_req->stub->udev;
++
++      req = kmalloc(sizeof(*req), GFP_KERNEL);
++      if (!req)
++              return -ENOMEM;
++
++      req->pending_req = pending_req;
++      INIT_WORK(&req->work, usbbk_port_reset_work);
++
++      usb_get_dev(udev);
++      schedule_work(&req->work);
++      return 0;
++}
++#endif
++
++/*
++ * Move @stub from slot @cur_addr to slot @new_addr of usbif->addr_table.
++ *
++ * An address of 0 means "no slot": cur_addr == 0 skips the removal and
++ * new_addr == 0 skips the insertion, so the function also serves to add a
++ * freshly attached device or to drop a detached one.  stub->addr is
++ * updated to @new_addr.  The table is modified under usbif->addr_lock.
++ *
++ * addr_table has USB_DEV_ADDR_SIZE entries and @new_addr can originate
++ * from a frontend-supplied SET_ADDRESS wValue, so addresses outside the
++ * table are rejected instead of being used as an index.  Such a request
++ * leaves the table and stub->addr unchanged.
++ */
++static void usbbk_set_address(usbif_t *usbif, struct usbstub *stub, int cur_addr, int new_addr)
++{
++      unsigned long flags;
++
++      if (cur_addr < 0 || cur_addr >= USB_DEV_ADDR_SIZE ||
++          new_addr < 0 || new_addr >= USB_DEV_ADDR_SIZE) {
++              /* Frontend-triggerable, so keep the log from being flooded. */
++              if (printk_ratelimit())
++                      pr_warning("usbback: ignoring out-of-range device address\n");
++              return;
++      }
++
++      spin_lock_irqsave(&usbif->addr_lock, flags);
++      if (cur_addr)
++              usbif->addr_table[cur_addr] = NULL;
++      if (new_addr)
++              usbif->addr_table[new_addr] = stub;
++      stub->addr = new_addr;
++      spin_unlock_irqrestore(&usbif->addr_lock, flags);
++}
++
++/*
++ * Look up the stub attached to virtual port @portnum of @usbif.
++ *
++ * The stub list is walked under usbif->stub_lock; the lock is released
++ * before returning.  Returns the matching stub, or NULL if no attached
++ * device uses that port number.
++ */
++struct usbstub *find_attached_device(usbif_t *usbif, int portnum)
++{
++      struct usbstub *cur;
++      struct usbstub *match = NULL;
++      unsigned long flags;
++
++      spin_lock_irqsave(&usbif->stub_lock, flags);
++      list_for_each_entry(cur, &usbif->stub_list, dev_list) {
++              if (cur->portid->portnum != portnum)
++                      continue;
++              match = cur;
++              break;
++      }
++      spin_unlock_irqrestore(&usbif->stub_lock, flags);
++
++      return match;
++}
++
++/*
++ * Handle an unlink request from the frontend.
++ *
++ * The target device is found by port number when the pipe carries device
++ * address 0, otherwise through usbif->addr_table.  The device's
++ * submitting_list is then searched for the request whose id equals
++ * req->u.unlink.unlink_id and, if found, its urb is unlinked.
++ *
++ * The function always answers the frontend (0 when nothing matched,
++ * -ENODEV when the device is unknown, otherwise the usb_unlink_urb()
++ * result) and always releases @pending_req and the usbif reference.
++ */
++static void process_unlink_req(usbif_t *usbif,
++              usbif_urb_request_t *req, pending_req_t *pending_req)
++{
++      struct usbstub *stub;
++      pending_req_t *cur;
++      unsigned long flags;
++      int status = 0;
++      int devnum = usb_pipedevice(req->pipe);
++
++      if (unlikely(devnum == 0)) {
++              stub = find_attached_device(usbif, usbif_pipeportnum(req->pipe));
++              pending_req->stub = stub;
++      } else {
++              stub = usbif->addr_table[devnum];
++              if (likely(stub))
++                      pending_req->stub = stub;
++      }
++
++      if (unlikely(!stub)) {
++              status = -ENODEV;
++              goto respond;
++      }
++
++      spin_lock_irqsave(&stub->submitting_lock, flags);
++      list_for_each_entry(cur, &stub->submitting_list, urb_list) {
++              if (cur->id != req->u.unlink.unlink_id)
++                      continue;
++              status = usb_unlink_urb(cur->urb);
++              break;
++      }
++      spin_unlock_irqrestore(&stub->submitting_lock, flags);
++
++respond:
++      usbbk_do_response(pending_req, status, 0, 0, 0);
++      usbif_put(usbif);
++      free_req(pending_req);
++}
++
++/*
++ * Resolve the target device of a control request and intercept the
++ * requests the backend must handle itself instead of forwarding as a urb.
++ *
++ * Returns 0 when the request should continue down the normal urb path
++ * (pending_req->stub has been set), or 1 when it has been consumed here:
++ * either answered immediately (SET_ADDRESS, or any error) or handed to a
++ * work item (SET_INTERFACE, CLEAR_FEATURE(ENDPOINT_HALT)).  Whenever 1 is
++ * returned the caller must not touch @pending_req again.
++ */
++static int check_and_submit_special_ctrlreq(usbif_t *usbif,
++              usbif_urb_request_t *req, pending_req_t *pending_req)
++{
++      int devnum;
++      struct usbstub *stub = NULL;
++      struct usb_ctrlrequest *ctrl = (struct usb_ctrlrequest *) req->u.ctrl;
++      int ret;
++      int done = 0;
++
++      devnum = usb_pipedevice(req->pipe);
++
++      /*
++       * When the device is first connected or reset, it has no address.
++       * In this initial state, the following requests are sent to device
++       * address 0:
++       *
++       *  1. GET_DESCRIPTOR (with descriptor type "DEVICE") is sent, so the
++       *     OS learns what device is connected.
++       *
++       *  2. SET_ADDRESS is sent, and the device then has its address.
++       *
++       * In the next step, SET_CONFIGURATION is sent to the addressed
++       * device, and the device is finally ready to use.
++       */
++      if (unlikely(devnum == 0)) {
++              stub = find_attached_device(usbif, usbif_pipeportnum(req->pipe));
++              if (unlikely(!stub)) {
++                      ret = -ENODEV;
++                      goto fail_response;
++              }
++
++              switch (ctrl->bRequest) {
++              case USB_REQ_GET_DESCRIPTOR:
++                      /*
++                       * GET_DESCRIPTOR request to device #0.
++                       * Pass through to the normal urb transfer.
++                       */
++                      pending_req->stub = stub;
++                      return 0;
++              case USB_REQ_SET_ADDRESS:
++                      /*
++                       * SET_ADDRESS request to device #0.
++                       * Add the attached device to addr_table.
++                       */
++                      {
++                              __u16 addr = le16_to_cpu(ctrl->wValue);
++
++                              /* wValue is frontend-controlled: keep it inside addr_table. */
++                              if (unlikely(addr >= USB_DEV_ADDR_SIZE)) {
++                                      ret = -EINVAL;
++                                      goto fail_response;
++                              }
++                              usbbk_set_address(usbif, stub, 0, addr);
++                      }
++                      ret = 0;
++                      goto fail_response;
++              default:
++                      ret = -EINVAL;
++                      goto fail_response;
++              }
++      } else {
++              stub = usbif->addr_table[devnum];
++              if (unlikely(!stub)) {
++                      ret = -ENODEV;
++                      goto fail_response;
++              }
++              pending_req->stub = stub;
++      }
++
++      /*
++       * Check special request
++       */
++      switch (ctrl->bRequest) {
++      case USB_REQ_SET_ADDRESS:
++              /*
++               * SET_ADDRESS request to addressed device.
++               * Change the address or remove the device from addr_table.
++               */
++              {
++                      __u16 addr = le16_to_cpu(ctrl->wValue);
++
++                      /* wValue is frontend-controlled: keep it inside addr_table. */
++                      if (unlikely(addr >= USB_DEV_ADDR_SIZE)) {
++                              ret = -EINVAL;
++                              goto fail_response;
++                      }
++                      /* stub is the addressed device resolved above (never NULL here). */
++                      usbbk_set_address(usbif, stub, devnum, addr);
++              }
++              ret = 0;
++              goto fail_response;
++#if 0
++      case USB_REQ_SET_CONFIGURATION:
++              /*
++               * linux 2.6.27 or later version only!
++               */
++              if (ctrl->RequestType == USB_RECIP_DEVICE) {
++                      __u16 config = le16_to_cpu(ctrl->wValue);
++                      usb_driver_set_configuration(pending_req->stub->udev, config);
++                      done = 1;
++              }
++              break;
++#endif
++      case USB_REQ_SET_INTERFACE:
++              if (ctrl->bRequestType == USB_RECIP_INTERFACE) {
++                      __u16 alt = le16_to_cpu(ctrl->wValue);
++                      __u16 intf = le16_to_cpu(ctrl->wIndex);
++
++                      /*
++                       * If the work item cannot be queued nobody else will
++                       * answer or release the request, so do it here.
++                       */
++                      ret = usbbk_set_interface(pending_req, intf, alt);
++                      if (ret)
++                              goto fail_response;
++                      done = 1;
++              }
++              break;
++      case USB_REQ_CLEAR_FEATURE:
++              if (ctrl->bRequestType == USB_RECIP_ENDPOINT
++                      && le16_to_cpu(ctrl->wValue) == USB_ENDPOINT_HALT) {
++                      int pipe;
++                      int ep = le16_to_cpu(ctrl->wIndex) & 0x0f;
++                      int dir = le16_to_cpu(ctrl->wIndex)
++                                      & USB_DIR_IN;
++                      if (dir)
++                              pipe = usb_rcvctrlpipe(pending_req->stub->udev, ep);
++                      else
++                              pipe = usb_sndctrlpipe(pending_req->stub->udev, ep);
++
++                      /* Same ownership rule as for SET_INTERFACE above. */
++                      ret = usbbk_clear_halt(pending_req, pipe);
++                      if (ret)
++                              goto fail_response;
++                      done = 1;
++              }
++              break;
++#if 0 /* not tested yet */
++      case USB_REQ_SET_FEATURE:
++              if (ctrl->bRequestType == USB_RT_PORT) {
++                      __u16 feat = le16_to_cpu(ctrl->wValue);
++                      if (feat == USB_PORT_FEAT_RESET) {
++                              usbbk_port_reset(pending_req);
++                              done = 1;
++                      }
++              }
++              break;
++#endif
++      default:
++              break;
++      }
++
++      return done;
++
++      /*
++       * Despite its name this label is also the normal completion path for
++       * requests answered immediately (ret == 0 for SET_ADDRESS).
++       */
++fail_response:
++      usbbk_do_response(pending_req, ret, 0, 0, 0);
++      usbif_put(usbif);
++      free_req(pending_req);
++      return 1;
++}
++
++/*
++ * Turn one frontend ring request into work for the backend.
++ *
++ * A reference on @usbif is taken up front.  Unlink requests and special
++ * control requests are handed to process_unlink_req() and
++ * check_and_submit_special_ctrlreq(), which answer and release the request
++ * themselves.  Everything else is converted into a urb: the urb is
++ * allocated and initialised, the frontend's grant references are mapped,
++ * outgoing data and isochronous frame descriptors are copied in, and the
++ * urb is submitted.
++ *
++ * On any failure the steps already taken are undone in reverse order, an
++ * error response is sent, and @pending_req plus the usbif reference are
++ * released.  On successful submission this function returns without
++ * releasing them (presumably the completion handler does - confirm in
++ * usbbk_urb_complete).
++ */
++static void dispatch_request_to_pending_reqs(usbif_t *usbif,
++              usbif_urb_request_t *req,
++              pending_req_t *pending_req)
++{
++      int ret;
++
++      pending_req->id = req->id;
++      pending_req->usbif = usbif;
++
++      barrier();
++
++      usbif_get(usbif);
++
++      /* unlink request */
++      if (unlikely(usbif_pipeunlink(req->pipe))) {
++              process_unlink_req(usbif, req, pending_req);
++              return;
++      }
++
++      if (usb_pipecontrol(req->pipe)) {
++              /* non-zero: the request was consumed (answered or deferred) */
++              if (check_and_submit_special_ctrlreq(usbif, req, pending_req))
++                      return;
++      } else {
++              int devnum = usb_pipedevice(req->pipe);
++              if (unlikely(!usbif->addr_table[devnum])) {
++                      ret = -ENODEV;
++                      goto fail_response;
++              }
++              pending_req->stub = usbif->addr_table[devnum];
++      }
++
++      barrier();
++
++      ret = usbbk_alloc_urb(req, pending_req);
++      if (ret) {
++              ret = -ESHUTDOWN;
++              goto fail_response;
++      }
++
++      add_req_to_submitting_list(pending_req->stub, pending_req);
++
++      barrier();
++
++      usbbk_init_urb(req, pending_req);
++
++      barrier();
++
++      /* isochronous requests carry extra segments for frame descriptors */
++      pending_req->nr_buffer_segs = req->nr_buffer_segs;
++      if (usb_pipeisoc(req->pipe))
++              pending_req->nr_extra_segs = req->u.isoc.nr_frame_desc_segs;
++      else
++              pending_req->nr_extra_segs = 0;
++
++      barrier();
++
++      ret = usbbk_gnttab_map(usbif, req, pending_req);
++      if (ret) {
++              pr_err("usbback: invalid buffer\n");
++              ret = -ESHUTDOWN;
++              goto fail_free_urb;
++      }
++
++      barrier();
++
++      /* OUT transfers: copy the payload from the mapped pages */
++      if (usb_pipeout(req->pipe) && req->buffer_length)
++              copy_pages_to_buff(pending_req->buffer,
++                                      pending_req,
++                                      0,
++                                      pending_req->nr_buffer_segs);
++      /* the frame descriptors live in the segments after the data ones */
++      if (usb_pipeisoc(req->pipe)) {
++              copy_pages_to_buff(&pending_req->urb->iso_frame_desc[0],
++                      pending_req,
++                      pending_req->nr_buffer_segs,
++                      pending_req->nr_extra_segs);
++      }
++
++      barrier();
++
++      ret = usb_submit_urb(pending_req->urb, GFP_KERNEL);
++      if (ret) {
++              pr_err("usbback: failed submitting urb, error %d\n", ret);
++              ret = -ESHUTDOWN;
++              goto fail_flush_area;
++      }
++      return;
++
++      /* unwind in reverse order of setup; each label falls through */
++fail_flush_area:
++      fast_flush_area(pending_req);
++fail_free_urb:
++      remove_req_from_submitting_list(pending_req->stub, pending_req);
++      barrier();
++      usbbk_free_urb(pending_req->urb);
++fail_response:
++      usbbk_do_response(pending_req, ret, 0, 0, 0);
++      usbif_put(usbif);
++      free_req(pending_req);
++}
++
++/*
++ * Consume the requests currently on the urb ring and dispatch each one.
++ *
++ * Processing stops early when the consumer index overflows the ring or
++ * when no free pending_req is available.  Returns non-zero if requests
++ * remain to be processed.
++ */
++static int usbbk_start_submit_urb(usbif_t *usbif)
++{
++      usbif_urb_back_ring_t *ring = &usbif->urb_ring;
++      RING_IDX cons = ring->req_cons;
++      RING_IDX prod = ring->sring->req_prod;
++      int more_to_do = 0;
++
++      /* Read the producer index before reading the requests behind it. */
++      rmb();
++
++      while (cons != prod) {
++              usbif_urb_request_t *req;
++              pending_req_t *pending_req;
++
++              if (RING_REQUEST_CONS_OVERFLOW(ring, cons)) {
++                      pr_warning("RING_REQUEST_CONS_OVERFLOW\n");
++                      break;
++              }
++
++              pending_req = alloc_req();
++              if (!pending_req) {
++                      more_to_do = 1;
++                      break;
++              }
++
++              req = RING_GET_REQUEST(ring, cons);
++              cons++;
++              ring->req_cons = cons;
++
++              dispatch_request_to_pending_reqs(usbif, req, pending_req);
++      }
++
++      RING_FINAL_CHECK_FOR_REQUESTS(ring, more_to_do);
++
++      cond_resched();
++
++      return more_to_do;
++}
++
++/*
++ * Report a port state change to the frontend over the conn ring.
++ *
++ * One request is consumed from the ring and its id is echoed back in a
++ * response carrying @portnum and @speed.  The ring is manipulated under
++ * usbif->conn_ring_lock; the event channel is kicked after the lock has
++ * been dropped, and only if the ring macros say a notification is needed.
++ */
++void usbbk_hotplug_notify(usbif_t *usbif, int portnum, int speed)
++{
++      usbif_conn_back_ring_t *ring = &usbif->conn_ring;
++      usbif_conn_response_t *res;
++      unsigned long flags;
++      int notify;
++      u16 id;
++
++      spin_lock_irqsave(&usbif->conn_ring_lock, flags);
++
++      /* Take one request off the ring; only its id is needed. */
++      id = RING_GET_REQUEST(ring, ring->req_cons)->id;
++      ring->req_cons += 1;
++      ring->sring->req_event = ring->req_cons + 1;
++
++      /* Build the matching response and publish it. */
++      res = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt);
++      res->id = id;
++      res->portnum = portnum;
++      res->speed = speed;
++      ring->rsp_prod_pvt += 1;
++      RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(ring, notify);
++
++      spin_unlock_irqrestore(&usbif->conn_ring_lock, flags);
++
++      if (!notify)
++              return;
++
++      notify_remote_via_irq(usbif->irq);
++}
++
++/*
++ * Thread function driving request processing for one usbif (@arg).
++ *
++ * Each iteration sleeps until the interface has waiting requests and a
++ * free pending_req exists (or the thread is asked to stop), then drains
++ * the urb ring and frees finished urbs.  A usbif reference is held for
++ * the lifetime of the thread; usbif->xenusbd is cleared on exit.
++ */
++int usbbk_schedule(void *arg)
++{
++      usbif_t *usbif = arg;
++
++      usbif_get(usbif);
++
++      for (;;) {
++              if (kthread_should_stop())
++                      break;
++
++              wait_event_interruptible(
++                      usbif->wq,
++                      usbif->waiting_reqs || kthread_should_stop());
++              wait_event_interruptible(
++                      pending_free_wq,
++                      !list_empty(&pending_free) || kthread_should_stop());
++
++              /* Clear the flag before looking at the ring. */
++              usbif->waiting_reqs = 0;
++              smp_mb();
++
++              if (usbbk_start_submit_urb(usbif))
++                      usbif->waiting_reqs = 1;
++
++              usbbk_free_urbs();
++      }
++
++      usbbk_free_urbs();
++      usbif->xenusbd = NULL;
++      usbif_put(usbif);
++
++      return 0;
++}
++
++/*
++ * Attach a usbstub device to a usbif: the stub is linked into
++ * usbif->stub_list under stub_lock and then pointed back at its usbif.
++ */
++void usbbk_attach_device(usbif_t *usbif, struct usbstub *stub)
++{
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&usbif->stub_lock, irqflags);
++      list_add(&stub->dev_list, &usbif->stub_list);
++      spin_unlock_irqrestore(&usbif->stub_lock, irqflags);
++
++      stub->usbif = usbif;
++}
++
++/*
++ * Detach a usbstub device from a usbif: its address-table entry (if it
++ * has an address) is cleared, the stub is unlinked from usbif->stub_list
++ * under stub_lock, and its back pointer is reset.
++ */
++void usbbk_detach_device(usbif_t *usbif, struct usbstub *stub)
++{
++      unsigned long irqflags;
++
++      if (stub->addr != 0)
++              usbbk_set_address(usbif, stub, stub->addr, 0);
++
++      spin_lock_irqsave(&usbif->stub_lock, irqflags);
++      list_del(&stub->dev_list);
++      spin_unlock_irqrestore(&usbif->stub_lock, irqflags);
++
++      stub->usbif = NULL;
++}
++
++/*
++ * Same as usbbk_detach_device(), except that usbif->stub_lock is not
++ * taken around the list removal.
++ */
++void detach_device_without_lock(usbif_t *usbif, struct usbstub *stub)
++{
++      if (stub->addr != 0)
++              usbbk_set_address(usbif, stub, stub->addr, 0);
++
++      list_del(&stub->dev_list);
++      stub->usbif = NULL;
++}
++
++/*
++ * Module initialisation.
++ *
++ * Allocates the pending request pool, the grant handle table and the
++ * page vector backing the mapped segments, puts every pending request on
++ * the free list, then brings up the stub driver and the xenbus backend.
++ * Returns 0 on success, -ENODEV when not running on Xen, -ENOMEM when an
++ * allocation fails, or the error from usbstub_init() /
++ * usbback_xenbus_init().
++ */
++static int __init usbback_init(void)
++{
++      int idx, nr_pages;
++      int rc;
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      nr_pages = usbif_reqs * USBIF_MAX_SEGMENTS_PER_REQUEST;
++
++      /* Allocate everything first, then check the three results at once. */
++      pending_reqs = kzalloc(sizeof(pending_reqs[0]) * usbif_reqs,
++                             GFP_KERNEL);
++      pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
++                                      nr_pages, GFP_KERNEL);
++      pending_pages = alloc_empty_pages_and_pagevec(nr_pages);
++
++      if (!pending_reqs || !pending_grant_handles || !pending_pages) {
++              rc = -ENOMEM;
++              goto out_mem;
++      }
++
++      for (idx = 0; idx < nr_pages; idx++)
++              pending_grant_handles[idx] = USBBACK_INVALID_HANDLE;
++
++      INIT_LIST_HEAD(&pending_free);
++      for (idx = 0; idx < usbif_reqs; idx++)
++              list_add_tail(&pending_reqs[idx].free_list, &pending_free);
++
++      rc = usbstub_init();
++      if (rc)
++              goto out_mem;
++
++      rc = usbback_xenbus_init();
++      if (rc)
++              goto out_xenbus;
++
++      return 0;
++
++out_xenbus:
++      usbstub_exit();
++out_mem:
++      kfree(pending_reqs);
++      kfree(pending_grant_handles);
++      free_empty_pages_and_pagevec(pending_pages, nr_pages);
++      return rc;
++}
++
++/*
++ * Module teardown: shut down the xenbus backend and the stub driver, then
++ * release the tables allocated by usbback_init().
++ */
++static void __exit usbback_exit(void)
++{
++      int nr_pages = usbif_reqs * USBIF_MAX_SEGMENTS_PER_REQUEST;
++
++      usbback_xenbus_exit();
++      usbstub_exit();
++
++      kfree(pending_reqs);
++      kfree(pending_grant_handles);
++      free_empty_pages_and_pagevec(pending_pages, nr_pages);
++}
++
++/* Module entry and exit points. */
++module_init(usbback_init);
++module_exit(usbback_exit);
++
++/* Module metadata; note that the author string is left empty. */
++MODULE_AUTHOR("");
++MODULE_DESCRIPTION("Xen USB backend driver (usbback)");
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/usbback/usbback.h

index 0000000,0000000..8943cc9

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbback/usbback.h
@@@ -1,0 -1,0 +1,171 @@@
++/*
++ * usbback.h
++ *
++ * This file is part of Xen USB backend driver.
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_USBBACK_H__
++#define __XEN_USBBACK_H__
++
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/usb.h>
++#include <linux/vmalloc.h>
++#include <linux/kthread.h>
++#include <linux/wait.h>
++#include <linux/list.h>
++#include <linux/kref.h>
++#include <xen/xenbus.h>
++#include <xen/interface/event_channel.h>
++#include <xen/interface/io/usbif.h>
++
++struct usbstub;
++
++#ifndef BUS_ID_SIZE
++#define USBBACK_BUS_ID_SIZE 20
++#else
++#define USBBACK_BUS_ID_SIZE BUS_ID_SIZE
++#endif
++
++#define USB_DEV_ADDR_SIZE 128
++
++/*
++ * State of one backend interface instance (usbif_t).
++ */
++typedef struct usbif_st {
++      /* domain id passed to the grant-map operations for this interface */
++      domid_t domid;
++      unsigned int handle;
++      int num_ports;
++      enum usb_spec_version usb_ver;
++
++      struct xenbus_device *xbdev;
++      struct list_head usbif_list;
++
++      /* irq used to notify the frontend (notify_remote_via_irq) */
++      unsigned int      irq;
++
++      /* urb_ring carries urb requests, conn_ring carries hotplug events */
++      usbif_urb_back_ring_t urb_ring;
++      usbif_conn_back_ring_t conn_ring;
++      struct vm_struct *urb_ring_area;
++      struct vm_struct *conn_ring_area;
++
++      spinlock_t urb_ring_lock;
++      /* held while usbbk_hotplug_notify() manipulates conn_ring */
++      spinlock_t conn_ring_lock;
++      /* changed through usbif_get()/usbif_put() */
++      atomic_t refcnt;
++
++      struct xenbus_watch backend_watch;
++
++      /* device address lookup table */
++      struct usbstub *addr_table[USB_DEV_ADDR_SIZE];
++      /* taken by usbbk_set_address() when updating addr_table */
++      spinlock_t addr_lock;
++
++      /* connected device list */
++      struct list_head stub_list;
++      spinlock_t stub_lock;
++
++      /* request schedule */
++      /* cleared by usbbk_schedule() when the thread exits */
++      struct task_struct *xenusbd;
++      /* non-zero while requests are waiting; usbbk_schedule() sleeps on wq */
++      unsigned int waiting_reqs;
++      /* woken by usbif_put() when refcnt drops to zero */
++      wait_queue_head_t waiting_to_free;
++      wait_queue_head_t wq;
++} usbif_t;
++
++/*
++ * Identifies one virtual port.  Presumably this ties a physical bus id to
++ * a (domid, handle, portnum) triple, matching the arguments of
++ * portid_add() / find_portid() - TODO confirm against their definitions.
++ */
++struct vusb_port_id {
++      struct list_head id_list;
++
++      char phys_bus[USBBACK_BUS_ID_SIZE];
++      domid_t domid;
++      unsigned int handle;
++      /* compared against the port number in find_attached_device() */
++      int portnum;
++      unsigned is_connected:1;
++};
++
++/*
++ * Backend-side representation of one USB device.
++ */
++struct usbstub {
++      struct kref kref;
++      /* link in usbif->stub_list (see usbbk_attach_device()) */
++      struct list_head dev_list;
++
++      struct vusb_port_id *portid;
++      /* device that urbs and control operations are issued against */
++      struct usb_device *udev;
++      /* owning interface; NULL once detached */
++      usbif_t *usbif;
++      /* current index in usbif->addr_table; 0 means no address assigned */
++      int addr;
++
++      /* pending requests of this device, searched by unlink requests */
++      struct list_head submitting_list;
++      spinlock_t submitting_lock;
++};
++
++/* Interface lifetime management (defined outside usbback.c). */
++usbif_t *usbif_alloc(domid_t domid, unsigned int handle);
++void usbif_disconnect(usbif_t *usbif);
++void usbif_free(usbif_t *usbif);
++int usbif_map(usbif_t *usbif, grant_ref_t urb_ring_ref,
++            grant_ref_t conn_ring_ref, evtchn_port_t);
++
++/*
++ * Reference counting on a usbif_t.  usbif_put() wakes the waiters on
++ * waiting_to_free when the count reaches zero; it does not free anything
++ * itself.
++ */
++#define usbif_get(_b) (atomic_inc(&(_b)->refcnt))
++#define usbif_put(_b) \
++      do { \
++              if (atomic_dec_and_test(&(_b)->refcnt)) \
++                      wake_up(&(_b)->waiting_to_free); \
++      } while (0)
++
++usbif_t *find_usbif(domid_t domid, unsigned int handle);
++int usbback_xenbus_init(void);
++void usbback_xenbus_exit(void);
++struct vusb_port_id *find_portid_by_busid(const char *busid);
++struct vusb_port_id *find_portid(const domid_t domid,
++                                              const unsigned int handle,
++                                              const int portnum);
++int portid_add(const char *busid,
++                                      const domid_t domid,
++                                      const unsigned int handle,
++                                      const int portnum);
++int portid_remove(const domid_t domid,
++                                      const unsigned int handle,
++                                      const int portnum);
++irqreturn_t usbbk_be_int(int irq, void *dev_id);
++/* thread function; arg is the usbif_t to serve */
++int usbbk_schedule(void *arg);
++/* returns the stub on the given port of usbif, or NULL */
++struct usbstub *find_attached_device(usbif_t *usbif, int port);
++void usbbk_attach_device(usbif_t *usbif, struct usbstub *stub);
++void usbbk_detach_device(usbif_t *usbif, struct usbstub *stub);
++/* posts a (portnum, speed) response on the conn ring */
++void usbbk_hotplug_notify(usbif_t *usbif, int portnum, int speed);
++/* like usbbk_detach_device() but does not take usbif->stub_lock */
++void detach_device_without_lock(usbif_t *usbif, struct usbstub *stub);
++void usbbk_unlink_urbs(struct usbstub *stub);
++
++/* stub driver registration, called from module init/exit */
++int usbstub_init(void);
++void usbstub_exit(void);
++
++#endif /* __XEN_USBBACK_H__ */
diff --cc drivers/xen/usbback/usbstub.c

index 0000000,0000000..fc6dd44

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbback/usbstub.c
@@@ -1,0 -1,0 +1,324 @@@
++/*
++ * usbstub.c
++ *
++ * USB stub driver - grabbing and managing USB devices.
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#include "usbback.h"
++
++/* All configured vusb_port_id entries, linked through their id_list member. */
++static LIST_HEAD(port_list);
++/* Serialises access to port_list; taken with spin_lock_irqsave() in this file. */
++static DEFINE_SPINLOCK(port_list_lock);
++
++/*
++ * Look up the port entry whose physical bus id matches @busid.
++ *
++ * At most USBBACK_BUS_ID_SIZE bytes are compared.  The list is walked
++ * under port_list_lock, but the lock is dropped before returning, so the
++ * returned pointer is not protected against a concurrent portid_remove().
++ *
++ * Returns the first matching entry, or NULL when there is none.
++ */
++struct vusb_port_id *find_portid_by_busid(const char *busid)
++{
++      struct vusb_port_id *entry;
++      struct vusb_port_id *match = NULL;
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&port_list_lock, irqflags);
++      list_for_each_entry(entry, &port_list, id_list) {
++              if (strncmp(entry->phys_bus, busid, USBBACK_BUS_ID_SIZE) != 0)
++                      continue;
++              match = entry;
++              break;
++      }
++      spin_unlock_irqrestore(&port_list_lock, irqflags);
++
++      return match;
++}
++
++/*
++ * Look up the port entry identified by (@domid, @handle, @portnum).
++ *
++ * The list is walked under port_list_lock; the lock is released before
++ * returning, so the result is not protected against concurrent removal.
++ *
++ * Returns the first matching entry, or NULL when there is none.
++ */
++struct vusb_port_id *find_portid(const domid_t domid,
++                                              const unsigned int handle,
++                                              const int portnum)
++{
++      struct vusb_port_id *entry;
++      struct vusb_port_id *match = NULL;
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&port_list_lock, irqflags);
++      list_for_each_entry(entry, &port_list, id_list) {
++              if (entry->domid != domid)
++                      continue;
++              if (entry->handle != handle)
++                      continue;
++              if (entry->portnum != portnum)
++                      continue;
++              match = entry;
++              break;
++      }
++      spin_unlock_irqrestore(&port_list_lock, irqflags);
++
++      return match;
++}
++
++/*
++ * Create a port entry binding physical bus id @busid to the virtual port
++ * (@domid, @handle, @portnum) and put it on port_list.
++ *
++ * No duplicate check is done here; callers are expected to have looked
++ * the port up first.
++ *
++ * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
++ */
++int portid_add(const char *busid,
++                                      const domid_t domid,
++                                      const unsigned int handle,
++                                      const int portnum)
++{
++      struct vusb_port_id *portid;
++      unsigned long flags;
++
++      portid = kzalloc(sizeof(*portid), GFP_KERNEL);
++      if (!portid)
++              return -ENOMEM;
++
++      portid->domid = domid;
++      portid->handle = handle;
++      portid->portnum = portnum;
++
++      /*
++       * Copy at most USBBACK_BUS_ID_SIZE - 1 bytes.  The entry was
++       * zero-filled by kzalloc(), so phys_bus is always NUL-terminated,
++       * which the "%s" in usbstub_show_portids() relies on.
++       */
++      strncpy(portid->phys_bus, busid, USBBACK_BUS_ID_SIZE - 1);
++
++      spin_lock_irqsave(&port_list_lock, flags);
++      list_add(&portid->id_list, &port_list);
++      spin_unlock_irqrestore(&port_list_lock, flags);
++
++      return 0;
++}
++
++/*
++ * Unlink and free every port entry matching (@domid, @handle, @portnum).
++ *
++ * Returns 0 if at least one entry was removed, -ENOENT otherwise.
++ */
++int portid_remove(const domid_t domid,
++                                      const unsigned int handle,
++                                      const int portnum)
++{
++      struct vusb_port_id *entry, *next;
++      unsigned long irqflags;
++      int ret = -ENOENT;
++
++      spin_lock_irqsave(&port_list_lock, irqflags);
++      list_for_each_entry_safe(entry, next, &port_list, id_list) {
++              if (entry->domid != domid
++                              || entry->handle != handle
++                              || entry->portnum != portnum)
++                      continue;
++
++              list_del(&entry->id_list);
++              kfree(entry);
++              ret = 0;
++      }
++      spin_unlock_irqrestore(&port_list_lock, irqflags);
++
++      return ret;
++}
++
++/*
++ * Allocate a stub for @udev bound to @portid.
++ *
++ * The new stub starts with one kref reference and holds a reference on
++ * @udev obtained through usb_get_dev().
++ *
++ * Returns the stub, or NULL (after logging) when allocation fails.
++ */
++static struct usbstub *usbstub_alloc(struct usb_device *udev,
++                                              struct vusb_port_id *portid)
++{
++      struct usbstub *stub = kzalloc(sizeof(*stub), GFP_KERNEL);
++
++      if (stub == NULL) {
++              pr_err("no memory for usbstub\n");
++              return NULL;
++      }
++
++      spin_lock_init(&stub->submitting_lock);
++      INIT_LIST_HEAD(&stub->submitting_list);
++      stub->portid = portid;
++      stub->udev = usb_get_dev(udev);
++      kref_init(&stub->kref);
++
++      return stub;
++}
++
++/*
++ * kref release callback: drop the usb_device reference held by the stub
++ * and free the stub itself.
++ */
++static void usbstub_release(struct kref *kref)
++{
++      struct usbstub *stub = container_of(kref, struct usbstub, kref);
++
++      usb_put_dev(stub->udev);
++      stub->portid = NULL;
++      stub->udev = NULL;
++      kfree(stub);
++}
++
++/* Take an additional reference on @stub. */
++static inline void usbstub_get(struct usbstub *stub)
++{
++      kref_get(&stub->kref);
++}
++
++/* Drop a reference on @stub; the last put frees it via usbstub_release(). */
++static inline void usbstub_put(struct usbstub *stub)
++{
++      kref_put(&stub->kref, usbstub_release);
++}
++
++/*
++ * USB core probe callback: decide whether the device behind @intf sits on
++ * a configured virtual port and, if so, bind the interface to a usbstub.
++ *
++ * The name of the interface's parent device is used as the bus id for the
++ * port lookup.  The device is not claimed (-ENODEV) when it is a hub, when
++ * no port entry or backend interface is found for it, or when its speed is
++ * not accepted: low and full speed are always accepted, high speed only if
++ * the backend's usb_ver is at least USB_VER_USB20, anything else never.
++ *
++ * Returns 0 on success, -ENODEV when the device is not claimed, or
++ * -ENOMEM if a new stub cannot be allocated.
++ */
++static int usbstub_probe(struct usb_interface *intf,
++              const struct usb_device_id *id)
++{
++      struct usb_device *udev = interface_to_usbdev(intf);
++      const char *busid = dev_name(intf->dev.parent);
++      struct vusb_port_id *portid = NULL;
++      struct usbstub *stub = NULL;
++      usbif_t *usbif = NULL;
++      int retval = -ENODEV;
++
++      /* hub currently not supported, so skip. */
++      if (udev->descriptor.bDeviceClass ==  USB_CLASS_HUB)
++              goto out;
++
++      /* Only devices plugged into a configured physical port are claimed. */
++      portid = find_portid_by_busid(busid);
++      if (!portid)
++              goto out;
++
++      usbif = find_usbif(portid->domid, portid->handle);
++      if (!usbif)
++              goto out;
++
++      switch (udev->speed) {
++      case USB_SPEED_LOW:
++      case USB_SPEED_FULL:
++              break;
++      case USB_SPEED_HIGH:
++              if (usbif->usb_ver >= USB_VER_USB20)
++                      break;
++              /* fall through */
++      default:
++              goto out;
++      }
++
++      stub = find_attached_device(usbif, portid->portnum);
++      if (!stub) {
++              /* new connection */
++              stub = usbstub_alloc(udev, portid);
++              if (!stub)
++                      return -ENOMEM;
++              usbbk_attach_device(usbif, stub);
++              usbbk_hotplug_notify(usbif, portid->portnum, udev->speed);
++      } else {
++              /* maybe already called and connected by other intf */
++              if (strncmp(stub->portid->phys_bus, busid, USBBACK_BUS_ID_SIZE))
++                      goto out; /* invalid call */
++      }
++
++      /*
++       * One reference per bound interface; usbstub_disconnect() drops it.
++       * NOTE(review): a freshly allocated stub also carries the initial
++       * reference from kref_init(); where that one is dropped is not
++       * visible in this file - confirm.
++       */
++      usbstub_get(stub);
++      usb_set_intfdata(intf, stub);
++      retval = 0;
++
++out:
++      return retval;
++}
++
++/*
++ * USB core disconnect callback: clear the interface data and, if a stub
++ * was bound to the interface, report the unplug to the backend interface
++ * (when one is attached), detach the device, unlink the stub's URBs and
++ * drop the interface's reference on the stub.
++ */
++static void usbstub_disconnect(struct usb_interface *intf)
++{
++      struct usbstub *stub = usb_get_intfdata(intf);
++
++      usb_set_intfdata(intf, NULL);
++
++      if (stub == NULL)
++              return;
++
++      if (stub->usbif != NULL) {
++              /* speed 0 is what this driver passes on disconnect */
++              usbbk_hotplug_notify(stub->usbif, stub->portid->portnum, 0);
++              usbbk_detach_device(stub->usbif, stub);
++      }
++
++      usbbk_unlink_urbs(stub);
++      usbstub_put(stub);
++}
++
++/*
++ * sysfs "port_ids" driver attribute: print one line per configured port
++ * in the form "<phys_bus>:<domid>:<handle>:<portnum>", stopping once the
++ * PAGE_SIZE output buffer is exhausted.
++ *
++ * Returns the number of bytes written to @buf.
++ */
++static ssize_t usbstub_show_portids(struct device_driver *driver,
++              char *buf)
++{
++      struct vusb_port_id *entry;
++      unsigned long irqflags;
++      size_t len = 0;
++
++      spin_lock_irqsave(&port_list_lock, irqflags);
++      list_for_each_entry(entry, &port_list, id_list) {
++              if (len >= PAGE_SIZE)
++                      break;
++              len += scnprintf(buf + len, PAGE_SIZE - len,
++                              "%s:%d:%d:%d\n",
++                              entry->phys_bus,
++                              entry->domid,
++                              entry->handle,
++                              entry->portnum);
++      }
++      spin_unlock_irqrestore(&port_list_lock, irqflags);
++
++      return len;
++}
++static DRIVER_ATTR(port_ids, S_IRUSR, usbstub_show_portids, NULL);
++
++/*
++ * Device table that matches any USB device; usbstub_probe() then decides
++ * whether a given device is actually claimed.
++ */
++static const struct usb_device_id usbstub_table[] = {
++              { .driver_info = 1 }, /* wildcard, see usb_match_id() */
++              { } /* Terminating entry */
++};
++MODULE_DEVICE_TABLE(usb, usbstub_table);
++
++/* USB driver registered by usbstub_init() to grab devices for the backend. */
++static struct usb_driver usbback_usb_driver = {
++              .name = "usbback",
++              .probe = usbstub_probe,
++              .disconnect = usbstub_disconnect,
++              .id_table = usbstub_table,
++              .no_dynamic_id = 1,
++};
++
++/*
++ * Register the stub USB driver and create its "port_ids" sysfs attribute.
++ * If the attribute cannot be created the driver is deregistered again.
++ *
++ * Returns 0 on success or the error from the failing step.
++ */
++int __init usbstub_init(void)
++{
++      int err = usb_register(&usbback_usb_driver);
++
++      if (err < 0) {
++              pr_err("usbback: usb_register failed (%d)\n", err);
++              return err;
++      }
++
++      err = driver_create_file(&usbback_usb_driver.drvwrap.driver,
++                              &driver_attr_port_ids);
++      if (err != 0)
++              usb_deregister(&usbback_usb_driver);
++
++      return err;
++}
++
++/*
++ * Undo usbstub_init(): remove the "port_ids" attribute, then deregister
++ * the stub USB driver.
++ */
++void usbstub_exit(void)
++{
++      driver_remove_file(&usbback_usb_driver.drvwrap.driver,
++                              &driver_attr_port_ids);
++      usb_deregister(&usbback_usb_driver);
++}
diff --cc drivers/xen/usbback/xenbus.c

index 0000000,0000000..42457ef

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbback/xenbus.c
@@@ -1,0 -1,0 +1,338 @@@
++/*
++ * xenbus.c
++ *
++ * Xenbus interface for USB backend driver.
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#include "usbback.h"
++
++/*
++ * Start the worker thread for @usbif, running usbbk_schedule() under the
++ * name "usbback.<domid>.<handle>".
++ *
++ * On failure usbif->xenusbd is reset to NULL and the error is reported
++ * through xenbus_dev_error().
++ *
++ * Returns 0 on success or the negative errno from kthread_run().
++ */
++static int start_xenusbd(usbif_t *usbif)
++{
++      int err = 0;
++      char name[TASK_COMM_LEN];
++
++      snprintf(name, sizeof(name), "usbback.%d.%d", usbif->domid,
++                      usbif->handle);
++      /*
++       * kthread_run() takes a printf-style name format, so hand the
++       * prepared name over as an argument rather than as the format.
++       */
++      usbif->xenusbd = kthread_run(usbbk_schedule, usbif, "%s", name);
++      if (IS_ERR(usbif->xenusbd)) {
++              err = PTR_ERR(usbif->xenusbd);
++              usbif->xenusbd = NULL;
++              xenbus_dev_error(usbif->xbdev, err, "start xenusbd");
++      }
++
++      return err;
++}
++
++/*
++ * Xenbus watch callback for the backend's "port" subtree.
++ *
++ * Inside one xenstore transaction, reads "port/<n>" for every virtual
++ * port of the interface and brings the port list in line with it:
++ *   - empty value: drop the port entry, unless the port is still
++ *     connected, which is reported as an error;
++ *   - non-empty value, port already configured: no-op if the bus id is
++ *     unchanged, otherwise an error (the old mapping must go first);
++ *   - non-empty value, port not configured: add the mapping, unless the
++ *     bus id is already used by another port.
++ * The transaction is restarted when xenstore answers -EAGAIN.
++ *
++ * @vec and @len are unused.
++ */
++static void backend_changed(struct xenbus_watch *watch,
++                      const char **vec, unsigned int len)
++{
++      struct xenbus_transaction xbt;
++      int err;
++      int i;
++      char node[8];
++      char *busid;
++      struct vusb_port_id *portid = NULL;
++
++      usbif_t *usbif = container_of(watch, usbif_t, backend_watch);
++      struct xenbus_device *dev = usbif->xbdev;
++
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "starting transaction");
++              return;
++      }
++
++      for (i = 1; i <= usbif->num_ports; i++) {
++              snprintf(node, sizeof(node), "port/%d", i);
++              busid = xenbus_read(xbt, dev->nodename, node, NULL);
++              if (IS_ERR(busid)) {
++                      err = PTR_ERR(busid);
++                      xenbus_dev_fatal(dev, err, "reading port/%d", i);
++                      goto abort;
++              }
++
++              portid = find_portid(usbif->domid, usbif->handle, i);
++
++              if (strlen(busid) == 0) {
++                      /*
++                       * remove portid, if the port is not connected;
++                       * a port that was never configured is ignored.
++                       */
++                      if (portid) {
++                              if (portid->is_connected)
++                                      xenbus_dev_fatal(dev, -EBUSY,
++                                              "can't remove port/%d, unbind first", i);
++                              else
++                                      portid_remove(usbif->domid, usbif->handle, i);
++                      }
++              } else if (portid) {
++                      /* already configured: only an unchanged busid is fine */
++                      if (strncmp(portid->phys_bus, busid, USBBACK_BUS_ID_SIZE))
++                              xenbus_dev_fatal(dev, -EBUSY,
++                                      "can't add port/%d, remove first", i);
++              } else if (find_portid_by_busid(busid)) {
++                      /* the busid is in use by another port */
++                      xenbus_dev_fatal(dev, -EBUSY,
++                              "can't add port/%d, busid already used", i);
++              } else {
++                      err = portid_add(busid, usbif->domid, usbif->handle, i);
++                      if (err)
++                              xenbus_dev_fatal(dev, err, "adding port/%d", i);
++              }
++
++              /* xenbus_read() returns a kmalloc'ed string owned by us */
++              kfree(busid);
++      }
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err == -EAGAIN)
++              goto again;
++      if (err)
++              xenbus_dev_fatal(dev, err, "completing transaction");
++
++      return;
++
++abort:
++      xenbus_transaction_end(xbt, 1);
++
++      return;
++}
++
++/*
++ * Xenbus remove callback, also used as the error path of usbback_probe().
++ *
++ * Unregisters the "port" watch if one was set up, removes every port
++ * entry of the interface, then disconnects and frees the interface and
++ * clears the device's driver data.  Does nothing when no interface is
++ * attached to @dev.
++ *
++ * Always returns 0.
++ */
++static int usbback_remove(struct xenbus_device *dev)
++{
++      usbif_t *usbif = dev_get_drvdata(&dev->dev);
++      int i;
++
++      /* Check before touching usbif->backend_watch below. */
++      if (!usbif)
++              return 0;
++
++      if (usbif->backend_watch.node) {
++              unregister_xenbus_watch(&usbif->backend_watch);
++              kfree(usbif->backend_watch.node);
++              usbif->backend_watch.node = NULL;
++      }
++
++      /* remove all ports */
++      for (i = 1; i <= usbif->num_ports; i++)
++              portid_remove(usbif->domid, usbif->handle, i);
++      usbif_disconnect(usbif);
++      usbif_free(usbif);
++
++      dev_set_drvdata(&dev->dev, NULL);
++
++      return 0;
++}
++
++/*
++ * Xenbus probe callback for a "vusb" backend device.
++ *
++ * Allocates the backend interface (the handle is the last path component
++ * of the frontend's xenstore path), reads and validates "num-ports" and
++ * "usb-ver" from the backend node, and switches the device to InitWait.
++ *
++ * Returns 0 on success or a negative errno.  On failure everything set
++ * up so far is torn down through usbback_remove().
++ */
++static int usbback_probe(struct xenbus_device *dev,
++                        const struct xenbus_device_id *id)
++{
++      usbif_t *usbif;
++      unsigned int handle;
++      int num_ports;
++      int usb_ver;
++      int err;
++
++      if (usb_disabled())
++              return -ENODEV;
++
++      handle = simple_strtoul(strrchr(dev->otherend, '/') + 1, NULL, 0);
++      usbif = usbif_alloc(dev->otherend_id, handle);
++      if (!usbif) {
++              xenbus_dev_fatal(dev, -ENOMEM, "allocating backend interface");
++              return -ENOMEM;
++      }
++      usbif->xbdev = dev;
++      dev_set_drvdata(&dev->dev, usbif);
++
++      err = xenbus_scanf(XBT_NIL, dev->nodename,
++                              "num-ports", "%d", &num_ports);
++      if (err != 1) {
++              xenbus_dev_fatal(dev, err, "reading num-ports");
++              goto fail;
++      }
++      if (num_ports < 1 || num_ports > USB_MAXCHILDREN) {
++              /* err still holds the scanf item count; report a real errno */
++              err = -EINVAL;
++              xenbus_dev_fatal(dev, err, "invalid num-ports");
++              goto fail;
++      }
++      usbif->num_ports = num_ports;
++
++      err = xenbus_scanf(XBT_NIL, dev->nodename,
++                              "usb-ver", "%d", &usb_ver);
++      if (err != 1) {
++              xenbus_dev_fatal(dev, err, "reading usb-ver");
++              goto fail;
++      }
++      switch (usb_ver) {
++      case USB_VER_USB11:
++      case USB_VER_USB20:
++              usbif->usb_ver = usb_ver;
++              break;
++      default:
++              /* same as above: do not leak the positive item count */
++              err = -EINVAL;
++              xenbus_dev_fatal(dev, err, "invalid usb-ver");
++              goto fail;
++      }
++
++      err = xenbus_switch_state(dev, XenbusStateInitWait);
++      if (err)
++              goto fail;
++
++      return 0;
++
++fail:
++      usbback_remove(dev);
++      return err;
++}
++
++/*
++ * Read the two ring references and the event channel published by the
++ * frontend and map them into @usbif.
++ *
++ * Failures are reported through xenbus_dev_fatal().
++ *
++ * Returns 0 on success or the error from xenbus_gather() / usbif_map().
++ */
++static int connect_rings(usbif_t *usbif)
++{
++      struct xenbus_device *dev = usbif->xbdev;
++      unsigned long urb_ref;
++      unsigned long conn_ref;
++      unsigned int port;
++      int ret;
++
++      ret = xenbus_gather(XBT_NIL, dev->otherend,
++                          "urb-ring-ref", "%lu", &urb_ref,
++                          "conn-ring-ref", "%lu", &conn_ref,
++                          "event-channel", "%u", &port, NULL);
++      if (ret != 0) {
++              xenbus_dev_fatal(dev, ret,
++                               "reading %s/ring-ref and event-channel",
++                               dev->otherend);
++              return ret;
++      }
++
++      pr_info("usbback: urb-ring-ref %ld, conn-ring-ref %ld,"
++              " event-channel %d\n",
++              urb_ref, conn_ref, port);
++
++      ret = usbif_map(usbif, urb_ref, conn_ref, port);
++      if (ret != 0)
++              xenbus_dev_fatal(dev, ret,
++                              "mapping urb-ring-ref %lu conn-ring-ref %lu port %u",
++                              urb_ref, conn_ref, port);
++
++      return ret;
++}
++
++/*
++ * Xenbus otherend_changed callback: react to a state change of the
++ * frontend.
++ *
++ * @dev:            the backend device
++ * @frontend_state: the state the frontend has just entered
++ */
++static void frontend_changed(struct xenbus_device *dev,
++                                   enum xenbus_state frontend_state)
++{
++      usbif_t *usbif = dev_get_drvdata(&dev->dev);
++      int err;
++
++      switch (frontend_state) {
++      case XenbusStateInitialised:
++      case XenbusStateReconfiguring:
++      case XenbusStateReconfigured:
++              /* nothing to do for these states */
++              break;
++
++      case XenbusStateInitialising:
++              /* frontend restarted after we closed: go back to InitWait */
++              if (dev->state == XenbusStateClosed) {
++                      pr_info("%s: %s: prepare for reconnect\n",
++                              __FUNCTION__, dev->nodename);
++                      xenbus_switch_state(dev, XenbusStateInitWait);
++              }
++              break;
++
++      case XenbusStateConnected:
++              /* already connected on our side: ignore */
++              if (dev->state == XenbusStateConnected)
++                      break;
++              /*
++               * Map the rings, start the worker thread and watch the
++               * "port" subtree; only then announce Connected.  A failing
++               * step stops the sequence without undoing earlier steps
++               * here.
++               */
++              err = connect_rings(usbif);
++              if (err)
++                      break;
++              err = start_xenusbd(usbif);
++              if (err)
++                      break;
++              err = xenbus_watch_path2(dev, dev->nodename, "port",
++                                      &usbif->backend_watch, backend_changed);
++              if (err)
++                      break;
++              xenbus_switch_state(dev, XenbusStateConnected);
++              break;
++
++      case XenbusStateClosing:
++              usbif_disconnect(usbif);
++              xenbus_switch_state(dev, XenbusStateClosing);
++              break;
++
++      case XenbusStateClosed:
++              xenbus_switch_state(dev, XenbusStateClosed);
++              if (xenbus_dev_is_online(dev))
++                      break;
++              /* fall through if not online */
++      case XenbusStateUnknown:
++              device_unregister(&dev->dev);
++              break;
++
++      default:
++              xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++                               frontend_state);
++              break;
++      }
++}
++
++/* Xenbus device types handled by this backend; the empty entry ends the list. */
++static const struct xenbus_device_id usbback_ids[] = {
++      { "vusb" },
++      { "" },
++};
++
++/* Xenbus backend driver description registered by usbback_xenbus_init(). */
++static struct xenbus_driver usbback_driver = {
++      .name = "vusb",
++      .ids = usbback_ids,
++      .probe = usbback_probe,
++      .otherend_changed = frontend_changed,
++      .remove = usbback_remove,
++};
++
++/* Register the "vusb" backend driver; returns xenbus_register_backend()'s result. */
++int __init usbback_xenbus_init(void)
++{
++      return xenbus_register_backend(&usbback_driver);
++}
++
++/* Unregister the "vusb" backend driver. */
++void __exit usbback_xenbus_exit(void)
++{
++      xenbus_unregister_driver(&usbback_driver);
++}
diff --cc drivers/xen/usbfront/Makefile

index 0000000,0000000..034ba96

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbfront/Makefile
@@@ -1,0 -1,0 +1,11 @@@
++# Build xen-hcd.o according to CONFIG_XEN_USB_FRONTEND.
++obj-$(CONFIG_XEN_USB_FRONTEND) := xen-hcd.o
++
++# Objects that make up xen-hcd.
++xen-hcd-y   := usbfront-hcd.o xenbus.o
++
++# Define XENHCD_STATS for the compiler when the stats option is set to y.
++ifeq ($(CONFIG_XEN_USB_FRONTEND_HCD_STATS),y)
++EXTRA_CFLAGS += -DXENHCD_STATS
++endif
++
++# Define XENHCD_PM for the compiler when the PM option is set to y.
++ifeq ($(CONFIG_XEN_USB_FRONTEND_HCD_PM),y)
++EXTRA_CFLAGS += -DXENHCD_PM
++endif
diff --cc drivers/xen/usbfront/usbfront-dbg.c

index 0000000,0000000..647e3fe

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbfront/usbfront-dbg.c
@@@ -1,0 -1,0 +1,101 @@@
++/*
++ * usbfront-dbg.c
++ *
++ * Xen USB Virtual Host Controller - debugging
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++static ssize_t show_statistics(struct device *dev,
++                             struct device_attribute *attr, char *buf)
++{
++      struct usb_hcd *hcd;
++      struct usbfront_info *info;
++      unsigned long flags;
++      unsigned temp, size;
++      char *next;
++
++      hcd = dev_get_drvdata(dev);
++      info = hcd_to_info(hcd);
++      next = buf;
++      size = PAGE_SIZE;
++
++      spin_lock_irqsave(&info->lock, flags);
++
++      temp = scnprintf(next, size,
++                      "bus %s, device %s\n"
++                      "%s\n"
++                      "xenhcd, hcd state %d\n",
++                      hcd->self.controller->bus->name,
++                      dev_name(hcd->self.controller),
++                      hcd->product_desc,
++                      hcd->state);
++      size -= temp;
++      next += temp;
++
++#ifdef XENHCD_STATS
++      temp = scnprintf(next, size,
++              "complete %ld unlink %ld ring_full %ld\n",
++              info->stats.complete, info->stats.unlink,
++              info->stats.ring_full);
++      size -= temp;
++      next += temp;
++#endif
++
++      spin_unlock_irqrestore(&info->lock, flags);
++
++      return PAGE_SIZE - size;
++}
++
++static DEVICE_ATTR(statistics, S_IRUGO, show_statistics, NULL);
++
++/*
++ * Create the "statistics" attribute on the controller device of @info;
++ * a failure is only logged.
++ */
++static inline void create_debug_file(struct usbfront_info *info)
++{
++      struct usb_hcd *hcd = info_to_hcd(info);
++
++      if (device_create_file(hcd->self.controller, &dev_attr_statistics) != 0)
++              pr_warning("statistics file not created for %s\n",
++                         hcd->self.bus_name);
++}
++
++/* Remove the "statistics" attribute from the controller device of @info. */
++static inline void remove_debug_file(struct usbfront_info *info)
++{
++      struct usb_hcd *hcd = info_to_hcd(info);
++
++      device_remove_file(hcd->self.controller, &dev_attr_statistics);
++}
diff --cc drivers/xen/usbfront/usbfront-hcd.c

index 0000000,0000000..83c469b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbfront/usbfront-hcd.c
@@@ -1,0 -1,0 +1,232 @@@
++/*
++ * usbfront-hcd.c
++ *
++ * Xen USB Virtual Host Controller driver
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#include "usbfront.h"
++#include "usbfront-dbg.c"
++#include "usbfront-hub.c"
++#include "usbfront-q.c"
++
++/*
++ * Watchdog timer callback; @param carries the usbfront_info pointer.
++ *
++ * While the controller is running, marks the ring watchdog action as
++ * done, gives back unlinked URBs and kicks pending ones, all under
++ * info->lock.
++ */
++static void xenhcd_watchdog(unsigned long param)
++{
++      struct usbfront_info *info = (struct usbfront_info *) param;
++      unsigned long irqflags;
++
++      spin_lock_irqsave(&info->lock, irqflags);
++
++      if (unlikely(!HC_IS_RUNNING(info_to_hcd(info)->state)))
++              goto unlock;
++
++      timer_action_done(info, TIMER_RING_WATCHDOG);
++      xenhcd_giveback_unlinked_urbs(info);
++      xenhcd_kick_pending_urbs(info);
++
++unlock:
++      spin_unlock_irqrestore(&info->lock, irqflags);
++}
++
++/*
++ * One-time host controller initialisation (hc_driver .reset): set up the
++ * lock, the URB bookkeeping lists and the watchdog timer.
++ * Always returns 0.
++ */
++static int xenhcd_setup(struct usb_hcd *hcd)
++{
++      struct usbfront_info *info = hcd_to_info(hcd);
++
++      spin_lock_init(&info->lock);
++
++      init_timer(&info->watchdog);
++      info->watchdog.data = (unsigned long) info;
++      info->watchdog.function = xenhcd_watchdog;
++
++      INIT_LIST_HEAD(&info->giveback_waiting_list);
++      INIT_LIST_HEAD(&info->in_progress_list);
++      INIT_LIST_HEAD(&info->pending_unlink_list);
++      INIT_LIST_HEAD(&info->pending_submit_list);
++
++      return 0;
++}
++
++/*
++ * Start the host controller (hc_driver .start): select the new root-hub
++ * polling scheme with polling switched off, mark the controller running
++ * and create the debug attribute.  Always returns 0.
++ */
++static int xenhcd_run(struct usb_hcd *hcd)
++{
++      struct usbfront_info *info = hcd_to_info(hcd);
++
++      hcd->uses_new_polling = 1;
++      clear_bit(HCD_FLAG_POLL_RH, &hcd->flags);
++      hcd->state = HC_STATE_RUNNING;
++
++      create_debug_file(info);
++
++      return 0;
++}
++
++/*
++ * Stop the host controller (hc_driver .stop).
++ *
++ * The watchdog timer is stopped and the debug attribute removed before
++ * info->lock is taken; under the lock the controller is marked halted
++ * and all enqueued URBs are cancelled and given back.
++ */
++static void xenhcd_stop(struct usb_hcd *hcd)
++{
++      struct usbfront_info *info = hcd_to_info(hcd);
++
++      del_timer_sync(&info->watchdog);
++      remove_debug_file(info);
++      spin_lock_irq(&info->lock);
++      /* cancel all urbs */
++      hcd->state = HC_STATE_HALT;
++      xenhcd_cancel_all_enqueued_urbs(info);
++      xenhcd_giveback_unlinked_urbs(info);
++      spin_unlock_irq(&info->lock);
++}
++
++/*
++ * hc_driver .urb_enqueue: allocate the per-URB private data and submit
++ * the URB, all under info->lock.
++ *
++ * A zero return is a promise that the URB will be given back later.
++ * Returns -ENOMEM if the private data cannot be allocated, otherwise the
++ * result of xenhcd_submit_urb(); on a submit error the private data is
++ * freed again.
++ */
++static int xenhcd_urb_enqueue(struct usb_hcd *hcd,
++                                  struct urb *urb,
++                                  gfp_t mem_flags)
++{
++      struct usbfront_info *info = hcd_to_info(hcd);
++      struct urb_priv *priv;
++      unsigned long irqflags;
++      int rc = 0;
++
++      spin_lock_irqsave(&info->lock, irqflags);
++
++      priv = alloc_urb_priv(urb);
++      if (priv) {
++              priv->status = 1;
++              rc = xenhcd_submit_urb(info, priv);
++              if (rc != 0)
++                      free_urb_priv(priv);
++      } else {
++              rc = -ENOMEM;
++      }
++
++      spin_unlock_irqrestore(&info->lock, irqflags);
++      return rc;
++}
++
++/*
++ * hc_driver .urb_dequeue: record @status in the URB's private data and
++ * unlink the URB, under info->lock.
++ *
++ * Returns 0 when the URB has no private data, otherwise the result of
++ * xenhcd_unlink_urb().
++ */
++static int xenhcd_urb_dequeue(struct usb_hcd *hcd,
++                            struct urb *urb, int status)
++{
++      struct usbfront_info *info = hcd_to_info(hcd);
++      struct urb_priv *priv;
++      unsigned long irqflags;
++      int rc = 0;
++
++      spin_lock_irqsave(&info->lock, irqflags);
++
++      priv = urb->hcpriv;
++      if (priv) {
++              priv->status = status;
++              rc = xenhcd_unlink_urb(info, priv);
++      }
++
++      spin_unlock_irqrestore(&info->lock, irqflags);
++      return rc;
++}
++
++/*
++ * hc_driver .get_frame_number, reached from
++ * usb_get_current_frame_number(); hardly any driver uses that function.
++ * No frame number is tracked here, so 0 is always returned.
++ */
++static int xenhcd_get_frame(struct usb_hcd *hcd)
++{
++      /* it means error, but probably no problem :-) */
++      return 0;
++}
++
++/* Driver description string shared by both hc_driver variants below. */
++static const char hcd_name[] = "xen_hcd";
++
++/*
++ * hc_driver for the virtual USB 2.0 host controller.  It differs from
++ * xen_usb11_hc_driver only in product_desc and flags.  The bus
++ * suspend/resume hooks are present only when both XENHCD_PM and
++ * CONFIG_PM are defined.
++ */
++struct hc_driver xen_usb20_hc_driver = {
++      .description = hcd_name,
++      .product_desc = "Xen USB2.0 Virtual Host Controller",
++      .hcd_priv_size = sizeof(struct usbfront_info),
++      .flags = HCD_USB2,
++
++      /* basic HC lifecycle operations */
++      .reset = xenhcd_setup,
++      .start = xenhcd_run,
++      .stop = xenhcd_stop,
++
++      /* managing urb I/O */
++      .urb_enqueue = xenhcd_urb_enqueue,
++      .urb_dequeue = xenhcd_urb_dequeue,
++      .get_frame_number = xenhcd_get_frame,
++
++      /* root hub operations */
++      .hub_status_data = xenhcd_hub_status_data,
++      .hub_control = xenhcd_hub_control,
++#ifdef XENHCD_PM
++#ifdef CONFIG_PM
++      .bus_suspend = xenhcd_bus_suspend,
++      .bus_resume = xenhcd_bus_resume,
++#endif
++#endif
++};
++
++/*
++ * hc_driver for the virtual USB 1.1 host controller.  It differs from
++ * xen_usb20_hc_driver only in product_desc and flags.  The bus
++ * suspend/resume hooks are present only when both XENHCD_PM and
++ * CONFIG_PM are defined.
++ */
++struct hc_driver xen_usb11_hc_driver = {
++      .description = hcd_name,
++      .product_desc = "Xen USB1.1 Virtual Host Controller",
++      .hcd_priv_size = sizeof(struct usbfront_info),
++      .flags = HCD_USB11,
++
++      /* basic HC lifecycle operations */
++      .reset = xenhcd_setup,
++      .start = xenhcd_run,
++      .stop = xenhcd_stop,
++
++      /* managing urb I/O */
++      .urb_enqueue = xenhcd_urb_enqueue,
++      .urb_dequeue = xenhcd_urb_dequeue,
++      .get_frame_number = xenhcd_get_frame,
++
++      /* root hub operations */
++      .hub_status_data = xenhcd_hub_status_data,
++      .hub_control = xenhcd_hub_control,
++#ifdef XENHCD_PM
++#ifdef CONFIG_PM
++      .bus_suspend = xenhcd_bus_suspend,
++      .bus_resume = xenhcd_bus_resume,
++#endif
++#endif
++};
diff --cc drivers/xen/usbfront/usbfront-hub.c

index 0000000,0000000..1a0bfa3

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbfront/usbfront-hub.c
@@@ -1,0 -1,0 +1,471 @@@
++/*
++ * usbfront-hub.c
++ *
++ * Xen USB Virtual Host Controller - Root Hub Emulations
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++/*
++ * Refresh the connection-related bits of a root-hub port from the speed
++ * recorded for the device behind it, and flag a connection change.
++ * Nothing happens while the port is unpowered or if the recorded speed
++ * is not one of UNKNOWN/LOW/FULL/HIGH.
++ *
++ * @portnum is 1-based and is not range-checked here.
++ */
++void set_connect_state(struct usbfront_info *info, int portnum)
++{
++      int idx = portnum - 1;
++
++      if (!(info->ports[idx].status & USB_PORT_STAT_POWER))
++              return;
++
++      switch (info->devices[idx].speed) {
++      case USB_SPEED_UNKNOWN:
++              /* nothing attached: drop every connection-dependent bit */
++              info->ports[idx].status &=
++                      ~(USB_PORT_STAT_CONNECTION |
++                        USB_PORT_STAT_ENABLE |
++                        USB_PORT_STAT_LOW_SPEED |
++                        USB_PORT_STAT_HIGH_SPEED |
++                        USB_PORT_STAT_SUSPEND);
++              break;
++      case USB_SPEED_LOW:
++              info->ports[idx].status |=
++                      USB_PORT_STAT_CONNECTION | USB_PORT_STAT_LOW_SPEED;
++              break;
++      case USB_SPEED_FULL:
++              info->ports[idx].status |= USB_PORT_STAT_CONNECTION;
++              break;
++      case USB_SPEED_HIGH:
++              info->ports[idx].status |=
++                      USB_PORT_STAT_CONNECTION | USB_PORT_STAT_HIGH_SPEED;
++              break;
++      default: /* error */
++              return;
++      }
++
++      /* change bits live in the upper 16 bits of the status word */
++      info->ports[idx].status |= (USB_PORT_STAT_C_CONNECTION << 16);
++}
++
++/*
++ * Record a (dis)connection of the virtual device on root-hub port
++ * @portnum (1-based).  USB_SPEED_UNKNOWN means "disconnected".
++ * Out-of-range port numbers, unsupported speeds and calls that do not
++ * change the recorded speed are ignored.
++ */
++void rhport_connect(struct usbfront_info *info,
++                              int portnum, enum usb_device_speed speed)
++{
++      int idx;
++
++      if (portnum < 1 || portnum > info->rh_numports)
++              return; /* invalid port number */
++
++      idx = portnum - 1;
++      if (info->devices[idx].speed == speed)
++              return; /* nothing changed */
++
++      switch (speed) {
++      case USB_SPEED_UNKNOWN: /* disconnect */
++              info->devices[idx].status = USB_STATE_NOTATTACHED;
++              break;
++      case USB_SPEED_LOW:
++      case USB_SPEED_FULL:
++      case USB_SPEED_HIGH:
++              info->devices[idx].status = USB_STATE_ATTACHED;
++              break;
++      default: /* error */
++              return;
++      }
++
++      info->devices[idx].speed = speed;
++      info->ports[idx].c_connection = 1;
++
++      set_connect_state(info, portnum);
++}
++
++/*
++ * SetPortFeature(PORT_SUSPEND): mark both the port (1-based @portnum)
++ * and the device behind it as suspended.
++ */
++void rhport_suspend(struct usbfront_info *info, int portnum)
++{
++      int idx = portnum - 1;
++
++      info->devices[idx].status = USB_STATE_SUSPENDED;
++      info->ports[idx].status |= USB_PORT_STAT_SUSPEND;
++}
++
++/*
++ * ClearPortFeature(PORT_SUSPEND): start resuming a suspended port.
++ * Only the "resuming" flag and a 20 msec deadline are recorded here.
++ */
++void rhport_resume(struct usbfront_info *info, int portnum)
++{
++      int idx = portnum - 1;
++
++      if (!(info->ports[idx].status & USB_PORT_STAT_SUSPEND))
++              return;
++
++      info->ports[idx].resuming = 1;
++      info->ports[idx].timeout = jiffies + msecs_to_jiffies(20);
++}
++
++/*
++ * SetPortFeature(PORT_POWER): power up a port that is currently off.
++ * An attached device moves to the POWERED state, and a connection event
++ * recorded while the port was off is now reflected in the port status.
++ */
++void rhport_power_on(struct usbfront_info *info, int portnum)
++{
++      int idx = portnum - 1;
++
++      if (info->ports[idx].status & USB_PORT_STAT_POWER)
++              return; /* already powered */
++
++      info->ports[idx].status |= USB_PORT_STAT_POWER;
++
++      if (info->devices[idx].status != USB_STATE_NOTATTACHED)
++              info->devices[idx].status = USB_STATE_POWERED;
++
++      if (info->ports[idx].c_connection)
++              set_connect_state(info, portnum);
++}
++
++/*
++ * ClearPortFeature(PORT_POWER)
++ * SetConfiguration(non-zero)
++ * Power_Source_Off
++ * Over-current
++ *
++ * Power down a powered port: the whole port status word is cleared and
++ * an attached device falls back to the ATTACHED state.
++ */
++void rhport_power_off(struct usbfront_info *info, int portnum)
++{
++      int idx = portnum - 1;
++
++      if (!(info->ports[idx].status & USB_PORT_STAT_POWER))
++              return; /* already off */
++
++      info->ports[idx].status = 0;
++
++      if (info->devices[idx].status != USB_STATE_NOTATTACHED)
++              info->devices[idx].status = USB_STATE_ATTACHED;
++}
++
++/*
++ * ClearPortFeature(PORT_ENABLE): disable the port, cancelling any
++ * suspend or pending resume; an attached device returns to POWERED.
++ */
++void rhport_disable(struct usbfront_info *info, int portnum)
++{
++      int idx = portnum - 1;
++
++      info->ports[idx].status &=
++              ~(USB_PORT_STAT_ENABLE | USB_PORT_STAT_SUSPEND);
++      info->ports[idx].resuming = 0;
++
++      if (info->devices[idx].status != USB_STATE_NOTATTACHED)
++              info->devices[idx].status = USB_STATE_POWERED;
++}
++
++/*
++ * SetPortFeature(PORT_RESET): begin a port reset.  The enable and speed
++ * bits are dropped, RESET is raised and a 10 msec deadline is stored in
++ * the port's timeout field.
++ */
++void rhport_reset(struct usbfront_info *info, int portnum)
++{
++      int idx = portnum - 1;
++
++      info->ports[idx].status &= ~(USB_PORT_STAT_ENABLE |
++                                   USB_PORT_STAT_LOW_SPEED |
++                                   USB_PORT_STAT_HIGH_SPEED);
++      info->ports[idx].status |= USB_PORT_STAT_RESET;
++
++      if (info->devices[idx].status != USB_STATE_NOTATTACHED)
++              info->devices[idx].status = USB_STATE_ATTACHED;
++
++      /* 10msec reset signaling */
++      info->ports[idx].timeout = jiffies + msecs_to_jiffies(10);
++}
++
++#ifdef XENHCD_PM
++#ifdef CONFIG_PM
++/*
++ * Suspend every root-hub port and stop the watchdog timer.
++ * Returns 0 on success or -ESHUTDOWN if the controller is not
++ * accessible; the watchdog is stopped in both cases.
++ */
++static int xenhcd_bus_suspend(struct usb_hcd *hcd)
++{
++      struct usbfront_info *info = hcd_to_info(hcd);
++      int nports = info->rh_numports;
++      int ret = -ESHUTDOWN;
++      int portnum;
++
++      spin_lock_irq(&info->lock);
++      if (test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
++              /* suspend any active ports */
++              for (portnum = 1; portnum <= nports; portnum++)
++                      rhport_suspend(info, portnum);
++              ret = 0;
++      }
++      spin_unlock_irq(&info->lock);
++
++      del_timer_sync(&info->watchdog);
++
++      return ret;
++}
++
++/*
++ * Start resuming every suspended root-hub port.
++ * Returns 0 on success or -ESHUTDOWN if the controller is not
++ * accessible.
++ */
++static int xenhcd_bus_resume(struct usb_hcd *hcd)
++{
++      struct usbfront_info *info = hcd_to_info(hcd);
++      int nports = info->rh_numports;
++      int ret = -ESHUTDOWN;
++      int portnum;
++
++      spin_lock_irq(&info->lock);
++      if (test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
++              /* resume any suspended ports */
++              for (portnum = 1; portnum <= nports; portnum++)
++                      rhport_resume(info, portnum);
++              ret = 0;
++      }
++      spin_unlock_irq(&info->lock);
++
++      return ret;
++}
++#endif
++#endif
++
++/*
++ * Fill in the hub descriptor of the emulated root hub.
++ * The DeviceRemovable bitmap is zeroed and the bitmap that follows it
++ * (PortPwrCtrlMask) is set to all ones; each is 1 + ports/8 bytes long.
++ */
++static void xenhcd_hub_descriptor(struct usbfront_info *info,
++                                struct usb_hub_descriptor *desc)
++{
++      int nports = info->rh_numports;
++      /* size of DeviceRemovable and PortPwrCtrlMask fields */
++      u16 bitmap_bytes = 1 + (nports / 8);
++
++      desc->bDescriptorType = 0x29;
++      desc->bDescLength = 7 + 2 * bitmap_bytes;
++      desc->bNbrPorts = nports;
++      desc->bPwrOn2PwrGood = 10; /* EHCI says 20ms max */
++      desc->bHubContrCurrent = 0;
++
++      /* per-port over current reporting and no power switching */
++      desc->wHubCharacteristics = cpu_to_le16(0x000a);
++
++      /* bitmaps for DeviceRemovable and PortPwrCtrlMask */
++      memset(&desc->u.hs.DeviceRemovable[0], 0, bitmap_bytes);
++      memset(&desc->u.hs.DeviceRemovable[bitmap_bytes], 0xff, bitmap_bytes);
++}
++
++/* port status change mask for hub_status_data */
++#define PORT_C_MASK \
++      ((USB_PORT_STAT_C_CONNECTION \
++      | USB_PORT_STAT_C_ENABLE \
++      | USB_PORT_STAT_C_SUSPEND \
++      | USB_PORT_STAT_C_OVERCURRENT \
++      | USB_PORT_STAT_C_RESET) << 16)
++
++/*
++ * Build the Hub and Port Status Change Bitmap (USB 2.0 spec, 11.12.4).
++ *
++ * The first 1 + ports/8 bytes of @buf are cleared, then bit (N) is set
++ * for every 1-based port N whose status has a change bit pending
++ * (bit 0 is left for the hub itself).
++ *
++ * Returns the bitmap length in bytes if any port changed, otherwise 0.
++ * Also returns 0, without touching @buf, when the HC is not running.
++ */
++static int xenhcd_hub_status_data(struct usb_hcd *hcd, char *buf)
++{
++      struct usbfront_info *info = hcd_to_info(hcd);
++      unsigned long flags;
++      int nports, nbytes, idx;
++      int cleared = 0;
++      int any_change = 0;
++
++      if (!HC_IS_RUNNING(hcd->state))
++              return 0;
++
++      /* initialize the status to no-changes */
++      nports = info->rh_numports;
++      nbytes = 1 + (nports / 8);
++      for (idx = 0; idx < nbytes; idx++, cleared++)
++              buf[idx] = 0;
++
++      spin_lock_irqsave(&info->lock, flags);
++
++      for (idx = 0; idx < nports; idx++) {
++              /* skip ports without a pending change bit */
++              if (!(info->ports[idx].status & PORT_C_MASK))
++                      continue;
++
++              any_change = 1;
++              if (idx < 7)
++                      buf[0] |= 1 << (idx + 1);
++              else if (idx < 15)
++                      buf[1] |= 1 << (idx - 7);
++              else if (idx < 23)
++                      buf[2] |= 1 << (idx - 15);
++              else
++                      buf[3] |= 1 << (idx - 23);
++      }
++
++      spin_unlock_irqrestore(&info->lock, flags);
++
++      return any_change ? cleared : 0;
++}
++
++/*
++ * Handle a control request addressed to the emulated root hub.
++ *
++ * @typeReq selects the hub-class request, @wValue the feature selector
++ * and @wIndex the 1-based port number for port requests.  @buf receives
++ * the reply for GetHubDescriptor, GetHubStatus and GetPortStatus.
++ * @wLength is not consulted.
++ *
++ * Returns 0 on success, or -EPIPE for an unsupported request or an
++ * out-of-range port number.  After handling the request, the root-hub
++ * status is polled if any port has a change bit pending.
++ */
++static int xenhcd_hub_control(struct usb_hcd *hcd,
++                             u16 typeReq,
++                             u16 wValue,
++                             u16 wIndex,
++                             char *buf,
++                             u16 wLength)
++{
++      struct usbfront_info *info = hcd_to_info(hcd);
++      int ports = info->rh_numports;
++      unsigned long flags;
++      int ret = 0;
++      int i;
++      int changed = 0;
++
++      spin_lock_irqsave(&info->lock, flags);
++      switch (typeReq) {
++      case ClearHubFeature:
++              /* ignore this request */
++              break;
++      case ClearPortFeature:
++              if (!wIndex || wIndex > ports)
++                      goto error;
++
++              switch (wValue) {
++              case USB_PORT_FEAT_SUSPEND:
++                      rhport_resume(info, wIndex);
++                      break;
++              case USB_PORT_FEAT_POWER:
++                      rhport_power_off(info, wIndex);
++                      break;
++              case USB_PORT_FEAT_ENABLE:
++                      rhport_disable(info, wIndex);
++                      break;
++              case USB_PORT_FEAT_C_CONNECTION:
++                      info->ports[wIndex-1].c_connection = 0;
++                      /* falling through */
++              default:
++                      /*
++                       * The feature selector is used directly as a bit
++                       * number in the status word.
++                       * NOTE(review): wValue is not range-checked before
++                       * the shift - confirm callers never pass >= 32.
++                       */
++                      info->ports[wIndex-1].status &= ~(1 << wValue);
++                      break;
++              }
++              break;
++      case GetHubDescriptor:
++              xenhcd_hub_descriptor(info,
++                                    (struct usb_hub_descriptor *) buf);
++              break;
++      case GetHubStatus:
++              /* always local power supply good and no over-current exists. */
++              *(__le32 *)buf = cpu_to_le32(0);
++              break;
++      case GetPortStatus:
++              if (!wIndex || wIndex > ports)
++                      goto error;
++
++              /* from here on wIndex is a 0-based array index */
++              wIndex--;
++
++              /*
++               * resume completion
++               * NOTE(review): "resuming" is not cleared here; within this
++               * file only rhport_disable() resets it - confirm intended.
++               */
++              if (info->ports[wIndex].resuming &&
++                      time_after_eq(jiffies, info->ports[wIndex].timeout)) {
++                      info->ports[wIndex].status |= (USB_PORT_STAT_C_SUSPEND << 16);
++                      info->ports[wIndex].status &= ~USB_PORT_STAT_SUSPEND;
++              }
++
++              /* reset completion */
++              if ((info->ports[wIndex].status & USB_PORT_STAT_RESET) != 0 &&
++                      time_after_eq(jiffies, info->ports[wIndex].timeout)) {
++                      info->ports[wIndex].status |= (USB_PORT_STAT_C_RESET << 16);
++                      info->ports[wIndex].status &= ~USB_PORT_STAT_RESET;
++
++                      if (info->devices[wIndex].status != USB_STATE_NOTATTACHED) {
++                              info->ports[wIndex].status |= USB_PORT_STAT_ENABLE;
++                              info->devices[wIndex].status = USB_STATE_DEFAULT;
++                      }
++
++                      /* restore the speed bits dropped by rhport_reset() */
++                      switch (info->devices[wIndex].speed) {
++                      case USB_SPEED_LOW:
++                              info->ports[wIndex].status |= USB_PORT_STAT_LOW_SPEED;
++                              break;
++                      case USB_SPEED_HIGH:
++                              info->ports[wIndex].status |= USB_PORT_STAT_HIGH_SPEED;
++                              break;
++                      default:
++                              break;
++                      }
++              }
++
++              /* low 16 bits: port status, high 16 bits: change bits */
++              ((u16 *) buf)[0] = cpu_to_le16 (info->ports[wIndex].status);
++              ((u16 *) buf)[1] = cpu_to_le16 (info->ports[wIndex].status >> 16);
++              break;
++      case SetHubFeature:
++              /* not supported */
++              goto error;
++      case SetPortFeature:
++              if (!wIndex || wIndex > ports)
++                      goto error;
++
++              switch (wValue) {
++              case USB_PORT_FEAT_POWER:
++                      rhport_power_on(info, wIndex);
++                      break;
++              case USB_PORT_FEAT_RESET:
++                      rhport_reset(info, wIndex);
++                      break;
++              case USB_PORT_FEAT_SUSPEND:
++                      rhport_suspend(info, wIndex);
++                      break;
++              default:
++                      /* other features are only latched on a powered port */
++                      if ((info->ports[wIndex-1].status & USB_PORT_STAT_POWER) != 0)
++                              info->ports[wIndex-1].status |= (1 << wValue);
++              }
++              break;
++
++      default:
++error:
++              ret = -EPIPE;
++      }
++      spin_unlock_irqrestore(&info->lock, flags);
++
++      /*
++       * check status for each port
++       * NOTE(review): this scan reads ports[].status after info->lock
++       * has been released.
++       */
++      for (i = 0; i < ports; i++) {
++              if (info->ports[i].status & PORT_C_MASK)
++                      changed = 1;
++      }
++      if (changed)
++              usb_hcd_poll_rh_status(hcd);
++
++      return ret;
++}
diff --cc drivers/xen/usbfront/usbfront-q.c

index 0000000,0000000..90dd57f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbfront/usbfront-q.c
@@@ -1,0 -1,0 +1,542 @@@
++/*
++ * usbfront-q.c
++ *
++ * Xen USB Virtual Host Controller - RING operations.
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++struct kmem_cache *xenhcd_urbp_cachep;
++
++/*
++ * Allocate (GFP_ATOMIC, zeroed) the driver-private bookkeeping for @urb
++ * and link the two together.  Both request ids start out as ~0.
++ * Returns NULL if the allocation fails.
++ */
++static struct urb_priv *alloc_urb_priv(struct urb *urb)
++{
++      struct urb_priv *priv;
++
++      priv = kmem_cache_zalloc(xenhcd_urbp_cachep, GFP_ATOMIC);
++      if (priv) {
++              INIT_LIST_HEAD(&priv->list);
++              priv->req_id = ~0;
++              priv->unlink_req_id = ~0;
++              priv->urb = urb;
++              urb->hcpriv = priv;
++      }
++
++      return priv;
++}
++
++/*
++ * Detach @urbp from its urb (hcpriv is cleared) and return it to the
++ * slab cache.  @urbp must not be dereferenced afterwards.
++ */
++static void free_urb_priv(struct urb_priv *urbp)
++{
++      struct urb *owner = urbp->urb;
++
++      owner->hcpriv = NULL;
++      kmem_cache_free(xenhcd_urbp_cachep, urbp);
++}
++
++/*
++ * Pop the head of the shadow free list and return its index.
++ * The list is threaded through shadow[].req.id; BUGs if the head index
++ * is not below USB_URB_RING_SIZE.
++ */
++static inline int get_id_from_freelist(
++      struct usbfront_info *info)
++{
++      unsigned long head = info->shadow_free;
++
++      BUG_ON(head >= USB_URB_RING_SIZE);
++      info->shadow_free = info->shadow[head].req.id;
++      info->shadow[head].req.id = (unsigned int)0x0fff; /* debug */
++
++      return head;
++}
++
++/*
++ * Push shadow entry @id back onto the head of the free list and drop
++ * its urb pointer.
++ */
++static inline void add_id_to_freelist(
++      struct usbfront_info *info, unsigned long id)
++{
++      info->shadow[id].urb = NULL;
++      info->shadow[id].req.id = info->shadow_free;
++      info->shadow_free = id;
++}
++
++/*
++ * Number of pages touched by the @length bytes starting at @addr.
++ */
++static inline int count_pages(void *addr, int length)
++{
++      unsigned long first_pfn = (unsigned long) addr >> PAGE_SHIFT;
++      unsigned long end_pfn;
++
++      /* round the end address up to the next page boundary */
++      end_pfn = (unsigned long) (addr + length + PAGE_SIZE - 1) >> PAGE_SHIFT;
++
++      return end_pfn - first_pfn;
++}
++
++/*
++ * Grant the backend access to the buffer [@addr, @addr + @length) one
++ * page at a time, filling seg[0..nr_pages-1] with the grant reference,
++ * in-page offset and byte count of each piece.
++ *
++ * Grant references are claimed from the pool at @gref_head; running
++ * out of them, or running out of bytes before @nr_pages segments are
++ * filled, is a BUG.  @flags is passed through to the grant call.
++ */
++static inline void xenhcd_gnttab_map(struct usbfront_info *info,
++              void *addr, int length, grant_ref_t *gref_head,
++              struct usbif_request_segment *seg, int nr_pages, int flags)
++{
++      unsigned int remaining = length;
++      int idx;
++
++      for (idx = 0; idx < nr_pages; idx++) {
++              unsigned long pfn;
++              unsigned int off, chunk;
++              grant_ref_t gref;
++
++              BUG_ON(!remaining);
++
++              pfn = page_to_phys(virt_to_page(addr)) >> PAGE_SHIFT;
++              off = offset_in_page(addr);
++
++              /* never cross a page boundary, never exceed what is left */
++              chunk = PAGE_SIZE - off;
++              if (chunk > remaining)
++                      chunk = remaining;
++
++              gref = gnttab_claim_grant_reference(gref_head);
++              BUG_ON(gref == -ENOSPC);
++              gnttab_grant_foreign_access_ref(gref, info->xbdev->otherend_id, pfn, flags);
++
++              seg[idx].gref = gref;
++              seg[idx].offset = (uint16_t)off;
++              seg[idx].length = (uint16_t)chunk;
++
++              addr += chunk;
++              remaining -= chunk;
++      }
++}
++
++/*
++ * Translate @urb into the ring request @req.
++ *
++ * The transfer buffer (and, for isochronous pipes, the frame descriptor
++ * array) is granted to the backend page by page; the resulting segments
++ * are stored in req->seg[], buffer segments first.
++ *
++ * Returns 0 on success, -E2BIG if more than
++ * USBIF_MAX_SEGMENTS_PER_REQUEST segments would be needed, -ENOMEM if
++ * no grant references could be allocated, or -EINVAL for an unknown
++ * pipe type.
++ *
++ * The grant reference pool is allocated only when the urb carries data,
++ * so it is released exactly once at the end and only if it was actually
++ * allocated.  Previously an isochronous urb with an empty transfer
++ * buffer freed an uninitialised gref_head.
++ */
++static int map_urb_for_request(struct usbfront_info *info, struct urb *urb,
++              usbif_urb_request_t *req)
++{
++      grant_ref_t gref_head;
++      int have_grefs = 0;
++      int nr_buff_pages = 0;
++      int nr_isodesc_pages = 0;
++      int ret = 0;
++
++      if (urb->transfer_buffer_length) {
++              nr_buff_pages = count_pages(urb->transfer_buffer, urb->transfer_buffer_length);
++
++              if (usb_pipeisoc(urb->pipe))
++                      nr_isodesc_pages = count_pages(&urb->iso_frame_desc[0],
++                                      sizeof(struct usb_iso_packet_descriptor) * urb->number_of_packets);
++
++              if (nr_buff_pages + nr_isodesc_pages > USBIF_MAX_SEGMENTS_PER_REQUEST)
++                      return -E2BIG;
++
++              ret = gnttab_alloc_grant_references(USBIF_MAX_SEGMENTS_PER_REQUEST, &gref_head);
++              if (ret) {
++                      pr_err("usbfront: gnttab_alloc_grant_references() error\n");
++                      return -ENOMEM;
++              }
++              have_grefs = 1;
++
++              /* IN transfers are written by the backend, OUT are read-only */
++              xenhcd_gnttab_map(info, urb->transfer_buffer,
++                              urb->transfer_buffer_length,
++                              &gref_head, &req->seg[0], nr_buff_pages,
++                              usb_pipein(urb->pipe) ? 0 : GTF_readonly);
++      }
++
++      req->pipe = usbif_setportnum_pipe(urb->pipe, urb->dev->portnum);
++      req->transfer_flags = urb->transfer_flags;
++      req->buffer_length = urb->transfer_buffer_length;
++      req->nr_buffer_segs = nr_buff_pages;
++
++      switch (usb_pipetype(urb->pipe)) {
++      case PIPE_ISOCHRONOUS:
++              req->u.isoc.interval = urb->interval;
++              req->u.isoc.start_frame = urb->start_frame;
++              req->u.isoc.number_of_packets = urb->number_of_packets;
++              req->u.isoc.nr_frame_desc_segs = nr_isodesc_pages;
++              /* urb->number_of_packets must be > 0 */
++              if (unlikely(urb->number_of_packets <= 0))
++                      BUG();
++              /*
++               * nr_isodesc_pages is 0 when no pool was allocated, so
++               * gref_head is not touched in that case.
++               */
++              xenhcd_gnttab_map(info, &urb->iso_frame_desc[0],
++                      sizeof(struct usb_iso_packet_descriptor) * urb->number_of_packets,
++                      &gref_head, &req->seg[nr_buff_pages], nr_isodesc_pages, 0);
++              break;
++      case PIPE_INTERRUPT:
++              req->u.intr.interval = urb->interval;
++              break;
++      case PIPE_CONTROL:
++              if (urb->setup_packet)
++                      memcpy(req->u.ctrl, urb->setup_packet, 8);
++              break;
++      case PIPE_BULK:
++              break;
++      default:
++              ret = -EINVAL;
++      }
++
++      /* hand the unclaimed references of the pool back */
++      if (have_grefs)
++              gnttab_free_grant_references(gref_head);
++
++      return ret;
++}
++
++/*
++ * End foreign access for every segment recorded in @shadow's saved
++ * request (buffer segments plus, for isochronous pipes, the frame
++ * descriptor segments) and reset both segment counters to zero.
++ */
++static void xenhcd_gnttab_done(struct usb_shadow *shadow)
++{
++      int total = shadow->req.nr_buffer_segs;
++      int seg;
++
++      if (usb_pipeisoc(shadow->req.pipe))
++              total +=  shadow->req.u.isoc.nr_frame_desc_segs;
++
++      for (seg = 0; seg < total; seg++)
++              gnttab_end_foreign_access(shadow->req.seg[seg].gref, 0UL);
++
++      shadow->req.nr_buffer_segs = 0;
++      shadow->req.u.isoc.nr_frame_desc_segs = 0;
++}
++
++/*
++ * Complete @urb and hand it back to the USB core.
++ *
++ * The urb's private data is unlinked from whatever list it is on and
++ * freed.  @status becomes the urb status only if the urb is still
++ * -EINPROGRESS, i.e. it was not unlinked in the meantime.
++ *
++ * Called with info->lock held; the lock is dropped around
++ * usb_hcd_giveback_urb() and re-taken before returning.
++ *
++ * The private status is copied to a local before the private data is
++ * freed; it used to be read from the freed object (use-after-free).
++ */
++static void xenhcd_giveback_urb(struct usbfront_info *info, struct urb *urb, int status)
++__releases(info->lock)
++__acquires(info->lock)
++{
++      struct urb_priv *urbp = (struct urb_priv *) urb->hcpriv;
++      int priv_status;
++
++      list_del_init(&urbp->list);
++      /* take what is still needed from urbp before it goes away */
++      priv_status = urbp->status;
++      free_urb_priv(urbp);
++      switch (urb->status) {
++      case -ECONNRESET:
++      case -ENOENT:
++              COUNT(info->stats.unlink);
++              break;
++      case -EINPROGRESS:
++              urb->status = status;
++              /* falling through */
++      default:
++              COUNT(info->stats.complete);
++      }
++      spin_unlock(&info->lock);
++      usb_hcd_giveback_urb(info_to_hcd(info), urb,
++                           priv_status <= 0 ? priv_status : urb->status);
++      spin_lock(&info->lock);
++}
++
++/*
++ * Place one request for @urbp on the urb ring and notify the backend.
++ *
++ * If the urb has been marked unlinked, an unlink request naming the
++ * original request id is sent; otherwise the urb is mapped and
++ * submitted.  Returns 0 on success or the negative error from
++ * map_urb_for_request(), in which case nothing is queued.
++ *
++ * The ring is not checked for free space here; the callers in this
++ * file test RING_FULL() before calling.
++ */
++static inline int xenhcd_do_request(struct usbfront_info *info, struct urb_priv *urbp)
++{
++      usbif_urb_request_t *req;
++      struct urb *urb = urbp->urb;
++      uint16_t id;
++      int notify;
++      int ret = 0;
++
++      /* next private producer slot; it becomes visible on the push below */
++      req = RING_GET_REQUEST(&info->urb_ring, info->urb_ring.req_prod_pvt);
++      id = get_id_from_freelist(info);
++      req->id = id;
++
++      if (unlikely(urbp->unlinked)) {
++              req->u.unlink.unlink_id = urbp->req_id;
++              req->pipe = usbif_setunlink_pipe(usbif_setportnum_pipe(
++                              urb->pipe, urb->dev->portnum));
++              urbp->unlink_req_id = id;
++      } else {
++              ret = map_urb_for_request(info, urb, req);
++              if (ret < 0) {
++                      /* give the id back; the ring slot was not consumed */
++                      add_id_to_freelist(info, id);
++                      return ret;
++              }
++              urbp->req_id = id;
++      }
++
++      info->urb_ring.req_prod_pvt++;
++      /* keep a private copy of the request for completion handling */
++      info->shadow[id].urb = urb;
++      info->shadow[id].req = *req;
++
++      RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->urb_ring, notify);
++      if (notify)
++              notify_remote_via_irq(info->irq);
++
++      return ret;
++}
++
++/*
++ * Submit urbs waiting on pending_submit_list, oldest first, for as long
++ * as the ring has room.  Urbs that cannot be turned into a request are
++ * given back with -ESHUTDOWN.  If the ring fills up, the ring watchdog
++ * is armed and the remaining urbs stay queued.
++ */
++static void xenhcd_kick_pending_urbs(struct usbfront_info *info)
++{
++      struct urb_priv *next;
++
++      while (!list_empty(&info->pending_submit_list)) {
++              if (RING_FULL(&info->urb_ring)) {
++                      COUNT(info->stats.ring_full);
++                      timer_action(info, TIMER_RING_WATCHDOG);
++                      return;
++              }
++
++              next = list_entry(info->pending_submit_list.next, struct urb_priv, list);
++              if (xenhcd_do_request(info, next) == 0)
++                      list_move_tail(&next->list, &info->in_progress_list);
++              else
++                      xenhcd_giveback_urb(info, next->urb, -ESHUTDOWN);
++      }
++
++      /* list drained: the pending scan is no longer needed */
++      timer_action_done(info, TIMER_SCAN_PENDING_URBS);
++}
++
++/*
++ * Give back every urb that is in progress or still waiting to be
++ * submitted.
++ *
++ * In-progress urbs that were not already unlinked have their grants
++ * ended and are completed with -ESHUTDOWN, or with their own status if
++ * they had been dequeued.  Pending urbs are completed with -ESHUTDOWN.
++ *
++ * The request id is read before xenhcd_giveback_urb() runs, because
++ * that call frees urbp; the id used to be read from the freed object.
++ *
++ * caller must lock info->lock
++ */
++static void xenhcd_cancel_all_enqueued_urbs(struct usbfront_info *info)
++{
++      struct urb_priv *urbp, *tmp;
++      int req_id;
++
++      list_for_each_entry_safe(urbp, tmp, &info->in_progress_list, list) {
++              /* urbp is freed by the giveback below, so save the id now */
++              req_id = urbp->req_id;
++              if (!urbp->unlinked) {
++                      xenhcd_gnttab_done(&info->shadow[req_id]);
++                      barrier();
++                      if (urbp->urb->status == -EINPROGRESS)  /* not dequeued */
++                              xenhcd_giveback_urb(info, urbp->urb, -ESHUTDOWN);
++                      else                                    /* dequeued */
++                              xenhcd_giveback_urb(info, urbp->urb, urbp->urb->status);
++              }
++              info->shadow[req_id].urb = NULL;
++      }
++
++      list_for_each_entry_safe(urbp, tmp, &info->pending_submit_list, list) {
++              xenhcd_giveback_urb(info, urbp->urb, -ESHUTDOWN);
++      }
++
++      return;
++}
++
++/*
++ * Complete every urb parked on giveback_waiting_list, using the status
++ * already stored in the urb.
++ *
++ * caller must lock info->lock
++ */
++static void xenhcd_giveback_unlinked_urbs(struct usbfront_info *info)
++{
++      struct urb_priv *cur, *next;
++
++      list_for_each_entry_safe(cur, next, &info->giveback_waiting_list, list)
++              xenhcd_giveback_urb(info, cur->urb, cur->urb->status);
++}
++
++/*
++ * Submit @urbp to the backend, or queue it on pending_submit_list if
++ * the ring is full or older urbs are already waiting there.
++ * Returns 0 if the urb was submitted or queued, otherwise the error
++ * from xenhcd_do_request().
++ */
++static int xenhcd_submit_urb(struct usbfront_info *info, struct urb_priv *urbp)
++{
++      int ret;
++
++      if (RING_FULL(&info->urb_ring)) {
++              /* no room: park the urb and let the watchdog retry */
++              list_add_tail(&urbp->list, &info->pending_submit_list);
++              COUNT(info->stats.ring_full);
++              timer_action(info, TIMER_RING_WATCHDOG);
++              return 0;
++      }
++
++      if (!list_empty(&info->pending_submit_list)) {
++              /* queue behind the urbs that are already waiting */
++              list_add_tail(&urbp->list, &info->pending_submit_list);
++              timer_action(info, TIMER_SCAN_PENDING_URBS);
++              return 0;
++      }
++
++      ret = xenhcd_do_request(info, urbp);
++      if (!ret)
++              list_add_tail(&urbp->list, &info->in_progress_list);
++
++      return ret;
++}
++
++/*
++ * Start unlinking @urbp.
++ *
++ * An urb that was never sent to the backend (req_id still ~0) is moved
++ * to giveback_waiting_list.  Otherwise an unlink request is sent, or
++ * the urb is queued on pending_unlink_list if the ring is full or other
++ * unlinks are already waiting.
++ *
++ * Returns -EBUSY if the urb is already being unlinked, 0 if the unlink
++ * was sent or queued, otherwise the error from xenhcd_do_request().
++ */
++static int xenhcd_unlink_urb(struct usbfront_info *info, struct urb_priv *urbp)
++{
++      int ret;
++
++      /* already unlinked? */
++      if (urbp->unlinked)
++              return -EBUSY;
++
++      urbp->unlinked = 1;
++
++      /* the urb is still in pending_submit queue */
++      if (urbp->req_id == ~0) {
++              list_move_tail(&urbp->list, &info->giveback_waiting_list);
++              timer_action(info, TIMER_SCAN_PENDING_URBS);
++              return 0;
++      }
++
++      /* send unlink request to backend */
++      if (RING_FULL(&info->urb_ring)) {
++              list_move_tail(&urbp->list, &info->pending_unlink_list);
++              COUNT(info->stats.ring_full);
++              timer_action(info, TIMER_RING_WATCHDOG);
++              return 0;
++      }
++
++      if (!list_empty(&info->pending_unlink_list)) {
++              /* keep ordering with the unlinks that are already waiting */
++              list_move_tail(&urbp->list, &info->pending_unlink_list);
++              timer_action(info, TIMER_SCAN_PENDING_URBS);
++              return 0;
++      }
++
++      ret = xenhcd_do_request(info, urbp);
++      if (!ret)
++              list_move_tail(&urbp->list, &info->in_progress_list);
++
++      return ret;
++}
++
++/*
++ * Consume the responses the backend has queued on the urb ring.
++ *
++ * For each response to a submit request, the grants of the matching
++ * shadow entry are ended and the urb (if still recorded) is completed
++ * with the length, error count, start frame and status from the
++ * response.  The shadow id is returned to the free list in every case.
++ *
++ * Returns non-zero if more responses are pending after the final check.
++ */
++static int xenhcd_urb_request_done(struct usbfront_info *info)
++{
++      usbif_urb_response_t *res;
++      struct urb *urb;
++
++      RING_IDX i, rp;
++      uint16_t id;
++      int more_to_do = 0;
++      unsigned long flags;
++
++      spin_lock_irqsave(&info->lock, flags);
++
++      rp = info->urb_ring.sring->rsp_prod;
++      rmb(); /* ensure we see queued responses up to "rp" */
++
++      for (i = info->urb_ring.rsp_cons; i != rp; i++) {
++              res = RING_GET_RESPONSE(&info->urb_ring, i);
++              /*
++               * NOTE(review): id comes from the backend and is used as an
++               * index into info->shadow[] without a range check here.
++               */
++              id = res->id;
++
++              /* unlink responses carry no buffers and no urb to complete */
++              if (likely(usbif_pipesubmit(info->shadow[id].req.pipe))) {
++                      xenhcd_gnttab_done(&info->shadow[id]);
++                      urb = info->shadow[id].urb;
++                      barrier();
++                      if (likely(urb)) {
++                              urb->actual_length = res->actual_length;
++                              urb->error_count = res->error_count;
++                              urb->start_frame = res->start_frame;
++                              barrier();
++                              xenhcd_giveback_urb(info, urb, res->status);
++                      }
++              }
++
++              add_id_to_freelist(info, id);
++      }
++      info->urb_ring.rsp_cons = i;
++
++      /* requests still outstanding: re-check; otherwise just re-arm the event */
++      if (i != info->urb_ring.req_prod_pvt)
++              RING_FINAL_CHECK_FOR_RESPONSES(&info->urb_ring, more_to_do);
++      else
++              info->urb_ring.sring->rsp_event = i + 1;
++
++      spin_unlock_irqrestore(&info->lock, flags);
++
++      cond_resched();
++
++      return more_to_do;
++}
++
++/*
++ * Consume connection-change notifications from the conn ring.
++ *
++ * Every response names a port and a speed; the root-hub port state is
++ * updated through rhport_connect() and the response id is recycled into
++ * a new request so the backend can reuse it.  The root-hub status is
++ * polled afterwards if any port has a connection change pending.
++ *
++ * Returns non-zero if more responses are pending after the final check.
++ *
++ * The port number comes from the backend.  rhport_connect() ignores
++ * out-of-range values, so info->ports[] is indexed only after the same
++ * range check; an invalid portnum (e.g. 0) used to index outside the
++ * array.
++ */
++static int xenhcd_conn_notify(struct usbfront_info *info)
++{
++      usbif_conn_response_t *res;
++      usbif_conn_request_t *req;
++      RING_IDX rc, rp;
++      uint16_t id;
++      uint8_t portnum, speed;
++      int more_to_do = 0;
++      int notify;
++      int port_changed = 0;
++      unsigned long flags;
++
++      spin_lock_irqsave(&info->lock, flags);
++
++      rc = info->conn_ring.rsp_cons;
++      rp = info->conn_ring.sring->rsp_prod;
++      rmb(); /* ensure we see queued responses up to "rp" */
++
++      while (rc != rp) {
++              res = RING_GET_RESPONSE(&info->conn_ring, rc);
++              id = res->id;
++              portnum = res->portnum;
++              speed = res->speed;
++              info->conn_ring.rsp_cons = ++rc;
++
++              rhport_connect(info, portnum, speed);
++              /* same bounds as rhport_connect(): 1..rh_numports */
++              if (portnum >= 1 && portnum <= info->rh_numports &&
++                  info->ports[portnum-1].c_connection)
++                      port_changed = 1;
++
++              barrier();
++
++              /* hand the id straight back to the backend */
++              req = RING_GET_REQUEST(&info->conn_ring, info->conn_ring.req_prod_pvt);
++              req->id = id;
++              info->conn_ring.req_prod_pvt++;
++      }
++
++      if (rc != info->conn_ring.req_prod_pvt)
++              RING_FINAL_CHECK_FOR_RESPONSES(&info->conn_ring, more_to_do);
++      else
++              info->conn_ring.sring->rsp_event = rc + 1;
++
++      RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->conn_ring, notify);
++      if (notify)
++              notify_remote_via_irq(info->irq);
++
++      spin_unlock_irqrestore(&info->lock, flags);
++
++      if (port_changed)
++              usb_hcd_poll_rh_status(info_to_hcd(info));
++
++      cond_resched();
++
++      return more_to_do;
++}
++
++/*
++ * Kernel thread body: sleeps until waiting_resp is set or the thread is
++ * asked to stop, then processes the urb ring and the conn ring.
++ * @arg is the struct usbfront_info of the controller.  Always returns 0.
++ */
++int xenhcd_schedule(void *arg)
++{
++      struct usbfront_info *info = (struct usbfront_info *) arg;
++
++      while (!kthread_should_stop()) {
++              wait_event_interruptible(
++                              info->wq,
++                              info->waiting_resp || kthread_should_stop());
++              /* clear the flag before looking at the rings */
++              info->waiting_resp = 0;
++              smp_mb();
++
++              /* either ring reporting more work keeps the loop from sleeping */
++              if (xenhcd_urb_request_done(info))
++                      info->waiting_resp = 1;
++
++              if (xenhcd_conn_notify(info))
++                      info->waiting_resp = 1;
++      }
++
++      return 0;
++}
++
++/*
++ * Flag that responses may be waiting and wake whoever sleeps on
++ * info->wq (xenhcd_schedule() waits on it).
++ */
++static void xenhcd_notify_work(struct usbfront_info *info)
++{
++      info->waiting_resp = 1;
++      wake_up(&info->wq);
++}
++
++/*
++ * Interrupt handler: @dev_id is the struct usbfront_info of the
++ * controller.  The handler only requests ring processing and always
++ * reports IRQ_HANDLED.
++ */
++irqreturn_t xenhcd_int(int irq, void *dev_id)
++{
++      struct usbfront_info *info = (struct usbfront_info *) dev_id;
++
++      xenhcd_notify_work(info);
++
++      return IRQ_HANDLED;
++}
diff --cc drivers/xen/usbfront/usbfront.h

index 0000000,0000000..a260114

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbfront/usbfront.h
@@@ -1,0 -1,0 +1,197 @@@
++/*
++ * usbfront.h
++ *
++ * This file is part of Xen USB Virtual Host Controller driver.
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_USBFRONT_H__
++#define __XEN_USBFRONT_H__
++
++#include <linux/module.h>
++#include <linux/usb.h>
++#include <linux/list.h>
++#include <linux/kthread.h>
++#include <linux/wait.h>
++#include <linux/usb/hcd.h>
++#include <asm/io.h>
++#include <xen/xenbus.h>
++#include <xen/evtchn.h>
++#include <xen/gnttab.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/io/usbif.h>
++
++/* Return the driver-private usbfront_info kept in @hcd's hcd_priv area. */
++static inline struct usbfront_info *hcd_to_info(struct usb_hcd *hcd)
++{
++      return (struct usbfront_info *) (hcd->hcd_priv);
++}
++
++/*
++ * Recover the usb_hcd that embeds @info: @info is taken to be the address
++ * of that usb_hcd's hcd_priv member, so it must have come from an hcd.
++ */
++static inline struct usb_hcd *info_to_hcd(struct usbfront_info *info)
++{
++      return container_of((void *) info, struct usb_hcd, hcd_priv);
++}
++
++/*
++ * Private per-URB data.  Objects of this type are what the
++ * "xenhcd_urb_priv" kmem cache (xenhcd_urbp_cachep) is sized for.
++ */
++struct urb_priv {
++      struct list_head list;
++      struct urb *urb;
++      int req_id;     /* RING_REQUEST id for submitting */
++      int unlink_req_id; /* RING_REQUEST id for unlinking */
++      int status;
++      unsigned unlinked:1; /* dequeued marker */
++};
++
++/* Status of one port of the virtual root hub */
++struct rhport_status {
++      u32 status;
++      unsigned resuming:1; /* port is in the middle of resuming */
++      unsigned c_connection:1; /* connection changed */
++      unsigned long timeout; /* NOTE(review): presumably in jiffies -- confirm */
++};
++
++/* Status of the device attached to a virtual root hub port */
++struct vdevice_status {
++      int devnum;
++      enum usb_device_state status;
++      enum usb_device_speed speed;
++};
++
++/*
++ * Frontend-side shadow of one URB ring request.
++ *
++ * create_hcd() initialises every slot with urb == NULL and req.id set to the
++ * index of the following slot (0x0fff in the last one), which looks like a
++ * free-list link -- NOTE(review): confirm against the slot allocator.
++ */
++struct usb_shadow {
++      usbif_urb_request_t req;
++      struct urb *urb;
++};
++
++/*
++ * Statistics for tuning and monitoring.  Only embedded in usbfront_info
++ * when XENHCD_STATS is defined.
++ */
++struct xenhcd_stats {
++      unsigned long ring_full; /* RING_FULL conditions */
++      unsigned long complete; /* URBs given back normally */
++      unsigned long unlink; /* unlinked urbs */
++};
++
++/*
++ * Per-controller state of the USB frontend.  It lives in the hcd_priv area
++ * of the usb_hcd; see hcd_to_info() and info_to_hcd().
++ */
++struct usbfront_info {
++      /* Virtual Host Controller has 4 urb queues */
++      struct list_head pending_submit_list;
++      struct list_head pending_unlink_list;
++      struct list_head in_progress_list;
++      struct list_head giveback_waiting_list;
++
++      spinlock_t lock;
++
++      /* timer that kicks pending and giveback-waiting urbs */
++      struct timer_list watchdog;
++      unsigned long actions; /* bitmask indexed by enum xenhcd_timer_action */
++
++      /* virtual root hub */
++      int rh_numports;
++      struct rhport_status ports[USB_MAXCHILDREN];
++      struct vdevice_status devices[USB_MAXCHILDREN];
++
++      /* Xen related stuff */
++      struct xenbus_device *xbdev;
++      int urb_ring_ref;
++      int conn_ring_ref;
++      usbif_urb_front_ring_t urb_ring;
++      usbif_conn_front_ring_t conn_ring;
++
++      unsigned int irq; /* event channel */
++      struct usb_shadow shadow[USB_URB_RING_SIZE];
++      unsigned long shadow_free;
++
++      /* RING_RESPONSE thread */
++      struct task_struct *kthread;
++      wait_queue_head_t wq;
++      unsigned int waiting_resp; /* set to request a pass of xenhcd_schedule() */
++
++      /* xmit statistics */
++#ifdef XENHCD_STATS
++      struct xenhcd_stats stats;
++#define COUNT(x) do { (x)++; } while (0)
++#else
++      /* without XENHCD_STATS, COUNT() expands to nothing and never evaluates x */
++#define COUNT(x) do {} while (0)
++#endif
++};
++
++/* Watchdog delays, in jiffies, chosen by timer_action() per action type */
++#define XENHCD_RING_JIFFIES (HZ/200)
++#define XENHCD_SCAN_JIFFIES 1
++
++/* Bit numbers used in usbfront_info.actions */
++enum xenhcd_timer_action {
++      TIMER_RING_WATCHDOG,
++      TIMER_SCAN_PENDING_URBS,
++};
++
++/* Clear @action's bit in info->actions so the action can be scheduled again. */
++static inline void
++timer_action_done(struct usbfront_info *info, enum xenhcd_timer_action action)
++{
++      clear_bit(action, &info->actions);
++}
++
++/*
++ * Request @action from the watchdog timer.
++ *
++ * Nothing is done when the timer is already pending with
++ * TIMER_SCAN_PENDING_URBS set, or when @action's bit was already set.
++ * Otherwise the bit is set and the timer is (re)armed: XENHCD_RING_JIFFIES
++ * from now for TIMER_RING_WATCHDOG, XENHCD_SCAN_JIFFIES for anything else.
++ */
++static inline void
++timer_action(struct usbfront_info *info, enum xenhcd_timer_action action)
++{
++      unsigned long delay;
++
++      if (timer_pending(&info->watchdog) &&
++          test_bit(TIMER_SCAN_PENDING_URBS, &info->actions))
++              return;
++
++      /* somebody already asked for this action */
++      if (test_and_set_bit(action, &info->actions))
++              return;
++
++      if (action == TIMER_RING_WATCHDOG)
++              delay = XENHCD_RING_JIFFIES;
++      else
++              delay = XENHCD_SCAN_JIFFIES;
++
++      mod_timer(&info->watchdog, delay + jiffies);
++}
++
++/* Symbols shared between the usbfront source files */
++extern struct kmem_cache *xenhcd_urbp_cachep; /* cache of struct urb_priv */
++extern struct hc_driver xen_usb20_hc_driver;
++extern struct hc_driver xen_usb11_hc_driver;
++irqreturn_t xenhcd_int(int irq, void *dev_id);
++void xenhcd_rhport_state_change(struct usbfront_info *info,
++                              int port, enum usb_device_speed speed);
++int xenhcd_schedule(void *arg);
++
++#endif /* __XEN_USBFRONT_H__ */
diff --cc drivers/xen/usbfront/xenbus.c

index 0000000,0000000..2aa48c6

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/usbfront/xenbus.c
@@@ -1,0 -1,0 +1,416 @@@
++/*
++ * xenbus.c
++ *
++ * Xenbus interface for Xen USB Virtual Host Controller
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, see <http://www.gnu.org/licenses/>.
++ *
++ * or, by your choice,
++ *
++ * When distributed separately from the Linux kernel or incorporated into
++ * other software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#include "usbfront.h"
++
++#define GRANT_INVALID_REF 0
++
++/*
++ * Undo setup_rings(): unbind the irq handler if one is bound, end foreign
++ * access for each ring that still holds a valid grant reference, and reset
++ * the irq, grant references and ring pointers in @info.
++ *
++ * Safe to call on a partially set-up @info, since every step is guarded by
++ * the corresponding "is it set" check.
++ */
++static void destroy_rings(struct usbfront_info *info)
++{
++      if (info->irq)
++              unbind_from_irqhandler(info->irq, info);
++      info->irq = 0;
++
++      if (info->urb_ring_ref != GRANT_INVALID_REF) {
++              /*
++               * The ring page address is handed over together with the
++               * reference (NOTE(review): gnttab_end_foreign_access() is
++               * expected to release the page -- confirm).
++               */
++              gnttab_end_foreign_access(info->urb_ring_ref,
++                                        (unsigned long)info->urb_ring.sring);
++              info->urb_ring_ref = GRANT_INVALID_REF;
++      }
++      info->urb_ring.sring = NULL;
++
++      if (info->conn_ring_ref != GRANT_INVALID_REF) {
++              gnttab_end_foreign_access(info->conn_ring_ref,
++                                        (unsigned long)info->conn_ring.sring);
++              info->conn_ring_ref = GRANT_INVALID_REF;
++      }
++      info->conn_ring.sring = NULL;
++}
++
++/*
++ * Allocate and grant the two shared rings and bind the event channel.
++ *
++ * A zeroed page is allocated for the URB ring and for the connection ring;
++ * each is initialised, granted to the backend, and its grant reference is
++ * stored in @info.  Finally an event channel is bound to xenhcd_int() and
++ * the resulting irq is stored in info->irq.
++ *
++ * @dev:  xenbus device of this frontend.
++ * @info: frontend state to fill in.
++ *
++ * Returns 0 on success or a negative errno.  On every failure after the
++ * first ring has been granted, whatever was set up so far is released
++ * through destroy_rings().
++ */
++static int setup_rings(struct xenbus_device *dev,
++                         struct usbfront_info *info)
++{
++      usbif_urb_sring_t *urb_sring;
++      usbif_conn_sring_t *conn_sring;
++      int err;
++
++      info->urb_ring_ref = GRANT_INVALID_REF;
++      info->conn_ring_ref = GRANT_INVALID_REF;
++
++      urb_sring = (usbif_urb_sring_t *)get_zeroed_page(GFP_NOIO|__GFP_HIGH);
++      if (!urb_sring) {
++              xenbus_dev_fatal(dev, -ENOMEM, "allocating urb ring");
++              return -ENOMEM;
++      }
++      SHARED_RING_INIT(urb_sring);
++      FRONT_RING_INIT(&info->urb_ring, urb_sring, PAGE_SIZE);
++
++      err = xenbus_grant_ring(dev, virt_to_mfn(info->urb_ring.sring));
++      if (err < 0) {
++              /* not granted yet, so the page is still ours to free */
++              free_page((unsigned long)urb_sring);
++              info->urb_ring.sring = NULL;
++              goto fail;
++      }
++      info->urb_ring_ref = err;
++
++      conn_sring = (usbif_conn_sring_t *)get_zeroed_page(GFP_NOIO|__GFP_HIGH);
++      if (!conn_sring) {
++              xenbus_dev_fatal(dev, -ENOMEM, "allocating conn ring");
++              /* the urb ring is already granted: unwind it, don't leak it */
++              err = -ENOMEM;
++              goto fail;
++      }
++      SHARED_RING_INIT(conn_sring);
++      FRONT_RING_INIT(&info->conn_ring, conn_sring, PAGE_SIZE);
++
++      err = xenbus_grant_ring(dev, virt_to_mfn(info->conn_ring.sring));
++      if (err < 0) {
++              free_page((unsigned long)conn_sring);
++              info->conn_ring.sring = NULL;
++              goto fail;
++      }
++      info->conn_ring_ref = err;
++
++      err = bind_listening_port_to_irqhandler(
++              dev->otherend_id, xenhcd_int, IRQF_SAMPLE_RANDOM, "usbif", info);
++      if (err <= 0) {
++              /* 0 is treated as failure; don't let it be returned as success */
++              if (!err)
++                      err = -EINVAL;
++              xenbus_dev_fatal(dev, err,
++                               "bind_listening_port_to_irqhandler");
++              goto fail;
++      }
++      info->irq = err;
++
++      return 0;
++fail:
++      destroy_rings(info);
++      return err;
++}
++
++/*
++ * Set up the rings and publish their details to the backend.
++ *
++ * After setup_rings() succeeds, "urb-ring-ref", "conn-ring-ref" and
++ * "event-channel" are written under dev->nodename inside one xenbus
++ * transaction.  The transaction is restarted when ending it returns -EAGAIN.
++ *
++ * Returns 0 on success.  On failure the error is reported through
++ * xenbus_dev_fatal(), the rings are destroyed again (except when
++ * setup_rings() itself failed and already cleaned up), and the error code
++ * is returned.
++ */
++static int talk_to_backend(struct xenbus_device *dev,
++                         struct usbfront_info *info)
++{
++      const char *message;
++      struct xenbus_transaction xbt;
++      int err;
++
++      err = setup_rings(dev, info);
++      if (err)
++              goto out;
++
++again:
++      err = xenbus_transaction_start(&xbt);
++      if (err) {
++              xenbus_dev_fatal(dev, err, "starting transaction");
++              goto destroy_ring;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "urb-ring-ref", "%u",
++                          info->urb_ring_ref);
++      if (err) {
++              message = "writing urb-ring-ref";
++              goto abort_transaction;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "conn-ring-ref", "%u",
++                          info->conn_ring_ref);
++      if (err) {
++              message = "writing conn-ring-ref";
++              goto abort_transaction;
++      }
++
++      err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
++                          irq_to_evtchn_port(info->irq));
++      if (err) {
++              message = "writing event-channel";
++              goto abort_transaction;
++      }
++
++      err = xenbus_transaction_end(xbt, 0);
++      if (err) {
++              if (err == -EAGAIN)
++                      goto again;
++              xenbus_dev_fatal(dev, err, "completing transaction");
++              goto destroy_ring;
++      }
++
++      return 0;
++
++abort_transaction:
++      /* second argument 1 = abort the transaction instead of committing it */
++      xenbus_transaction_end(xbt, 1);
++      xenbus_dev_fatal(dev, err, "%s", message);
++
++destroy_ring:
++      destroy_rings(info);
++
++out:
++      return err;
++}
++
++/*
++ * Bring the frontend up once the backend is ready.
++ *
++ * Publishes the rings to the backend, starts the response thread
++ * ("xenhcd.<busnum>"), then fills the whole connection ring with requests
++ * whose id equals their slot index and pushes them, so the backend can use
++ * them for hotplug notifications.
++ *
++ * Returns 0 on success or a negative errno (already reported through
++ * xenbus_dev_fatal()).
++ */
++static int connect(struct xenbus_device *dev)
++{
++      struct usbfront_info *info = dev_get_drvdata(&dev->dev);
++
++      usbif_conn_request_t *req;
++      int idx, err;
++      int notify;
++      char name[TASK_COMM_LEN];
++      struct usb_hcd *hcd;
++
++      hcd = info_to_hcd(info);
++      snprintf(name, TASK_COMM_LEN, "xenhcd.%d", hcd->self.busnum);
++
++      err = talk_to_backend(dev, info);
++      if (err)
++              return err;
++
++      /* name is data, not a format string: pass it through "%s" */
++      info->kthread = kthread_run(xenhcd_schedule, info, "%s", name);
++      if (IS_ERR(info->kthread)) {
++              err = PTR_ERR(info->kthread);
++              info->kthread = NULL;
++              xenbus_dev_fatal(dev, err, "Error creating thread");
++              return err;
++      }
++      /* prepare ring for hotplug notification */
++      for (idx = 0; idx < USB_CONN_RING_SIZE; idx++) {
++              req = RING_GET_REQUEST(&info->conn_ring, idx);
++              req->id = idx;
++      }
++      info->conn_ring.req_prod_pvt = idx;
++
++      RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->conn_ring, notify);
++      if (notify)
++              notify_remote_via_irq(info->irq);
++
++      return 0;
++}
++
++/*
++ * Create the usb_hcd for @dev according to the backend's configuration.
++ *
++ * Reads "num-ports" and "usb-ver" from the backend's xenstore directory,
++ * validates them, creates an hcd with the matching hc_driver and
++ * initialises the frontend state kept in its private area (back pointer,
++ * port count and the shadow array).
++ *
++ * Returns the new hcd, or ERR_PTR(-EINVAL) / ERR_PTR(-ENOMEM) on failure.
++ * Every failure is also reported through xenbus_dev_fatal().
++ */
++static struct usb_hcd *create_hcd(struct xenbus_device *dev)
++{
++      int i;
++      int err = 0;
++      int num_ports;
++      int usb_ver;
++      struct usb_hcd *hcd = NULL;
++      struct usbfront_info *info = NULL;
++
++      err = xenbus_scanf(XBT_NIL, dev->otherend,
++                                      "num-ports", "%d", &num_ports);
++      if (err != 1) {
++              xenbus_dev_fatal(dev, err, "reading num-ports");
++              return ERR_PTR(-EINVAL);
++      }
++      if (num_ports < 1 || num_ports > USB_MAXCHILDREN) {
++              /* err still holds the scanf result (1); report a real errno */
++              xenbus_dev_fatal(dev, -EINVAL, "invalid num-ports");
++              return ERR_PTR(-EINVAL);
++      }
++
++      err = xenbus_scanf(XBT_NIL, dev->otherend,
++                                      "usb-ver", "%d", &usb_ver);
++      if (err != 1) {
++              xenbus_dev_fatal(dev, err, "reading usb-ver");
++              return ERR_PTR(-EINVAL);
++      }
++      switch (usb_ver) {
++      case USB_VER_USB11:
++              hcd = usb_create_hcd(&xen_usb11_hc_driver, &dev->dev, dev_name(&dev->dev));
++              break;
++      case USB_VER_USB20:
++              hcd = usb_create_hcd(&xen_usb20_hc_driver, &dev->dev, dev_name(&dev->dev));
++              break;
++      default:
++              xenbus_dev_fatal(dev, -EINVAL, "invalid usb-ver");
++              return ERR_PTR(-EINVAL);
++      }
++      if (!hcd) {
++              xenbus_dev_fatal(dev, -ENOMEM,
++                              "fail to allocate USB host controller");
++              return ERR_PTR(-ENOMEM);
++      }
++
++      info = hcd_to_info(hcd);
++      info->xbdev = dev;
++      info->rh_numports = num_ports;
++
++      /* each slot's id names the following slot; the last gets 0x0fff */
++      for (i = 0; i < USB_URB_RING_SIZE; i++) {
++              info->shadow[i].req.id = i + 1;
++              info->shadow[i].urb = NULL;
++      }
++      info->shadow[USB_URB_RING_SIZE-1].req.id = 0x0fff;
++
++      return hcd;
++}
++
++/*
++ * xenbus probe callback: create and register the virtual host controller.
++ *
++ * The frontend state (the hcd's private area) is stored as the device's
++ * driver data so the other callbacks can find it.
++ *
++ * Returns 0 on success, -ENODEV when USB is disabled, or the error from
++ * create_hcd() / usb_add_hcd().
++ */
++static int usbfront_probe(struct xenbus_device *dev,
++                        const struct xenbus_device_id *id)
++{
++      int err;
++      struct usb_hcd *hcd;
++      struct usbfront_info *info;
++
++      if (usb_disabled())
++              return -ENODEV;
++
++      hcd = create_hcd(dev);
++      if (IS_ERR(hcd)) {
++              err = PTR_ERR(hcd);
++              xenbus_dev_fatal(dev, err,
++                              "fail to create usb host controller");
++              /* hcd is an ERR_PTR here: there is nothing to put */
++              return err;
++      }
++
++      info = hcd_to_info(hcd);
++      dev_set_drvdata(&dev->dev, info);
++
++      err = usb_add_hcd(hcd, 0, 0);
++      if (err != 0) {
++              xenbus_dev_fatal(dev, err,
++                              "fail to adding USB host controller");
++              goto fail;
++      }
++
++      init_waitqueue_head(&info->wq);
++
++      return 0;
++
++fail:
++      /* drop the reference taken by usb_create_hcd(); info lives inside hcd */
++      usb_put_hcd(hcd);
++      dev_set_drvdata(&dev->dev, NULL);
++      return err;
++}
++
++/*
++ * Tear down the running controller when the backend is closing: remove the
++ * hcd, stop the response thread if one was started, and move the frontend
++ * to the closed state.  The rings are not released here; usbfront_remove()
++ * does that.
++ */
++static void usbfront_disconnect(struct xenbus_device *dev)
++{
++      struct usbfront_info *info = dev_get_drvdata(&dev->dev);
++      struct usb_hcd *hcd = info_to_hcd(info);
++
++      usb_remove_hcd(hcd);
++      if (info->kthread) {
++              kthread_stop(info->kthread);
++              info->kthread = NULL;
++      }
++      xenbus_frontend_closed(dev);
++}
++
++/*
++ * xenbus otherend_changed callback.
++ *
++ * Only two backend states cause any action:
++ *  - InitWait, while the frontend itself is still Initialising: connect()
++ *    is run and, if it succeeds, the frontend switches to Connected;
++ *  - Closing: the controller is disconnected.
++ * All other known states are ignored; an unknown state value is reported
++ * as a fatal error.
++ */
++static void backend_changed(struct xenbus_device *dev,
++                                   enum xenbus_state backend_state)
++{
++      switch (backend_state) {
++      case XenbusStateInitialising:
++      case XenbusStateInitialised:
++      case XenbusStateConnected:
++      case XenbusStateReconfiguring:
++      case XenbusStateReconfigured:
++      case XenbusStateUnknown:
++      case XenbusStateClosed:
++              break;
++
++      case XenbusStateInitWait:
++              if (dev->state != XenbusStateInitialising)
++                      break;
++              /* connect() returns 0 on success */
++              if (!connect(dev))
++                      xenbus_switch_state(dev, XenbusStateConnected);
++              break;
++
++      case XenbusStateClosing:
++              usbfront_disconnect(dev);
++              break;
++
++      default:
++              xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++                               backend_state);
++              break;
++      }
++}
++
++/*
++ * xenbus remove callback: release the rings and event channel and drop the
++ * hcd reference.  Always returns 0.
++ *
++ * NOTE(review): this neither stops info->kthread nor calls
++ * usb_remove_hcd(); it appears to rely on usbfront_disconnect() having run
++ * first -- confirm that is guaranteed on every removal path.
++ */
++static int usbfront_remove(struct xenbus_device *dev)
++{
++      struct usbfront_info *info = dev_get_drvdata(&dev->dev);
++      struct usb_hcd *hcd = info_to_hcd(info);
++
++      destroy_rings(info);
++      usb_put_hcd(hcd);
++
++      return 0;
++}
++
++/* Device types handled by this driver; the empty entry terminates the list */
++static const struct xenbus_device_id usbfront_ids[] = {
++      { "vusb" },
++      { "" },
++};
++MODULE_ALIAS("xen:vusb");
++
++/* xenbus driver glue tying the callbacks in this file together */
++static struct xenbus_driver usbfront_driver = {
++      .name = "vusb",
++      .ids = usbfront_ids,
++      .probe = usbfront_probe,
++      .otherend_changed = backend_changed,
++      .remove = usbfront_remove,
++};
++
++/*
++ * Module init: create the urb_priv cache and register the frontend driver.
++ *
++ * Returns 0 on success, -ENODEV when not running on Xen, -ENOMEM when the
++ * cache cannot be created, or the error from xenbus_register_frontend().
++ */
++static int __init usbfront_init(void)
++{
++      int err;
++
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++      xenhcd_urbp_cachep = kmem_cache_create("xenhcd_urb_priv",
++                      sizeof(struct urb_priv), 0, 0, NULL);
++      if (!xenhcd_urbp_cachep) {
++              pr_err("usbfront failed to create kmem cache\n");
++              return -ENOMEM;
++      }
++
++      err = xenbus_register_frontend(&usbfront_driver);
++      if (err)
++              /* module load fails, so usbfront_exit() will never free it */
++              kmem_cache_destroy(xenhcd_urbp_cachep);
++
++      return err;
++}
++
++/*
++ * Module exit: undo usbfront_init() in reverse order.
++ *
++ * The driver is unregistered first so that no device can still be using
++ * objects from the urb_priv cache when the cache is destroyed.
++ */
++static void __exit usbfront_exit(void)
++{
++      xenbus_unregister_driver(&usbfront_driver);
++      kmem_cache_destroy(xenhcd_urbp_cachep);
++}
++
++module_init(usbfront_init);
++module_exit(usbfront_exit);
++
++MODULE_AUTHOR("");
++MODULE_DESCRIPTION("Xen USB Virtual Host Controller driver (usbfront)");
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/util.c

index 0000000,0000000..412f19a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/util.c
@@@ -1,0 -1,0 +1,74 @@@
++#include <linux/err.h>
++#include <linux/module.h>
++#include <linux/mutex.h>
++#include <linux/slab.h>
++#include <xen/driver_util.h>
++
++/*
++ * Return the "xen" device class, creating it on first use.
++ *
++ * Creation is serialised by a local mutex.  If a previous attempt failed
++ * (the cached pointer is NULL or an ERR_PTR), creation is retried on this
++ * call.  A failure is logged and the ERR_PTR from class_create() is
++ * returned to the caller unchanged.
++ */
++static struct class *_get_xen_class(void)
++{
++      static struct class *xen_class;
++      static DEFINE_MUTEX(xc_mutex);
++
++      mutex_lock(&xc_mutex);
++      if (IS_ERR_OR_NULL(xen_class))
++              xen_class = class_create(THIS_MODULE, "xen");
++      mutex_unlock(&xc_mutex);
++      if (IS_ERR(xen_class))
++              pr_err("failed to create xen sysfs class\n");
++
++      return xen_class;
++}
++
++/*
++ * Public accessor for the "xen" device class.
++ *
++ * Unlike _get_xen_class(), a creation failure is reported as NULL rather
++ * than as an ERR_PTR.
++ */
++struct class *get_xen_class(void)
++{
++      struct class *xen_class = _get_xen_class();
++
++      if (IS_ERR(xen_class))
++              return NULL;
++
++      return xen_class;
++}
++EXPORT_SYMBOL_GPL(get_xen_class);
++
++/*
++ * Device release callback installed by xen_class_device_create(): frees the
++ * struct device that was allocated there with kzalloc().
++ */
++static void xcdev_release(struct device *dev)
++{
++      kfree(dev);
++}
++
++/*
++ * Allocate and register a device in the "xen" class.
++ *
++ * @type:    device type to assign.
++ * @parent:  parent device.
++ * @devt:    device number.
++ * @drvdata: driver data to attach.
++ * @fmt:     printf-style format (plus arguments) for the device name.
++ *
++ * Returns the registered device, or an ERR_PTR on failure.  The device is
++ * freed by xcdev_release() when its last reference is dropped.
++ */
++struct device *xen_class_device_create(struct device_type *type,
++                                     struct device *parent,
++                                     dev_t devt, void *drvdata,
++                                     const char *fmt, ...)
++{
++      struct class *class;
++      struct device *dev;
++      va_list vargs;
++      int err;
++
++      /*
++       * Look the class up before anything is allocated: failing here after
++       * the name had been set would require freeing the name as well as dev.
++       */
++      class = _get_xen_class();
++      if (IS_ERR(class))
++              return ERR_CAST(class);
++
++      dev = kzalloc(sizeof(*dev), GFP_KERNEL);
++      if (!dev)
++              return ERR_PTR(-ENOMEM);
++
++      va_start(vargs, fmt);
++      err = kobject_set_name_vargs(&dev->kobj, fmt, vargs);
++      va_end(vargs);
++      if (err) {
++              kfree(dev);
++              return ERR_PTR(err);
++      }
++
++      dev->devt = devt;
++      dev->class = class;
++      dev->type = type;
++      dev->parent = parent;
++      dev_set_drvdata(dev, drvdata);
++      dev->release = xcdev_release;
++
++      err = device_register(dev);
++      if (err) {
++              /* after device_register() only put_device() may free dev */
++              put_device(dev);
++              return ERR_PTR(err);
++      }
++
++      return dev;
++}
++EXPORT_SYMBOL_GPL(xen_class_device_create);
diff --cc drivers/xen/xenbus/Makefile

index 8dca685,8dca685..5605be5
--- 1/drivers/xen/xenbus/Makefile
--- 2/drivers/xen/xenbus/Makefile
+++ b/drivers/xen/xenbus/Makefile
@@@ -1,12 -1,12 +1,12 @@@
--obj-y += xenbus.o
++obj-y += xenbus_client.o xenbus_comms.o xenbus_xs.o xenbus_probe.o
++obj-$(CONFIG_XEN_BACKEND) += xenbus_be.o
   
--xenbus-objs =
--xenbus-objs += xenbus_client.o
--xenbus-objs += xenbus_comms.o
--xenbus-objs += xenbus_xs.o
--xenbus-objs += xenbus_probe.o
++xenbus_be-objs =
++xenbus_be-objs += xenbus_backend_client.o
   
--xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
--xenbus-objs += $(xenbus-be-objs-y)
++xenbus-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
++obj-y += $(xenbus-y) $(xenbus-m)
++obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o
   
++obj-$(CONFIG_PARAVIRT_XEN_BACKEND) += xenbus_probe_backend.o
   obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
diff --cc drivers/xen/xenbus/xenbus_backend_client.c

index 0000000,0000000..5f8aeb3

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_backend_client.c
@@@ -1,0 -1,0 +1,106 @@@
++/******************************************************************************
++ * Backend-client-facing interface for the Xenbus driver.  In other words, the
++ * interface between the Xenbus and the device-specific code in the backend
++ * driver.
++ *
++ * Copyright (C) 2005-2006 XenSource Ltd
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/err.h>
++#include <linux/delay.h>
++#include <linux/vmalloc.h>
++#include <xen/gnttab.h>
++#include <xen/xenbus.h>
++
++/*
++ * Map the page granted by the other end under @gnt_ref into a freshly
++ * allocated one-page VM area.
++ *
++ * Returns the vm_struct on success, with the grant handle stored in its
++ * phys_addr field for xenbus_unmap_ring_vfree().  Returns
++ * ERR_PTR(-ENOMEM) when no VM area is available, or ERR_PTR(-EINVAL) when
++ * the grant mapping fails, in which case the error has also been reported
++ * through xenbus_dev_fatal().
++ *
++ * Based on Rusty Russell's skeleton driver's map_page.
++ */
++struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t gnt_ref)
++{
++      struct gnttab_map_grant_ref op;
++      struct vm_struct *area;
++
++      area = alloc_vm_area(PAGE_SIZE);
++      if (!area)
++              return ERR_PTR(-ENOMEM);
++
++      gnttab_set_map_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
++                        gnt_ref, dev->otherend_id);
++      
++      gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &op);
++
++      if (op.status != GNTST_okay) {
++              free_vm_area(area);
++              xenbus_dev_fatal(dev, op.status,
++                               "mapping in shared page %d from domain %d",
++                               gnt_ref, dev->otherend_id);
++              /* sanity check: the grant status must look like an error value */
++              BUG_ON(!IS_ERR(ERR_PTR(op.status)));
++              return ERR_PTR(-EINVAL);
++      }
++
++      /* Stuff the handle in an unused field */
++      area->phys_addr = (unsigned long)op.handle;
++
++      return area;
++}
++EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
++
++
++/*
++ * Unmap a ring mapped by xenbus_map_ring_valloc() and free its VM area.
++ *
++ * The grant handle is taken from area->phys_addr, where the map function
++ * stored it.  A failing hypercall is a BUG.  If the unmap operation itself
++ * reports an error, the area is NOT freed, the error is logged through
++ * xenbus_dev_error() and -EINVAL is returned; otherwise 0 is returned.
++ *
++ * Based on Rusty Russell's skeleton driver's unmap_page.
++ */
++int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *area)
++{
++      struct gnttab_unmap_grant_ref op;
++
++      gnttab_set_unmap_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
++                          (grant_handle_t)area->phys_addr);
++
++      if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
++              BUG();
++
++      if (op.status == GNTST_okay)
++              free_vm_area(area);
++      else
++              xenbus_dev_error(dev, op.status,
++                               "unmapping page at handle %d error %d",
++                               (int16_t)area->phys_addr, op.status);
++
++      return op.status == GNTST_okay ? 0 : -EINVAL;
++}
++EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
++
++
++/*
++ * Read the integer "online" node below the device's own xenstore
++ * directory.
++ *
++ * Returns the value read, or 0 when the node cannot be read as an integer.
++ */
++int xenbus_dev_is_online(struct xenbus_device *dev)
++{
++      int online;
++
++      if (xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &online) != 1)
++              return 0; /* no online node present */
++
++      return online;
++}
++EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
++
++MODULE_LICENSE("Dual BSD/GPL");
diff --cc drivers/xen/xenbus/xenbus_client.c

index cdacf92,cdacf92..aee192a
--- 1/drivers/xen/xenbus/xenbus_client.c
--- 2/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@@ -31,6 -31,6 +31,10 @@@
    */
   
   #include <linux/slab.h>
++#if defined(CONFIG_XEN) || defined(MODULE)
++#include <xen/evtchn.h>
++#include <xen/gnttab.h>
++#else
   #include <linux/types.h>
   #include <linux/vmalloc.h>
   #include <asm/xen/hypervisor.h>
@@@ -38,8 -38,8 +42,13 @@@
   #include <xen/interface/event_channel.h>
   #include <xen/events.h>
   #include <xen/grant_table.h>
++#endif
   #include <xen/xenbus.h>
   
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
   const char *xenbus_strstate(enum xenbus_state state)
   {
         static const char *const name[] = {
@@@ -49,9 -49,9 +58,9 @@@
                 [ XenbusStateInitialised  ] = "Initialised",
                 [ XenbusStateConnected    ] = "Connected",
                 [ XenbusStateClosing      ] = "Closing",
--              [ XenbusStateClosed       ] = "Closed",
--              [XenbusStateReconfiguring] = "Reconfiguring",
--              [XenbusStateReconfigured] = "Reconfigured",
++              [ XenbusStateClosed       ] = "Closed",
++              [ XenbusStateReconfiguring ] = "Reconfiguring",
++              [ XenbusStateReconfigured ] = "Reconfigured",
         };
         return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
   }
@@@ -94,6 -94,6 +103,26 @@@ int xenbus_watch_path(struct xenbus_dev
   EXPORT_SYMBOL_GPL(xenbus_watch_path);
   
   
++#if defined(CONFIG_XEN) || defined(MODULE)
++int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
++                     const char *path2, struct xenbus_watch *watch,
++                     void (*callback)(struct xenbus_watch *,
++                                      const char **, unsigned int))
++{
++      int err;
++      char *state = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/%s", path, path2);
++      if (!state) {
++              xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
++              return -ENOMEM;
++      }
++      err = xenbus_watch_path(dev, state, watch, callback);
++
++      if (err)
++              kfree(state);
++      return err;
++}
++EXPORT_SYMBOL_GPL(xenbus_watch_path2);
++#else
   /**
    * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path
    * @dev: xenbus device
@@@ -134,6 -134,6 +163,7 @@@ int xenbus_watch_pathfmt(struct xenbus_
         return err;
   }
   EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
++#endif
   
   static void xenbus_switch_fatal(struct xenbus_device *, int, int,
                                 const char *, ...);
@@@ -207,7 -207,7 +237,6 @@@ int xenbus_switch_state(struct xenbus_d
   {
         return __xenbus_switch_state(dev, state, 0);
   }
--
   EXPORT_SYMBOL_GPL(xenbus_switch_state);
   
   int xenbus_frontend_closed(struct xenbus_device *dev)
@@@ -228,41 -228,41 +257,23 @@@ static char *error_path(struct xenbus_d
   }
   
   
--static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
--                              const char *fmt, va_list ap)
++static void _dev_error(struct xenbus_device *dev, int err,
++                      const char *fmt, va_list *ap)
   {
--      int ret;
--      unsigned int len;
--      char *printf_buffer = NULL;
--      char *path_buffer = NULL;
++      char *printf_buffer, *path_buffer;
++      struct va_format vaf = { .fmt = fmt, .va = ap };
   
--#define PRINTF_BUFFER_SIZE 4096
--      printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
--      if (printf_buffer == NULL)
--              goto fail;
--
--      len = sprintf(printf_buffer, "%i ", -err);
--      ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
--
--      BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
--
--      dev_err(&dev->dev, "%s\n", printf_buffer);
++      printf_buffer = kasprintf(GFP_KERNEL, "%i %pV", -err, &vaf);
++      if (printf_buffer)
++              dev_err(&dev->dev, "%s\n", printf_buffer);
   
         path_buffer = error_path(dev);
++      if (!printf_buffer || !path_buffer
++          || xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer))
++              dev_err(&dev->dev,
++                      "xenbus: failed to write error node for %s (%s)\n",
++                      dev->nodename, printf_buffer);
   
--      if (path_buffer == NULL) {
--              dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
--                     dev->nodename, printf_buffer);
--              goto fail;
--      }
--
--      if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
--              dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
--                     dev->nodename, printf_buffer);
--              goto fail;
--      }
--
--fail:
         kfree(printf_buffer);
         kfree(path_buffer);
   }
@@@ -282,11 -282,11 +293,12 @@@ void xenbus_dev_error(struct xenbus_dev
         va_list ap;
   
         va_start(ap, fmt);
--      xenbus_va_dev_error(dev, err, fmt, ap);
++      _dev_error(dev, err, fmt, &ap);
         va_end(ap);
   }
   EXPORT_SYMBOL_GPL(xenbus_dev_error);
   
++
   /**
    * xenbus_dev_fatal
    * @dev: xenbus device
@@@ -297,13 -297,13 +309,12 @@@
    * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
    * closedown of this driver and its peer.
    */
--
   void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
   {
         va_list ap;
   
         va_start(ap, fmt);
--      xenbus_va_dev_error(dev, err, fmt, ap);
++      _dev_error(dev, err, fmt, &ap);
         va_end(ap);
   
         xenbus_switch_state(dev, XenbusStateClosing);
@@@ -320,7 -320,7 +331,7 @@@ static void xenbus_switch_fatal(struct 
         va_list ap;
   
         va_start(ap, fmt);
--      xenbus_va_dev_error(dev, err, fmt, ap);
++      _dev_error(dev, err, fmt, &ap);
         va_end(ap);
   
         if (!depth)
@@@ -331,7 -331,7 +342,7 @@@
    * xenbus_grant_ring
    * @dev: xenbus device
    * @ring_mfn: mfn of ring to grant
--
++ *
    * Grant access to the given @ring_mfn to the peer of the given device.  Return
    * 0 on success, or -errno on error.  On error, the device will switch to
    * XenbusStateClosing, and the error will be saved in the store.
@@@ -357,7 -357,7 +368,7 @@@ int xenbus_alloc_evtchn(struct xenbus_d
         struct evtchn_alloc_unbound alloc_unbound;
         int err;
   
--      alloc_unbound.dom = DOMID_SELF;
++      alloc_unbound.dom        = DOMID_SELF;
         alloc_unbound.remote_dom = dev->otherend_id;
   
         err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
@@@ -372,6 -372,6 +383,7 @@@
   EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
   
   
++#if 0 /* !defined(CONFIG_XEN) && !defined(MODULE) */
   /**
    * Bind to an existing interdomain event channel in another domain. Returns 0
    * on success and stores the local port in *port. On error, returns -errno,
@@@ -397,6 -397,6 +409,7 @@@ int xenbus_bind_evtchn(struct xenbus_de
         return err;
   }
   EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
++#endif
   
   
   /**
@@@ -418,6 -418,6 +431,7 @@@ int xenbus_free_evtchn(struct xenbus_de
   EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
   
   
++#if 0 /* !defined(CONFIG_XEN) && !defined(MODULE) */
   /**
    * xenbus_map_ring_valloc
    * @dev: xenbus device
@@@ -432,7 -432,7 +446,7 @@@
    * or -ENOMEM on error. If an error is returned, device will switch to
    * XenbusStateClosing and the error message will be saved in XenStore.
    */
--int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
++int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t gnt_ref, void **vaddr)
   {
         struct gnttab_map_grant_ref op = {
                 .flags = GNTMAP_host_map,
@@@ -483,7 -483,7 +497,7 @@@ EXPORT_SYMBOL_GPL(xenbus_map_ring_vallo
    * or -ENOMEM on error. If an error is returned, device will switch to
    * XenbusStateClosing and the error message will be saved in XenStore.
    */
--int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
++int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t gnt_ref,
                     grant_handle_t *handle, void *vaddr)
   {
         struct gnttab_map_grant_ref op = {
@@@ -592,6 -592,6 +606,7 @@@ int xenbus_unmap_ring(struct xenbus_dev
         return op.status;
   }
   EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
++#endif
   
   
   /**
diff --cc drivers/xen/xenbus/xenbus_comms.c

index 090c61e,090c61e..721fd08
--- 1/drivers/xen/xenbus/xenbus_comms.c
--- 2/drivers/xen/xenbus/xenbus_comms.c
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@@ -35,24 -35,24 +35,57 @@@
   #include <linux/sched.h>
   #include <linux/err.h>
   #include <xen/xenbus.h>
++#if defined(CONFIG_XEN) || defined(MODULE)
++#include <xen/evtchn.h>
++#include <asm/hypervisor.h>
++#else
   #include <asm/xen/hypervisor.h>
   #include <xen/events.h>
   #include <xen/page.h>
++#endif
++
   #include "xenbus_comms.h"
   
--static int xenbus_irq;
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
   
--static DECLARE_WORK(probe_work, xenbus_probe);
++static int xenbus_irq;
   
   static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
   
   static irqreturn_t wake_waiting(int irq, void *unused)
   {
--      if (unlikely(xenstored_ready == 0)) {
--              xenstored_ready = 1;
--              schedule_work(&probe_work);
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++      static DECLARE_WORK(probe_work, xenbus_probe);
++      int old, new;
++
++      old = atomic_read(&xenbus_xsd_state);
++      switch (old) {
++              case XENBUS_XSD_UNCOMMITTED:
++                      BUG();
++                      return IRQ_HANDLED;
++
++              case XENBUS_XSD_FOREIGN_INIT:
++                      new = XENBUS_XSD_FOREIGN_READY;
++                      break;
++
++              case XENBUS_XSD_LOCAL_INIT:
++                      new = XENBUS_XSD_LOCAL_READY;
++                      break;
++
++              case XENBUS_XSD_FOREIGN_READY:
++              case XENBUS_XSD_LOCAL_READY:
++              default:
++                      goto wake;
         }
   
++      old = atomic_cmpxchg(&xenbus_xsd_state, old, new);
++      if (old != new)
++              schedule_work(&probe_work);
++
++wake:
++#endif
         wake_up(&xb_waitq);
         return IRQ_HANDLED;
   }
@@@ -203,32 -203,32 +236,48 @@@ int xb_read(void *data, unsigned len
   int xb_init_comms(void)
   {
         struct xenstore_domain_interface *intf = xen_store_interface;
++      int err;
   
         if (intf->req_prod != intf->req_cons)
--              printk(KERN_ERR "XENBUS request ring is not quiescent "
++              pr_err("XENBUS request ring is not quiescent "
                        "(%08x:%08x)!\n", intf->req_cons, intf->req_prod);
   
         if (intf->rsp_prod != intf->rsp_cons) {
--              printk(KERN_WARNING "XENBUS response ring is not quiescent "
--                     "(%08x:%08x): fixing up\n",
--                     intf->rsp_cons, intf->rsp_prod);
--              intf->rsp_cons = intf->rsp_prod;
++              pr_warning("XENBUS response ring is not quiescent"
++                         " (%08x:%08x): fixing up\n",
++                         intf->rsp_cons, intf->rsp_prod);
++              /* breaks kdump */
++              if (!reset_devices)
++                      intf->rsp_cons = intf->rsp_prod;
         }
   
++#if defined(CONFIG_XEN) || defined(MODULE)
++      if (xenbus_irq)
++              unbind_from_irqhandler(xenbus_irq, &xb_waitq);
++
++      err = bind_caller_port_to_irqhandler(
++              xen_store_evtchn, wake_waiting,
++              0, "xenbus", &xb_waitq);
++      if (err <= 0) {
++              pr_err("XENBUS request irq failed %i\n", err);
++              return err;
++      }
++
++      xenbus_irq = err;
++#else
         if (xenbus_irq) {
                 /* Already have an irq; assume we're resuming */
                 rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
         } else {
                 err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
                                                 0, "xenbus", &xb_waitq);
                 if (err <= 0) {
--                      printk(KERN_ERR "XENBUS request irq failed %i\n", err);
++                      pr_err("XENBUS request irq failed %i\n", err);
                         return err;
                 }
--
                 xenbus_irq = err;
         }
++#endif
   
         return 0;
   }
diff --cc drivers/xen/xenbus/xenbus_comms.h

index c21db75,c21db75..4779c00
--- 1/drivers/xen/xenbus/xenbus_comms.h
--- 2/drivers/xen/xenbus/xenbus_comms.h
+++ b/drivers/xen/xenbus/xenbus_comms.h
@@@ -43,4 -43,4 +43,27 @@@ int xs_input_avail(void)
   extern struct xenstore_domain_interface *xen_store_interface;
   extern int xen_store_evtchn;
   
++/* For xenbus internal use. */
++enum { /* values stored in xenbus_xsd_state */
++      XENBUS_XSD_UNCOMMITTED = 0, /* IOCTL_XENBUS_ALLOC only proceeds from this state */
++      XENBUS_XSD_FOREIGN_INIT, /* set by IOCTL_XENBUS_ALLOC; wake_waiting() advances it to FOREIGN_READY */
++      XENBUS_XSD_FOREIGN_READY, /* counted as "ready" by is_xenstored_ready() */
++      XENBUS_XSD_LOCAL_INIT, /* wake_waiting() advances it to LOCAL_READY */
++      XENBUS_XSD_LOCAL_READY, /* counted as "ready" by is_xenstored_ready() */
++};
++extern atomic_t xenbus_xsd_state;
++
++/* Return 1 once xenbus_xsd_state has reached either *_READY state, else 0. */
++static inline int is_xenstored_ready(void)
++{
++      switch (atomic_read(&xenbus_xsd_state)) {
++      case XENBUS_XSD_FOREIGN_READY:
++      case XENBUS_XSD_LOCAL_READY:
++              return 1;
++      default:
++              return 0;
++      }
++}
++
++#if defined(CONFIG_XEN_XENBUS_DEV) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
++#include <xen/interface/event_channel.h>
++#include <xen/interface/grant_table.h>
++
++int xenbus_conn(domid_t, grant_ref_t *, evtchn_port_t *);
++#endif
++
   #endif /* _XENBUS_COMMS_H */
diff --cc drivers/xen/xenbus/xenbus_dev.c

index 0000000,0000000..cc48948

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_dev.c
@@@ -1,0 -1,0 +1,475 @@@
++/*
++ * xenbus_dev.c
++ * 
++ * Driver giving user-space access to the kernel's xenbus connection
++ * to xenstore.
++ * 
++ * Copyright (c) 2005, Christian Limpach
++ * Copyright (c) 2005, Rusty Russell, IBM Corporation
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/kernel.h>
++#include <linux/slab.h>
++#include <linux/errno.h>
++#include <linux/uio.h>
++#include <linux/notifier.h>
++#include <linux/sched.h>
++#include <linux/wait.h>
++#include <linux/fs.h>
++#include <linux/poll.h>
++#include <linux/mutex.h>
++
++#include "xenbus_comms.h"
++
++#include <asm/uaccess.h>
++#include <asm/hypervisor.h>
++#include <xen/xenbus.h>
++#include <xen/xen_proc.h>
++#include <asm/hypervisor.h>
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++#include <xen/public/xenbus.h>
++
++struct xenbus_dev_transaction { /* one transaction started through an open file */
++      struct list_head list; /* link in xenbus_dev_data.transactions */
++      struct xenbus_transaction handle; /* handle.id is parsed from the XS_TRANSACTION_START reply */
++};
++
++struct read_buffer { /* one queued chunk of reply bytes awaiting read() */
++      struct list_head list; /* link in xenbus_dev_data.read_buffers */
++      unsigned int cons; /* number of bytes of msg[] already handed to the reader */
++      unsigned int len; /* total number of bytes stored in msg[] */
++      char msg[]; /* payload, filled by queue_reply() */
++};
++
++struct xenbus_dev_data { /* per-open state, stored in filp->private_data */
++      /* In-progress transactions (struct xenbus_dev_transaction). */
++      struct list_head transactions;
++
++      /* Active watches (struct watch_adapter). */
++      struct list_head watches;
++
++      /* Partial request, accumulated across write() calls. */
++      unsigned int len; /* bytes currently held in u.buffer */
++      union {
++              struct xsd_sockmsg msg; /* header view of the start of the buffer */
++              char buffer[PAGE_SIZE]; /* raw bytes: header followed by payload */
++      } u;
++
++      /* Response queue (struct read_buffer), drained by read(). */
++      struct list_head read_buffers;
++      wait_queue_head_t read_waitq; /* woken by queue_reply(); used by read() and poll() */
++
++      struct mutex reply_mutex; /* held while queueing replies and while read() consumes them */
++};
++
++static struct proc_dir_entry *xenbus_dev_intf;
++
++/*
++ * read() handler: copy queued reply bytes to user space.
++ *
++ * Waits (unless O_NONBLOCK is set) until at least one reply buffer is
++ * queued, then drains buffers in queue order until @len bytes have been
++ * copied or the queue is empty.
++ *
++ * Returns the number of bytes copied, -ENODEV if xenstored is not ready,
++ * -EAGAIN if the queue is empty and O_NONBLOCK is set, the error from
++ * wait_event_interruptible() if the wait is interrupted, or -EFAULT if
++ * @ubuf faults before any byte could be copied.
++ */
++static ssize_t xenbus_dev_read(struct file *filp,
++                             char __user *ubuf,
++                             size_t len, loff_t *ppos)
++{
++      struct xenbus_dev_data *u = filp->private_data;
++      struct read_buffer *rb;
++      size_t done = 0;
++      int ret;
++
++      if (!is_xenstored_ready())
++              return -ENODEV;
++
++      mutex_lock(&u->reply_mutex);
++      while (list_empty(&u->read_buffers)) {
++              mutex_unlock(&u->reply_mutex);
++              if (filp->f_flags & O_NONBLOCK)
++                      return -EAGAIN;
++
++              ret = wait_event_interruptible(u->read_waitq,
++                                             !list_empty(&u->read_buffers));
++              if (ret)
++                      return ret;
++              mutex_lock(&u->reply_mutex);
++      }
++
++      rb = list_entry(u->read_buffers.next, struct read_buffer, list);
++      while (done < len) {
++              /* Copy as much of the current buffer as the caller has room for. */
++              size_t chunk = min_t(size_t, len - done, rb->len - rb->cons);
++              unsigned long left = copy_to_user(ubuf + done,
++                                                rb->msg + rb->cons, chunk);
++
++              done += chunk - left;
++              rb->cons += chunk - left;
++
++              if (left) {
++                      /*
++                       * The user buffer faulted.  Report a short read, or
++                       * -EFAULT if nothing was transferred.  Bytes that
++                       * were not copied stay queued, because cons only
++                       * advanced past what was actually delivered.
++                       */
++                      mutex_unlock(&u->reply_mutex);
++                      return done ? (ssize_t)done : -EFAULT;
++              }
++
++              /* Drop the buffer once it has been fully consumed. */
++              if (rb->cons == rb->len) {
++                      list_del(&rb->list);
++                      kfree(rb);
++                      if (list_empty(&u->read_buffers))
++                              break;
++                      rb = list_entry(u->read_buffers.next,
++                                      struct read_buffer, list);
++              }
++      }
++      mutex_unlock(&u->reply_mutex);
++
++      return done;
++}
++
++/*
++ * Append a copy of @len bytes from @data to @u's response queue and wake
++ * any reader sleeping on read_waitq.  A zero-length reply is ignored.
++ * Allocation failure is treated as fatal (BUG_ON).
++ */
++static void queue_reply(struct xenbus_dev_data *u,
++                      char *data, unsigned int len)
++{
++      struct read_buffer *buf;
++
++      if (!len)
++              return;
++
++      buf = kmalloc(sizeof(*buf) + len, GFP_KERNEL);
++      BUG_ON(!buf);
++
++      memcpy(buf->msg, data, len);
++      buf->len = len;
++      buf->cons = 0;
++
++      list_add_tail(&buf->list, &u->read_buffers);
++      wake_up(&u->read_waitq);
++}
++
++struct watch_adapter /* ties a registered xenbus watch to the file that requested it */
++{
++      struct list_head list; /* link in xenbus_dev_data.watches */
++      struct xenbus_watch watch; /* node is a kmalloc'ed copy of the path; callback is watch_fired */
++      struct xenbus_dev_data *dev_data; /* file state that receives the watch events */
++      char *token; /* kmalloc'ed copy of the user-supplied token */
++};
++
++/* Release a watch adapter together with its token and watched-node strings. */
++static void free_watch_adapter(struct watch_adapter *watch)
++{
++      kfree(watch->token);
++      kfree(watch->watch.node);
++      kfree(watch);
++}
++
++/*
++ * Watch callback: forward a fired watch to the file that registered it as
++ * an XS_WATCH_EVENT message made of a header, the path, the file's own
++ * token and, when more than two vector entries are present, the extra data.
++ */
++static void watch_fired(struct xenbus_watch *watch,
++                      const char **vec,
++                      unsigned int len)
++{
++      struct watch_adapter *adap =
++              container_of(watch, struct watch_adapter, watch);
++      struct xenbus_dev_data *owner = adap->dev_data;
++      const char *path = vec[XS_WATCH_PATH];
++      const char *token = adap->token;
++      int path_len = strlen(path) + 1;
++      int tok_len = strlen(token) + 1;
++      int data_len = 0;
++      struct xsd_sockmsg hdr;
++
++      /*
++       * NOTE(review): this relies on vec[len] being a valid pointer to the
++       * end of the string area - confirm against the code that builds vec.
++       */
++      if (len > 2)
++              data_len = vec[len] - vec[2] + 1;
++
++      hdr.type = XS_WATCH_EVENT;
++      hdr.len = path_len + tok_len + data_len;
++
++      /* Queue all pieces under one lock hold so they stay contiguous. */
++      mutex_lock(&owner->reply_mutex);
++      queue_reply(owner, (char *)&hdr, sizeof(hdr));
++      queue_reply(owner, (char *)path, path_len);
++      queue_reply(owner, (char *)token, tok_len);
++      if (len > 2)
++              queue_reply(owner, (char *)vec[2], data_len);
++      mutex_unlock(&owner->reply_mutex);
++}
++
++static LIST_HEAD(watch_list);
++
++/*
++ * write() handler: accumulate a xenstore request from user space and act
++ * on it once the complete message (header plus u.msg.len payload bytes)
++ * has been received.  A message may be supplied over several write() calls.
++ *
++ * XS_WATCH and XS_UNWATCH are handled locally by registering or removing a
++ * watch adapter and queueing an "OK" reply.  Every other message type is
++ * passed to xenbus_dev_request_and_reply() and the reply is queued for
++ * read().  Transactions started through this file are tracked so that
++ * release() can end them.
++ *
++ * Returns @len on success (including when the message is still incomplete),
++ * -ENODEV if xenstored is not ready, -EINVAL if the message cannot fit in
++ * the request buffer, -EFAULT on a bad user pointer, -EILSEQ if a watch
++ * payload lacks a NUL-terminated path or token, -ENOMEM on allocation
++ * failure, or the error returned by the watch/request helpers.  On any
++ * return other than "message incomplete" the partial-request state is reset.
++ */
++static ssize_t xenbus_dev_write(struct file *filp,
++                              const char __user *ubuf,
++                              size_t len, loff_t *ppos)
++{
++      struct xenbus_dev_data *u = filp->private_data;
++      struct xenbus_dev_transaction *trans = NULL;
++      uint32_t msg_type;
++      void *reply;
++      char *path, *token;
++      struct watch_adapter *watch, *tmp_watch;
++      int err, rc = len;
++
++      if (!is_xenstored_ready())
++              return -ENODEV;
++
++      /* u->len never exceeds the buffer size, so this cannot wrap. */
++      if (len > sizeof(u->u.buffer) - u->len) {
++              rc = -EINVAL;
++              goto out;
++      }
++
++      if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0) {
++              rc = -EFAULT;
++              goto out;
++      }
++
++      u->len += len;
++      if (u->len < sizeof(u->u.msg))
++              return rc;
++
++      /*
++       * Reject payload lengths that can never fit in the buffer.  This
++       * also keeps the size sum below from wrapping on 32-bit builds.
++       */
++      if (u->u.msg.len > sizeof(u->u.buffer) - sizeof(u->u.msg)) {
++              rc = -EINVAL;
++              goto out;
++      }
++
++      if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
++              return rc;
++
++      msg_type = u->u.msg.type;
++
++      switch (msg_type) {
++      case XS_WATCH:
++      case XS_UNWATCH: {
++              static const char *XS_RESP = "OK";
++              struct xsd_sockmsg hdr;
++
++              path = u->u.buffer + sizeof(u->u.msg);
++              token = memchr(path, 0, u->u.msg.len);
++              if (token == NULL) {
++                      rc = -EILSEQ;
++                      goto out;
++              }
++              token++;
++              /* The token must be NUL-terminated inside the payload too. */
++              if (memchr(token, 0, u->u.msg.len - (token - path)) == NULL) {
++                      rc = -EILSEQ;
++                      goto out;
++              }
++
++              if (msg_type == XS_WATCH) {
++                      watch = kzalloc(sizeof(*watch), GFP_KERNEL);
++                      if (watch == NULL) {
++                              rc = -ENOMEM;
++                              goto out;
++                      }
++                      watch->watch.node = kstrdup(path, GFP_KERNEL);
++                      watch->watch.callback = watch_fired;
++                      watch->token = kstrdup(token, GFP_KERNEL);
++                      watch->dev_data = u;
++
++                      /* free_watch_adapter() copes with NULL members. */
++                      if (watch->watch.node && watch->token)
++                              err = register_xenbus_watch(&watch->watch);
++                      else
++                              err = -ENOMEM;
++                      if (err) {
++                              free_watch_adapter(watch);
++                              rc = err;
++                              goto out;
++                      }
++
++                      list_add(&watch->list, &u->watches);
++              } else {
++                      list_for_each_entry_safe(watch, tmp_watch,
++                                                 &u->watches, list) {
++                              if (!strcmp(watch->token, token) &&
++                                  !strcmp(watch->watch.node, path))
++                              {
++                                      unregister_xenbus_watch(&watch->watch);
++                                      list_del(&watch->list);
++                                      free_watch_adapter(watch);
++                                      break;
++                              }
++                      }
++              }
++
++              hdr.type = msg_type;
++              hdr.len = strlen(XS_RESP) + 1;
++              mutex_lock(&u->reply_mutex);
++              queue_reply(u, (char *)&hdr, sizeof(hdr));
++              queue_reply(u, (char *)XS_RESP, hdr.len);
++              mutex_unlock(&u->reply_mutex);
++              break;
++      }
++
++      default:
++              /* Allocate up front so a started transaction is never lost. */
++              if (msg_type == XS_TRANSACTION_START) {
++                      trans = kmalloc(sizeof(*trans), GFP_KERNEL);
++                      if (!trans) {
++                              rc = -ENOMEM;
++                              goto out;
++                      }
++              }
++
++              reply = xenbus_dev_request_and_reply(&u->u.msg);
++              if (IS_ERR(reply)) {
++                      kfree(trans);
++                      rc = PTR_ERR(reply);
++                      goto out;
++              }
++
++              if (msg_type == XS_TRANSACTION_START) {
++                      trans->handle.id = simple_strtoul(reply, NULL, 0);
++                      list_add(&trans->list, &u->transactions);
++              } else if (msg_type == XS_TRANSACTION_END) {
++                      struct xenbus_dev_transaction *found = NULL;
++
++                      list_for_each_entry(trans, &u->transactions, list)
++                              if (trans->handle.id == u->u.msg.tx_id) {
++                                      found = trans;
++                                      break;
++                              }
++                      /*
++                       * The id comes from user space.  An id this file
++                       * does not track has nothing to unlink and must
++                       * not bring the kernel down.
++                       */
++                      if (found) {
++                              list_del(&found->list);
++                              kfree(found);
++                      }
++              }
++              mutex_lock(&u->reply_mutex);
++              queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
++              queue_reply(u, (char *)reply, u->u.msg.len);
++              mutex_unlock(&u->reply_mutex);
++              kfree(reply);
++              break;
++      }
++
++ out:
++      u->len = 0;
++      return rc;
++}
++
++/*
++ * open() handler: allocate and initialise the per-open state.
++ * Returns -ENOENT if no xenstore event channel is set and -ENOMEM if the
++ * state cannot be allocated.
++ */
++static int xenbus_dev_open(struct inode *inode, struct file *filp)
++{
++      struct xenbus_dev_data *priv;
++
++      if (!xen_store_evtchn)
++              return -ENOENT;
++
++      nonseekable_open(inode, filp);
++
++      priv = kzalloc(sizeof(*priv), GFP_KERNEL);
++      if (!priv)
++              return -ENOMEM;
++
++      mutex_init(&priv->reply_mutex);
++      init_waitqueue_head(&priv->read_waitq);
++      INIT_LIST_HEAD(&priv->read_buffers);
++      INIT_LIST_HEAD(&priv->watches);
++      INIT_LIST_HEAD(&priv->transactions);
++
++      filp->private_data = priv;
++      return 0;
++}
++
++/*
++ * release() handler: end every transaction still open on this file,
++ * unregister its watches, discard unread replies and free the state.
++ */
++static int xenbus_dev_release(struct inode *inode, struct file *filp)
++{
++      struct xenbus_dev_data *u = filp->private_data;
++
++      while (!list_empty(&u->transactions)) {
++              struct xenbus_dev_transaction *trans =
++                      list_entry(u->transactions.next,
++                                 struct xenbus_dev_transaction, list);
++
++              /* Second argument 1: end the transaction as an abort. */
++              xenbus_transaction_end(trans->handle, 1);
++              list_del(&trans->list);
++              kfree(trans);
++      }
++
++      while (!list_empty(&u->watches)) {
++              struct watch_adapter *adap =
++                      list_entry(u->watches.next,
++                                 struct watch_adapter, list);
++
++              unregister_xenbus_watch(&adap->watch);
++              list_del(&adap->list);
++              free_watch_adapter(adap);
++      }
++
++      while (!list_empty(&u->read_buffers)) {
++              struct read_buffer *rb =
++                      list_entry(u->read_buffers.next,
++                                 struct read_buffer, list);
++
++              list_del(&rb->list);
++              kfree(rb);
++      }
++
++      kfree(u);
++      return 0;
++}
++
++/*
++ * poll() handler: report readability when reply data is queued.
++ *
++ * The return value is a poll event mask, not an errno.  Returning a
++ * negative errno here would be converted to unsigned and set every event
++ * bit, so the "xenstored not ready" condition is reported as POLLERR.
++ */
++static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait)
++{
++      struct xenbus_dev_data *u = file->private_data;
++
++      if (!is_xenstored_ready())
++              return POLLERR;
++
++      poll_wait(file, &u->read_waitq, wait);
++      if (!list_empty(&u->read_buffers))
++              return POLLIN | POLLRDNORM;
++      return 0;
++}
++
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++/*
++ * ioctl() handler, only available in the initial Xen domain.
++ *
++ * IOCTL_XENBUS_ALLOC moves xenbus_xsd_state from UNCOMMITTED to
++ * FOREIGN_INIT, calls xenbus_conn() with the domain supplied by user space
++ * and copies the resulting grant reference and port back.  Any failure
++ * after the state change puts the state back to UNCOMMITTED.
++ *
++ * Returns 0 on success, -ENODEV outside the initial domain, -EBUSY if the
++ * state was not UNCOMMITTED, -EFAULT on a bad user pointer, the error from
++ * xenbus_conn(), or -ENOTTY for an unknown command.
++ */
++static long xenbus_dev_ioctl(struct file *file,
++                             unsigned int cmd, unsigned long data)
++{
++      void __user *udata = (void __user *) data;
++      int ret = -ENOTTY;
++
++      if (!is_initial_xendomain())
++              return -ENODEV;
++
++      switch (cmd) {
++      case IOCTL_XENBUS_ALLOC: {
++              xenbus_alloc_t xa;
++              int old;
++
++              old = atomic_cmpxchg(&xenbus_xsd_state,
++                                   XENBUS_XSD_UNCOMMITTED,
++                                   XENBUS_XSD_FOREIGN_INIT);
++              if (old != XENBUS_XSD_UNCOMMITTED)
++                      return -EBUSY;
++
++              if (copy_from_user(&xa, udata, sizeof(xa))) {
++                      ret = -EFAULT;
++              } else {
++                      ret = xenbus_conn(xa.dom, &xa.grant_ref, &xa.port);
++                      if (ret == 0 && copy_to_user(udata, &xa, sizeof(xa)))
++                              ret = -EFAULT;
++              }
++
++              /* Single rollback point for every failure path above. */
++              if (ret != 0)
++                      atomic_set(&xenbus_xsd_state, XENBUS_XSD_UNCOMMITTED);
++              break;
++      }
++
++      default:
++              break;
++      }
++
++      return ret;
++}
++#endif
++
++/* File operations installed on the "xenbus" proc entry. */
++static const struct file_operations xenbus_dev_file_ops = {
++      .open           = xenbus_dev_open,
++      .release        = xenbus_dev_release,
++      .read           = xenbus_dev_read,
++      .write          = xenbus_dev_write,
++      .poll           = xenbus_dev_poll,
++      .llseek         = no_llseek,
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++      .unlocked_ioctl = xenbus_dev_ioctl,
++#endif
++};
++
++/*
++ * Create the "xenbus" proc entry (mode 0400) and attach the file
++ * operations to it.  Always returns 0, even if the entry could not be
++ * created.
++ */
++int
++#ifdef MODULE
++__devinit
++#else
++__init
++#endif
++xenbus_dev_init(void)
++{
++      struct proc_dir_entry *entry = create_xen_proc_entry("xenbus", 0400);
++
++      xenbus_dev_intf = entry;
++      if (entry)
++              entry->proc_fops = &xenbus_dev_file_ops;
++
++      return 0;
++}
diff --cc drivers/xen/xenbus/xenbus_probe.c

index 7397695,7397695..760fc62
--- 1/drivers/xen/xenbus/xenbus_probe.c
--- 2/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@@ -4,6 -4,6 +4,7 @@@
    * Copyright (C) 2005 Rusty Russell, IBM Corporation
    * Copyright (C) 2005 Mike Wray, Hewlett-Packard
    * Copyright (C) 2005, 2006 XenSource Ltd
++ * Copyright (C) 2007 Solarflare Communications, Inc.
    *
    * This program is free software; you can redistribute it and/or
    * modify it under the terms of the GNU General Public License version 2
@@@ -32,23 -32,23 +33,34 @@@
   
   #define DPRINTK(fmt, args...)                         \
         pr_debug("xenbus_probe (%s:%d) " fmt ".\n",     \
--               __func__, __LINE__, ##args)
++               __FUNCTION__, __LINE__, ##args)
   
   #include <linux/kernel.h>
++#include <linux/version.h>
   #include <linux/err.h>
   #include <linux/string.h>
   #include <linux/ctype.h>
   #include <linux/fcntl.h>
   #include <linux/mm.h>
++#include <linux/sched.h>
   #include <linux/proc_fs.h>
   #include <linux/notifier.h>
--#include <linux/kthread.h>
   #include <linux/mutex.h>
   #include <linux/io.h>
   #include <linux/slab.h>
   
   #include <asm/page.h>
   #include <asm/pgtable.h>
++#if defined(CONFIG_XEN) || defined(MODULE)
++#include <asm/hypervisor.h>
++#include <xen/xenbus.h>
++#include <xen/xen_proc.h>
++#include <xen/evtchn.h>
++#include <xen/features.h>
++#include <xen/gnttab.h>
++
++#define PARAVIRT_EXPORT_SYMBOL(sym) __typeof__(sym) sym
++#else
   #include <asm/xen/hypervisor.h>
   
   #include <xen/xen.h>
@@@ -56,21 -56,21 +68,39 @@@
   #include <xen/events.h>
   #include <xen/page.h>
   
++#define PARAVIRT_EXPORT_SYMBOL EXPORT_SYMBOL_GPL
++#endif
++
++#ifndef CONFIG_XEN
   #include <xen/hvm.h>
++#endif
   
   #include "xenbus_comms.h"
   #include "xenbus_probe.h"
   
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
   
   int xen_store_evtchn;
--EXPORT_SYMBOL_GPL(xen_store_evtchn);
++PARAVIRT_EXPORT_SYMBOL(xen_store_evtchn);
   
   struct xenstore_domain_interface *xen_store_interface;
--EXPORT_SYMBOL_GPL(xen_store_interface);
++PARAVIRT_EXPORT_SYMBOL(xen_store_interface);
   
   static unsigned long xen_store_mfn;
   
--static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
++extern struct mutex xenwatch_mutex;
++
++static
++#ifdef CONFIG_XEN_UNPRIVILEGED_GUEST
++__initdata
++#endif
++BLOCKING_NOTIFIER_HEAD(xenstore_chain);
++
++#if defined(CONFIG_XEN) || defined(MODULE)
++static void wait_for_devices(struct xenbus_driver *xendrv);
++#endif
   
   /* If something in array of ids matches this device, return it. */
   static const struct xenbus_device_id *
@@@ -92,7 -92,7 +122,7 @@@ int xenbus_match(struct device *_dev, s
   
         return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
   }
--EXPORT_SYMBOL_GPL(xenbus_match);
++PARAVIRT_EXPORT_SYMBOL(xenbus_match);
   
   
   static void free_otherend_details(struct xenbus_device *dev)
@@@ -112,29 -112,29 +142,6 @@@ static void free_otherend_watch(struct 
   }
   
   
--static int talk_to_otherend(struct xenbus_device *dev)
--{
--      struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
--
--      free_otherend_watch(dev);
--      free_otherend_details(dev);
--
--      return drv->read_otherend_details(dev);
--}
--
--
--
--static int watch_otherend(struct xenbus_device *dev)
--{
--      struct xen_bus_type *bus =
--              container_of(dev->dev.bus, struct xen_bus_type, bus);
--
--      return xenbus_watch_pathfmt(dev, &dev->otherend_watch,
--                                  bus->otherend_changed,
--                                  "%s/%s", dev->otherend, "state");
--}
--
--
   int xenbus_read_otherend_details(struct xenbus_device *xendev,
                                  char *id_node, char *path_node)
   {
@@@ -160,11 -160,11 +167,22 @@@
   
         return 0;
   }
--EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
++PARAVIRT_EXPORT_SYMBOL(xenbus_read_otherend_details);
++
++#if defined(CONFIG_XEN) || defined(MODULE)
++
++static int read_backend_details(struct xenbus_device *xendev)
++{
++      return xenbus_read_otherend_details(xendev, "backend-id", "backend");
++}
   
++static void otherend_changed(struct xenbus_watch *watch,
++                           const char **vec, unsigned int len)
++#else /* !CONFIG_XEN && !MODULE */
   void xenbus_otherend_changed(struct xenbus_watch *watch,
                              const char **vec, unsigned int len,
                              int ignore_on_shutdown)
++#endif /* CONFIG_XEN || MODULE */
   {
         struct xenbus_device *dev =
                 container_of(watch, struct xenbus_device, otherend_watch);
@@@ -176,31 -176,31 +194,69 @@@
         if (!dev->otherend ||
             strncmp(dev->otherend, vec[XS_WATCH_PATH],
                     strlen(dev->otherend))) {
--              dev_dbg(&dev->dev, "Ignoring watch at %s\n",
--                      vec[XS_WATCH_PATH]);
++              dev_dbg(&dev->dev, "Ignoring watch at %s", vec[XS_WATCH_PATH]);
                 return;
         }
   
         state = xenbus_read_driver_state(dev->otherend);
   
--      dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n",
++      dev_dbg(&dev->dev, "state is %d (%s), %s, %s",
                 state, xenbus_strstate(state), dev->otherend_watch.node,
                 vec[XS_WATCH_PATH]);
   
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
         /*
          * Ignore xenbus transitions during shutdown. This prevents us doing
          * work that can fail e.g., when the rootfs is gone.
          */
         if (system_state > SYSTEM_RUNNING) {
++              /* If we're frontend, drive the state machine to Closed. */
++              /* This should cause the backend to release our resources. */
++# if defined(CONFIG_XEN) || defined(MODULE)
++              const struct xen_bus_type *bus =
++                      container_of(dev->dev.bus, struct xen_bus_type, bus);
++              int ignore_on_shutdown = (bus->levels == 2);
++# endif
++
                 if (ignore_on_shutdown && (state == XenbusStateClosing))
                         xenbus_frontend_closed(dev);
                 return;
         }
++#endif
   
         if (drv->otherend_changed)
                 drv->otherend_changed(dev, state);
   }
--EXPORT_SYMBOL_GPL(xenbus_otherend_changed);
++PARAVIRT_EXPORT_SYMBOL(xenbus_otherend_changed);
++
++
++static int talk_to_otherend(struct xenbus_device *dev)
++{
++      struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
++
++      free_otherend_watch(dev);
++      free_otherend_details(dev);
++
++      return drv->read_otherend_details(dev);
++}
++
++
++
++static int watch_otherend(struct xenbus_device *dev)
++{
++#if defined(CONFIG_XEN) || defined(MODULE)
++      return xenbus_watch_path2(dev, dev->otherend, "state",
++                                &dev->otherend_watch, otherend_changed);
++#else
++      struct xen_bus_type *bus =
++              container_of(dev->dev.bus, struct xen_bus_type, bus);
++
++      return xenbus_watch_pathfmt(dev, &dev->otherend_watch,
++                                  bus->otherend_changed,
++                                  "%s/%s", dev->otherend, "state");
++#endif
++}
++
   
   int xenbus_dev_probe(struct device *_dev)
   {
@@@ -224,8 -224,8 +280,9 @@@
   
         err = talk_to_otherend(dev);
         if (err) {
--              dev_warn(&dev->dev, "talk_to_otherend on %s failed.\n",
--                       dev->nodename);
++              dev_warn(&dev->dev,
++                       "xenbus_probe: talk_to_otherend on %s failed.\n",
++                       dev->nodename);
                 return err;
         }
   
@@@ -235,8 -235,8 +292,9 @@@
   
         err = watch_otherend(dev);
         if (err) {
--              dev_warn(&dev->dev, "watch_otherend on %s failed.\n",
--                     dev->nodename);
++              dev_warn(&dev->dev,
++                       "xenbus_probe: watch_otherend on %s failed.\n",
++                       dev->nodename);
                 return err;
         }
   
@@@ -244,9 -244,9 +302,13 @@@
   fail:
         xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
         xenbus_switch_state(dev, XenbusStateClosed);
++#if defined(CONFIG_XEN) || defined(MODULE)
++      return -ENODEV;
++#else
         return err;
++#endif
   }
--EXPORT_SYMBOL_GPL(xenbus_dev_probe);
++PARAVIRT_EXPORT_SYMBOL(xenbus_dev_probe);
   
   int xenbus_dev_remove(struct device *_dev)
   {
@@@ -264,7 -264,7 +326,7 @@@
         xenbus_switch_state(dev, XenbusStateClosed);
         return 0;
   }
--EXPORT_SYMBOL_GPL(xenbus_dev_remove);
++PARAVIRT_EXPORT_SYMBOL(xenbus_dev_remove);
   
   void xenbus_dev_shutdown(struct device *_dev)
   {
@@@ -273,35 -273,35 +335,64 @@@
   
         DPRINTK("%s", dev->nodename);
   
++/* Commented out since xenstored stubdom is now minios based not linux based
++#define XENSTORE_DOMAIN_SHARES_THIS_KERNEL
++*/
++#ifndef XENSTORE_DOMAIN_SHARES_THIS_KERNEL
++      if (is_initial_xendomain())
++#endif
++              return;
++
         get_device(&dev->dev);
         if (dev->state != XenbusStateConnected) {
--              printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__,
--                     dev->nodename, xenbus_strstate(dev->state));
++              dev_info(&dev->dev, "%s: %s: %s != Connected, skipping\n", __FUNCTION__,
++                       dev->nodename, xenbus_strstate(dev->state));
                 goto out;
         }
         xenbus_switch_state(dev, XenbusStateClosing);
++
++      if (!strcmp(dev->devicetype, "vfb"))
++              goto out;
++
         timeout = wait_for_completion_timeout(&dev->down, timeout);
         if (!timeout)
--              printk(KERN_INFO "%s: %s timeout closing device\n",
--                     __func__, dev->nodename);
++              dev_info(&dev->dev, "%s: %s timeout closing device\n",
++                       __FUNCTION__, dev->nodename);
    out:
         put_device(&dev->dev);
   }
--EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
++PARAVIRT_EXPORT_SYMBOL(xenbus_dev_shutdown);
   
   int xenbus_register_driver_common(struct xenbus_driver *drv,
                                   struct xen_bus_type *bus,
                                   struct module *owner,
                                   const char *mod_name)
   {
++      int ret;
++
++      if (bus->error)
++              return bus->error;
++
         drv->driver.name = drv->name;
         drv->driver.bus = &bus->bus;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
         drv->driver.owner = owner;
++#endif
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21)
         drv->driver.mod_name = mod_name;
++#endif
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
++      drv->driver.probe = xenbus_dev_probe;
++      drv->driver.remove = xenbus_dev_remove;
++      drv->driver.shutdown = xenbus_dev_shutdown;
++#endif
   
--      return driver_register(&drv->driver);
++      mutex_lock(&xenwatch_mutex);
++      ret = driver_register(&drv->driver);
++      mutex_unlock(&xenwatch_mutex);
++      return ret;
   }
--EXPORT_SYMBOL_GPL(xenbus_register_driver_common);
++PARAVIRT_EXPORT_SYMBOL(xenbus_register_driver_common);
   
   void xenbus_unregister_driver(struct xenbus_driver *drv)
   {
@@@ -379,21 -379,21 +470,30 @@@ static void xenbus_dev_release(struct d
   }
   
   static ssize_t xendev_show_nodename(struct device *dev,
--                                  struct device_attribute *attr, char *buf)
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
++                                  struct device_attribute *attr,
++#endif
++                                  char *buf)
   {
         return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
   }
   static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
   
   static ssize_t xendev_show_devtype(struct device *dev,
--                                 struct device_attribute *attr, char *buf)
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
++                                 struct device_attribute *attr,
++#endif
++                                 char *buf)
   {
         return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
   }
   static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
   
   static ssize_t xendev_show_modalias(struct device *dev,
--                                  struct device_attribute *attr, char *buf)
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
++                                  struct device_attribute *attr,
++#endif
++                                  char *buf)
   {
         return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
   }
@@@ -403,7 -403,7 +503,6 @@@ int xenbus_probe_node(struct xen_bus_ty
                       const char *type,
                       const char *nodename)
   {
--      char devname[XEN_BUS_ID_SIZE];
         int err;
         struct xenbus_device *xendev;
         size_t stringlen;
@@@ -411,6 -411,6 +510,9 @@@
   
         enum xenbus_state state = xenbus_read_driver_state(nodename);
   
++      if (bus->error)
++              return bus->error;
++
         if (state != XenbusStateInitialising) {
                 /* Device is not new, so ignore it.  This can happen if a
                    device is going away after switching to Closed.  */
@@@ -435,15 -435,15 +537,26 @@@
         xendev->devicetype = tmpstring;
         init_completion(&xendev->down);
   
++#if defined(CONFIG_XEN) || defined(MODULE)
++      xendev->dev.parent = &bus->dev;
++#endif
         xendev->dev.bus = &bus->bus;
         xendev->dev.release = xenbus_dev_release;
   
--      err = bus->get_bus_id(devname, xendev->nodename);
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
++      {
++              char devname[XEN_BUS_ID_SIZE];
++
++              err = bus->get_bus_id(devname, xendev->nodename);
++              if (!err)
++                      dev_set_name(&xendev->dev, devname);
++      }
++#else
++      err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
++#endif
         if (err)
                 goto fail;
   
--      dev_set_name(&xendev->dev, devname);
--
         /* Register with generic device framework. */
         err = device_register(&xendev->dev);
         if (err)
@@@ -472,7 -472,7 +585,121 @@@ fail
         kfree(xendev);
         return err;
   }
--EXPORT_SYMBOL_GPL(xenbus_probe_node);
++PARAVIRT_EXPORT_SYMBOL(xenbus_probe_node);
++
++#if defined(CONFIG_XEN) || defined(MODULE)
++
++/*
++ * Derive the driver-model bus id from a frontend xenstore node name:
++ *
++ *     device/<type>/<id> => <type>-<id>
++ *
++ * @bus_id:   output buffer of XEN_BUS_ID_SIZE bytes
++ * @nodename: xenstore path of the frontend node
++ *
++ * Returns 0 on success, or -EINVAL when @nodename has no '/', when the part
++ * after the first '/' does not fit in @bus_id, or when that part contains no
++ * further '/' separating <type> from <id>.
++ */
++static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
++{
++      /* Keep @nodename intact so the warning can show the full path. */
++      const char *rel = strchr(nodename, '/');
++      char *sep;
++
++      if (!rel || strlen(rel + 1) >= XEN_BUS_ID_SIZE) {
++              pr_warning("XENBUS: bad frontend %s\n", nodename);
++              return -EINVAL;
++      }
++
++      strlcpy(bus_id, rel + 1, XEN_BUS_ID_SIZE);
++      sep = strchr(bus_id, '/');
++      if (!sep) {
++              pr_warning("XENBUS: bus_id %s no slash\n", bus_id);
++              return -EINVAL;
++      }
++      *sep = '-';
++      return 0;
++}
++
++/*
++ * Probe one frontend device found at <bus->root>/<type>/<name>
++ * (device/<typename>/<name> for the frontend bus).
++ *
++ * Devices of type "console" are skipped and reported as success.  For every
++ * other type the full node name is built, handed to xenbus_probe_node() and
++ * freed again.
++ *
++ * Returns 0 for skipped devices, -ENOMEM if the node name cannot be
++ * allocated, otherwise the result of xenbus_probe_node().
++ */
++static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type,
++                               const char *name)
++{
++      char *nodename;
++      int err;
++
++      if (!strcmp(type, "console"))
++              return 0;
++
++      nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name);
++      if (!nodename)
++              return -ENOMEM;
++
++      DPRINTK("%s", nodename);
++
++      err = xenbus_probe_node(bus, type, nodename);
++      kfree(nodename);
++      return err;
++}
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
++/*
++ * uevent callback of the frontend bus (only built for kernels >= 2.6.16).
++ *
++ * Adds XENBUS_TYPE, XENBUS_PATH and MODALIAS for @dev to @env.
++ *
++ * Returns 0 on success, -ENODEV if @dev (or the xenbus device derived from
++ * it) is NULL, and -ENOMEM if any of the variables cannot be added.
++ */
++static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env)
++{
++      struct xenbus_device *xdev;
++
++      if (dev == NULL)
++              return -ENODEV;
++      xdev = to_xenbus_device(dev);
++      if (xdev == NULL)
++              return -ENODEV;
++
++      /* stuff we want to pass to /sbin/hotplug */
++      if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype) ||
++          add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename) ||
++          add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype))
++              return -ENOMEM;
++
++      return 0;
++}
++#endif
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
++/*
++ * Attribute table installed as .dev_attrs of the frontend bus on kernels
++ * >= 2.6.29.  It holds only the terminating entry.
++ */
++static struct device_attribute xenbus_dev_attrs[] = {
++      __ATTR_NULL
++};
++#endif
++
++/*
++ * Bus type for frontend drivers.
++ *
++ * Frontend devices live below the "device" xenstore directory, two levels
++ * deep (device/<type>/<id>).  .error starts out as -ENODEV and is replaced
++ * by the result of bus_register() in xenbus_init(); the other functions in
++ * this file check it before touching the bus.
++ */
++static struct xen_bus_type xenbus_frontend = {
++      .root = "device",
++      .levels = 2,            /* device/type/<id> */
++      .get_bus_id = frontend_bus_id,
++      .probe = xenbus_probe_frontend,
++      .error = -ENODEV,
++      .bus = {
++              .name      = "xen",
++              .match     = xenbus_match,
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
++              .probe     = xenbus_dev_probe,
++              .remove    = xenbus_dev_remove,
++              .shutdown  = xenbus_dev_shutdown,
++              .uevent    = xenbus_uevent_frontend,
++#endif
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
++              .dev_attrs = xenbus_dev_attrs,
++#endif
++      },
++      .dev = {
++              .init_name = "xen",
++      },
++};
++
++/*
++ * Register a frontend driver on the xenbus frontend bus.
++ *
++ * @drv:      driver to register; its read_otherend_details hook is set to
++ *            read_backend_details before registration
++ * @owner:    owning module, passed through to the common registration helper
++ * @mod_name: module name, passed through to the common registration helper
++ *
++ * Returns 0 on success or the error from xenbus_register_driver_common().
++ * On success it calls wait_for_devices() for the driver before returning.
++ */
++int __xenbus_register_frontend(struct xenbus_driver *drv,
++                             struct module *owner, const char *mod_name)
++{
++      int ret;
++
++      drv->read_otherend_details = read_backend_details;
++
++      ret = xenbus_register_driver_common(drv, &xenbus_frontend,
++                                          owner, mod_name);
++      if (ret)
++              return ret;
++
++      /* If this driver is loaded as a module wait for devices to attach. */
++      wait_for_devices(drv);
++
++      return 0;
++}
++EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
++
++#endif
   
   static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
   {
@@@ -501,6 -501,6 +728,9 @@@ int xenbus_probe_devices(struct xen_bus
         char **dir;
         unsigned int i, dir_n;
   
++      if (bus->error)
++              return bus->error;
++
         dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
         if (IS_ERR(dir))
                 return PTR_ERR(dir);
@@@ -514,7 -514,7 +744,7 @@@
         kfree(dir);
         return err;
   }
--EXPORT_SYMBOL_GPL(xenbus_probe_devices);
++PARAVIRT_EXPORT_SYMBOL(xenbus_probe_devices);
   
   static unsigned int char_count(const char *str, char c)
   {
@@@ -546,7 -546,7 +776,7 @@@ void xenbus_dev_changed(const char *nod
         char type[XEN_BUS_ID_SIZE];
         const char *p, *root;
   
--      if (char_count(node, '/') < 2)
++      if (bus->error || char_count(node, '/') < 2)
                 return;
   
         exists = xenbus_exists(XBT_NIL, node, "");
@@@ -575,9 -575,9 +805,27 @@@
   
         kfree(root);
   }
--EXPORT_SYMBOL_GPL(xenbus_dev_changed);
++PARAVIRT_EXPORT_SYMBOL(xenbus_dev_changed);
++
++#if defined(CONFIG_XEN) || defined(MODULE)
++/*
++ * Watch callback for the frontend "device" directory (see fe_watch):
++ * forwards the path that fired, vec[XS_WATCH_PATH], to xenbus_dev_changed()
++ * for the frontend bus.  @watch and @len are not used.
++ */
++static void frontend_changed(struct xenbus_watch *watch,
++                           const char **vec, unsigned int len)
++{
++      DPRINTK("");
++
++      xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
++}
   
++/*
++ * We watch for devices appearing and vanishing: this watch covers the
++ * "device" xenstore node and reports changes to frontend_changed().
++ */
++static struct xenbus_watch fe_watch = {
++      .node = "device",
++      .callback = frontend_changed,
++};
++
++static int suspend_dev(struct device *dev, void *data)
++#else
   int xenbus_dev_suspend(struct device *dev)
++#endif
   {
         int err = 0;
         struct xenbus_driver *drv;
@@@ -592,13 -592,13 +840,37 @@@
         if (drv->suspend)
                 err = drv->suspend(xdev);
         if (err)
--              printk(KERN_WARNING
--                     "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
++              pr_warning("xenbus: suspend %s failed: %i\n",
++                         dev_name(dev), err);
         return 0;
   }
--EXPORT_SYMBOL_GPL(xenbus_dev_suspend);
++PARAVIRT_EXPORT_SYMBOL(xenbus_dev_suspend);
   
++#if defined(CONFIG_XEN) || defined(MODULE)
++/*
++ * Per-device callback used when a suspend is cancelled.
++ *
++ * Devices without a bound driver are ignored.  Otherwise the driver's
++ * optional suspend_cancel() hook is invoked; a failure is only logged.
++ * @data is not used.
++ *
++ * Always returns 0, even when the hook reports an error.
++ */
++static int suspend_cancel_dev(struct device *dev, void *data)
++{
++      int err = 0;
++      struct xenbus_driver *drv;
++      struct xenbus_device *xdev;
++
++      DPRINTK("");
++
++      if (dev->driver == NULL)
++              return 0;
++      drv = to_xenbus_driver(dev->driver);
++      xdev = container_of(dev, struct xenbus_device, dev);
++      if (drv->suspend_cancel)
++              err = drv->suspend_cancel(xdev);
++      if (err)
++              pr_warning("xenbus: suspend_cancel %s failed: %i\n",
++                         dev_name(dev), err);
++      return 0;
++}
++
++static int resume_dev(struct device *dev, void *data)
++#else
   int xenbus_dev_resume(struct device *dev)
++#endif
   {
         int err;
         struct xenbus_driver *drv;
@@@ -612,9 -612,9 +884,8 @@@
         drv = to_xenbus_driver(dev->driver);
         err = talk_to_otherend(xdev);
         if (err) {
--              printk(KERN_WARNING
--                     "xenbus: resume (talk_to_otherend) %s failed: %i\n",
--                     dev_name(dev), err);
++              pr_warning("xenbus: resume (talk_to_otherend) %s failed: %i\n",
++                         dev_name(dev), err);
                 return err;
         }
   
@@@ -623,48 -623,48 +894,80 @@@
         if (drv->resume) {
                 err = drv->resume(xdev);
                 if (err) {
--                      printk(KERN_WARNING
--                             "xenbus: resume %s failed: %i\n",
--                             dev_name(dev), err);
++                      pr_warning("xenbus: resume %s failed: %i\n",
++                                 dev_name(dev), err);
                         return err;
                 }
         }
   
         err = watch_otherend(xdev);
         if (err) {
--              printk(KERN_WARNING
--                     "xenbus_probe: resume (watch_otherend) %s failed: "
--                     "%d.\n", dev_name(dev), err);
++              pr_warning("xenbus_probe: resume (watch_otherend) %s failed:"
++                         " %d\n", dev_name(dev), err);
                 return err;
         }
   
         return 0;
   }
--EXPORT_SYMBOL_GPL(xenbus_dev_resume);
++PARAVIRT_EXPORT_SYMBOL(xenbus_dev_resume);
   
++#if !defined(CONFIG_XEN) && !defined(MODULE)
   int xenbus_dev_cancel(struct device *dev)
   {
         /* Do nothing */
         DPRINTK("cancel");
         return 0;
   }
--EXPORT_SYMBOL_GPL(xenbus_dev_cancel);
++PARAVIRT_EXPORT_SYMBOL(xenbus_dev_cancel);
++#else
++/*
++ * Suspend xenbus: run suspend_dev() on every frontend device (skipped when
++ * the frontend bus has an error recorded), pass suspend_dev to
++ * xenbus_backend_suspend(), and finally call xs_suspend().
++ */
++void xenbus_suspend(void)
++{
++      DPRINTK("");
++
++      if (!xenbus_frontend.error)
++              bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
++      xenbus_backend_suspend(suspend_dev);
++      xs_suspend();
++}
++
++/*
++ * Resume xenbus: call xb_init_comms() and xs_resume() first, then run
++ * resume_dev() on every frontend device (skipped when the frontend bus has
++ * an error recorded) and pass resume_dev to xenbus_backend_resume().
++ *
++ * NOTE(review): the return value of xb_init_comms() is not checked here;
++ * confirm that a failure at this point cannot be handled any better.
++ */
++void xenbus_resume(void)
++{
++      xb_init_comms();
++      xs_resume();
++      if (!xenbus_frontend.error)
++              bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
++      xenbus_backend_resume(resume_dev);
++}
++
++/*
++ * Undo a cancelled suspend: call xs_suspend_cancel(), then run
++ * suspend_cancel_dev() on every frontend device (skipped when the frontend
++ * bus has an error recorded) and on the backend side.
++ *
++ * NOTE(review): the backend devices are walked through
++ * xenbus_backend_resume() with suspend_cancel_dev as the callback;
++ * presumably that helper is just a generic "for each backend device"
++ * iterator despite its name -- confirm against its definition.
++ */
++void xenbus_suspend_cancel(void)
++{
++      xs_suspend_cancel();
++      if (!xenbus_frontend.error)
++              bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
++      xenbus_backend_resume(suspend_cancel_dev);
++}
++#endif
   
   /* A flag to determine if xenstored is 'ready' (i.e. has started) */
--int xenstored_ready = 0;
++atomic_t xenbus_xsd_state = ATOMIC_INIT(XENBUS_XSD_UNCOMMITTED);
   
   
--int register_xenstore_notifier(struct notifier_block *nb)
++int
++#ifdef CONFIG_XEN
++__init
++#endif
++register_xenstore_notifier(struct notifier_block *nb)
   {
         int ret = 0;
   
--      if (xenstored_ready > 0)
++      if (is_xenstored_ready())
                 ret = nb->notifier_call(nb, 0, NULL);
         else
                 blocking_notifier_chain_register(&xenstore_chain, nb);
   
         return ret;
   }
++#ifndef CONFIG_XEN
   EXPORT_SYMBOL_GPL(register_xenstore_notifier);
   
   void unregister_xenstore_notifier(struct notifier_block *nb)
@@@ -672,16 -672,16 +975,128 @@@
         blocking_notifier_chain_unregister(&xenstore_chain, nb);
   }
   EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
++#endif
++
++#ifdef CONFIG_CRASH_DUMP
++static DECLARE_WAIT_QUEUE_HEAD(be_state_wq);
++static int be_state;
++
++/*
++ * Watch callback used while resetting a frontend: reads the integer stored
++ * at the watched path, v[XS_WATCH_PATH], into be_state, logs the new state
++ * and wakes everybody sleeping on be_state_wq.  @w and @l are not used.
++ *
++ * The result of xenbus_scanf() is not checked, so be_state keeps its old
++ * value if the read fails (assuming xenbus_scanf() leaves it untouched then
++ * -- TODO confirm).
++ */
++static void xenbus_reset_state_changed(struct xenbus_watch *w, const char **v, unsigned int l)
++{
++      xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &be_state);
++      printk(KERN_INFO "XENBUS: %s %s\n", v[XS_WATCH_PATH], xenbus_strstate(be_state));
++      wake_up(&be_state_wq);
++}
++
++/*
++ * Wait condition for the last step of a frontend reset: returns non-zero
++ * once *@st is XenbusStateInitialising or XenbusStateInitWait.
++ */
++static int xenbus_reset_check_final(int *st)
++{
++      return *st == XenbusStateInitialising || *st == XenbusStateInitWait;
++}
++
++/*
++ * Force a connected frontend/backend pair to reconnect.
++ *
++ * @backend:  xenstore path of the backend; its "state" node is watched
++ * @frontend: xenstore path of the frontend; its "state" node is written
++ *
++ * The frontend state is driven through Closing, Closed and Initialising.
++ * After each write the function sleeps on be_state_wq until be_state (kept
++ * up to date by xenbus_reset_state_changed()) shows that the backend went
++ * to Closing, to Closed, and finally to Initialising or InitWait.
++ *
++ * Returns silently if the watch path cannot be allocated.
++ *
++ * NOTE(review): the return values of register_xenbus_watch(),
++ * xenbus_printf() and wait_event_interruptible() are all ignored.
++ * Presumably an interrupted wait simply falls through to the next state
++ * write -- confirm that this is acceptable on this path.
++ */
++static void xenbus_reset_frontend_state(char *backend, char *frontend)
++{
++      struct xenbus_watch watch;
++
++      memset(&watch, 0, sizeof(watch));
++      watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", backend);
++      if (!watch.node)
++              return;
++
++      watch.callback = xenbus_reset_state_changed;
++      be_state = XenbusStateUnknown;
++
++      printk(KERN_INFO "XENBUS: triggering reconnect on %s\n", backend);
++      register_xenbus_watch(&watch);
++
++      xenbus_printf(XBT_NIL, frontend, "state", "%d", XenbusStateClosing);
++      wait_event_interruptible(be_state_wq, be_state == XenbusStateClosing);
   
--void xenbus_probe(struct work_struct *unused)
++      xenbus_printf(XBT_NIL, frontend, "state", "%d", XenbusStateClosed);
++      wait_event_interruptible(be_state_wq, be_state == XenbusStateClosed);
++
++      xenbus_printf(XBT_NIL, frontend, "state", "%d", XenbusStateInitialising);
++      wait_event_interruptible(be_state_wq, xenbus_reset_check_final(&be_state));
++
++      unregister_xenbus_watch(&watch);
++      printk(KERN_INFO "XENBUS: reconnect done on %s\n", backend);
++      kfree(watch.node);
++}
++
++/*
++ * Reset device/<class>/<dev> if both of its ends are currently connected.
++ *
++ * Reads the frontend's "state"; if it is XenbusStateConnected, looks up the
++ * backend path from the frontend's "backend" node and reads the backend's
++ * "state" as well.  Only when that is also XenbusStateConnected is
++ * xenbus_reset_frontend_state() called.  Every failure along the way
++ * (allocation, unreadable node) just leaves the device alone.
++ */
++static void xenbus_reset_check_state(char *class, char *dev)
++{
++      int state, err;
++      char *backend, *frontend;
++
++      frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev);
++      if (!frontend)
++              return;
++
++      /* A result of 1 is treated as "state was read successfully". */
++      err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &state);
++      /* frontend connected? */
++      if (err == 1 && state == XenbusStateConnected) {
++              backend = xenbus_read(XBT_NIL, frontend, "backend", NULL);
++              if (!backend || IS_ERR(backend))
++                      goto out;
++              err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &state);
++              /* backend connected? */
++              if (err == 1 && state == XenbusStateConnected)
++                      xenbus_reset_frontend_state(backend, frontend);
++              kfree(backend);
++      }
++out:
++      kfree(frontend);
++}
++
++/*
++ * Walk every device class below "device" in xenstore and run
++ * xenbus_reset_check_state() on each device listed under it.
++ *
++ * Nothing is reported to the caller: if the class list cannot be read the
++ * function returns, and a class whose device list cannot be read is skipped.
++ */
++static void xenbus_reset_state(void)
++{
++      char **devclass, **dev;
++      /*
++       * Entry counts are unsigned int, matching how xenbus_directory() is
++       * called elsewhere in this file (see xenbus_probe_devices()).
++       */
++      unsigned int devclass_n, dev_n;
++      unsigned int i, j;
++
++      devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n);
++      if (IS_ERR(devclass))
++              return;
++
++      for (i = 0; i < devclass_n; i++) {
++              dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n);
++              if (IS_ERR(dev))
++                      continue;
++              for (j = 0; j < dev_n; j++)
++                      xenbus_reset_check_state(devclass[i], dev[j]);
++              kfree(dev);
++      }
++      kfree(devclass);
++}
++#endif
++
++void
++#if defined(CONFIG_XEN_UNPRIVILEGED_GUEST)
++__init
++#elif defined(MODULE)
++__devinit
++#endif
++xenbus_probe(struct work_struct *unused)
   {
--      xenstored_ready = 1;
++      BUG_ON(!is_xenstored_ready());
++
++#ifdef CONFIG_CRASH_DUMP
++      /* reset devices in XenbusStateConnected state */
++      if (!is_initial_xendomain() && reset_devices)
++              xenbus_reset_state();
++#endif
++#if defined(CONFIG_XEN) || defined(MODULE)
++      /* Enumerate devices in xenstore and watch for changes. */
++      xenbus_probe_devices(&xenbus_frontend);
++      register_xenbus_watch(&fe_watch);
++      xenbus_backend_probe_and_watch();
++#endif
   
         /* Notify others that xenstore is up */
         blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
   }
--EXPORT_SYMBOL_GPL(xenbus_probe);
++PARAVIRT_EXPORT_SYMBOL(xenbus_probe);
   
++#if !defined(CONFIG_XEN) && !defined(MODULE)
   static int __init xenbus_probe_initcall(void)
   {
         if (!xen_domain())
@@@ -695,28 -695,28 +1110,155 @@@
   }
   
   device_initcall(xenbus_probe_initcall);
++#endif
++
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++#ifdef CONFIG_PROC_FS
++static struct file_operations xsd_kva_fops;
++static struct proc_dir_entry *xsd_kva_intf;
++static struct proc_dir_entry *xsd_port_intf;
++
++/*
++ * mmap handler installed on the "xsd_kva" proc entry: maps the xenstore
++ * page (xen_store_mfn) into the caller's address space.
++ *
++ * The first step tries to move xenbus_xsd_state from
++ * XENBUS_XSD_UNCOMMITTED to XENBUS_XSD_LOCAL_INIT.  If the state was
++ * UNCOMMITTED, xb_init_comms() is run; if it was one of the FOREIGN states
++ * the mapping is refused; in any other state the code simply goes on to map.
++ *
++ * Returns 0 on success, the xb_init_comms() error, -EBUSY for a foreign
++ * xenstored, -EINVAL for a mapping larger than one page or with a non-zero
++ * page offset, and -EAGAIN if remap_pfn_range() fails.
++ *
++ * NOTE(review): the state change happens before size/offset are validated,
++ * and it is not rolled back when xb_init_comms() fails, so a later call
++ * would presumably skip xb_init_comms() -- confirm this is intended.
++ */
++static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
++{
++      size_t size = vma->vm_end - vma->vm_start;
++      int old;
++      int rc;
++
++      old = atomic_cmpxchg(&xenbus_xsd_state,
++                         XENBUS_XSD_UNCOMMITTED,
++                         XENBUS_XSD_LOCAL_INIT);
++      switch (old) {
++              case XENBUS_XSD_UNCOMMITTED:
++                      rc = xb_init_comms();
++                      if (rc != 0)
++                              return rc;
++                      break;
++
++              case XENBUS_XSD_FOREIGN_INIT:
++              case XENBUS_XSD_FOREIGN_READY:
++                      return -EBUSY;
++
++              case XENBUS_XSD_LOCAL_INIT:
++              case XENBUS_XSD_LOCAL_READY:
++              default:
++                      break;
++      }
++
++      if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
++              return -EINVAL;
++
++      if (remap_pfn_range(vma, vma->vm_start, mfn_to_pfn(xen_store_mfn),
++                          size, vma->vm_page_prot))
++              return -EAGAIN;
++
++      return 0;
++}
++
++/*
++ * read_proc handler of the "xsd_kva" proc entry: prints the value of
++ * xen_store_interface as "0x%p" into @page, flags end-of-file and returns
++ * the number of characters written.  @start, @off, @count and @data are
++ * not used.
++ */
++static int xsd_kva_read(char *page, char **start, off_t off,
++                      int count, int *eof, void *data)
++{
++      int len;
++
++      len  = sprintf(page, "0x%p", xen_store_interface);
++      *eof = 1;
++      return len;
++}
++
++/*
++ * read_proc handler of the "xsd_port" proc entry: prints xen_store_evtchn
++ * as a decimal number into @page, flags end-of-file and returns the number
++ * of characters written.  @start, @off, @count and @data are not used.
++ */
++static int xsd_port_read(char *page, char **start, off_t off,
++                       int count, int *eof, void *data)
++{
++      int len;
++
++      len  = sprintf(page, "%d", xen_store_evtchn);
++      *eof = 1;
++      return len;
++}
++#endif
++
++#ifdef CONFIG_XEN_XENBUS_DEV
++/*
++ * Hand the xenstore connection over to a xenstored running in @remote_dom.
++ *
++ * @remote_dom: domain that will run xenstored
++ * @grant_ref:  out: grant reference giving @remote_dom access to the
++ *              existing xenstore page (xen_store_mfn)
++ * @local_port: out: newly allocated unbound event channel port, which also
++ *              becomes xen_store_evtchn
++ *
++ * Must be called in the initial domain while xenbus_xsd_state is
++ * XENBUS_XSD_FOREIGN_INIT; both conditions are enforced with BUG_ON().
++ * The "xsd_kva" and "xsd_port" proc entries are removed and the old event
++ * channel is closed before the new one is allocated.
++ *
++ * Returns 0 on success.  On failure the error of the failing step is
++ * returned and xen_store_evtchn is set to -1; failures after the new event
++ * channel was allocated also close that channel again.
++ *
++ * NOTE(review): when xb_init_comms() fails, the grant obtained just before
++ * is not revoked on the fail1 path -- confirm whether that needs cleanup.
++ */
++int xenbus_conn(domid_t remote_dom, grant_ref_t *grant_ref,
++              evtchn_port_t *local_port)
++{
++      struct evtchn_alloc_unbound alloc_unbound;
++      int rc, rc2;
++
++      BUG_ON(atomic_read(&xenbus_xsd_state) != XENBUS_XSD_FOREIGN_INIT);
++      BUG_ON(!is_initial_xendomain());
++
++      remove_xen_proc_entry("xsd_kva");
++      remove_xen_proc_entry("xsd_port");
++
++      rc = close_evtchn(xen_store_evtchn);
++      if (rc != 0)
++              goto fail0;
++
++      alloc_unbound.dom = DOMID_SELF;
++      alloc_unbound.remote_dom = remote_dom;
++      rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
++                                       &alloc_unbound);
++      if (rc != 0)
++              goto fail0;
++      *local_port = xen_store_evtchn = alloc_unbound.port;
++
++      /* keep the old page (xen_store_mfn, xen_store_interface) */
++      rc = gnttab_grant_foreign_access(remote_dom, xen_store_mfn,
++                                       GTF_permit_access);
++      if (rc < 0)
++              goto fail1;
++      /* A non-negative result is the grant reference itself. */
++      *grant_ref = rc;
++
++      rc = xb_init_comms();
++      if (rc != 0)
++              goto fail1;
++
++      return 0;
   
--static int __init xenbus_init(void)
++fail1:
++      rc2 = close_evtchn(xen_store_evtchn);
++      if (rc2 != 0)
++              pr_warning("XENBUS: Error freeing xenstore event channel:"
++                         " %d\n", rc2);
++fail0:
++      xen_store_evtchn = -1;
++      return rc;
++}
++#endif
++#endif /* CONFIG_XEN_PRIVILEGED_GUEST */
++
++#ifndef MODULE
++static int __init
++#else
++int __devinit
++#endif
++xenbus_init(void)
   {
         int err = 0;
         unsigned long page = 0;
   
         DPRINTK("");
   
--      err = -ENODEV;
--      if (!xen_domain())
--              return err;
++      if (!is_running_on_xen())
++              return -ENODEV;
++
++#if defined(CONFIG_XEN) || defined(MODULE)
++      /* Register ourselves with the kernel bus subsystem */
++      xenbus_frontend.error = bus_register(&xenbus_frontend.bus);
++      if (xenbus_frontend.error)
++              pr_warning("XENBUS: Error registering frontend bus: %i\n",
++                         xenbus_frontend.error);
++      xenbus_backend_bus_register();
++#endif
   
         /*
          * Domain0 doesn't have a store_evtchn or store_mfn yet.
          */
--      if (xen_initial_domain()) {
++      if (is_initial_xendomain()) {
                 struct evtchn_alloc_unbound alloc_unbound;
   
                 /* Allocate Xenstore page */
                 page = get_zeroed_page(GFP_KERNEL);
                 if (!page)
--                      goto out_error;
++                      return -ENOMEM;
   
                 xen_store_mfn = xen_start_info->store_mfn =
                         pfn_to_mfn(virt_to_phys((void *)page) >>
@@@ -724,47 -724,47 +1266,98 @@@
   
                 /* Next allocate a local port which xenstored can bind to */
                 alloc_unbound.dom        = DOMID_SELF;
--              alloc_unbound.remote_dom = 0;
++              alloc_unbound.remote_dom = DOMID_SELF;
   
                 err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
                                                   &alloc_unbound);
                 if (err == -ENOSYS)
--                      goto out_error;
++                      goto err;
   
                 BUG_ON(err);
                 xen_store_evtchn = xen_start_info->store_evtchn =
                         alloc_unbound.port;
   
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
++              /* And finally publish the above info in /proc/xen */
++              xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600);
++              if (xsd_kva_intf) {
++                      memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops,
++                             sizeof(xsd_kva_fops));
++                      xsd_kva_fops.mmap = xsd_kva_mmap;
++                      xsd_kva_intf->proc_fops = &xsd_kva_fops;
++                      xsd_kva_intf->read_proc = xsd_kva_read;
++              }
++              xsd_port_intf = create_xen_proc_entry("xsd_port", 0400);
++              if (xsd_port_intf)
++                      xsd_port_intf->read_proc = xsd_port_read;
++#endif
                 xen_store_interface = mfn_to_virt(xen_store_mfn);
         } else {
++#if !defined(CONFIG_XEN) && !defined(MODULE)
                 if (xen_hvm_domain()) {
++#endif
++#ifndef CONFIG_XEN
                         uint64_t v = 0;
++
                         err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
                         if (err)
--                              goto out_error;
++                              goto err;
                         xen_store_evtchn = (int)v;
                         err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
                         if (err)
--                              goto out_error;
++                              goto err;
                         xen_store_mfn = (unsigned long)v;
--                      xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
++                      xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT,
++                                                    PAGE_SIZE);
++#endif
++#if !defined(CONFIG_XEN) && !defined(MODULE)
                 } else {
++#endif
++#ifndef MODULE
                         xen_store_evtchn = xen_start_info->store_evtchn;
                         xen_store_mfn = xen_start_info->store_mfn;
                         xen_store_interface = mfn_to_virt(xen_store_mfn);
--                      xenstored_ready = 1;
++#endif
++#if !defined(CONFIG_XEN) && !defined(MODULE)
                 }
++#endif
++              atomic_set(&xenbus_xsd_state, XENBUS_XSD_FOREIGN_READY);
++
++              /* Initialize the shared memory rings to talk to xenstored */
++              err = xb_init_comms();
++              if (err)
++                      goto err;
         }
   
++#if defined(CONFIG_XEN) || defined(MODULE)
++      xenbus_dev_init();
++#endif
++
         /* Initialize the interface to xenstore. */
         err = xs_init();
         if (err) {
--              printk(KERN_WARNING
--                     "XENBUS: Error initializing xenstore comms: %i\n", err);
--              goto out_error;
++              pr_warning("XENBUS: Error initializing xenstore comms: %i\n",
++                         err);
++              goto err;
         }
   
--#ifdef CONFIG_XEN_COMPAT_XENFS
++#if defined(CONFIG_XEN) || defined(MODULE)
++      /* Register ourselves with the kernel device subsystem */
++      if (!xenbus_frontend.error) {
++              xenbus_frontend.error = device_register(&xenbus_frontend.dev);
++              if (xenbus_frontend.error) {
++                      bus_unregister(&xenbus_frontend.bus);
++                      pr_warning("XENBUS: Error registering frontend device:"
++                                 " %d\n", xenbus_frontend.error);
++              }
++      }
++      xenbus_backend_device_register();
++
++      if (!is_initial_xendomain())
++              xenbus_probe(NULL);
++#endif
++
++#if defined(CONFIG_XEN_COMPAT_XENFS) && !defined(MODULE)
         /*
          * Create xenfs mountpoint in /proc for compatibility with
          * utilities that expect to find "xenbus" under "/proc/xen".
@@@ -774,13 -774,13 +1367,164 @@@
   
         return 0;
   
--  out_error:
++ err:
++      /*
++       * Do not unregister the xenbus front/backend buses here. The buses
++       * must exist because front/backend drivers will use them when they are
++       * registered.
++       */
++
         if (page != 0)
                 free_page(page);
   
         return err;
   }
   
++#ifndef MODULE
   postcore_initcall(xenbus_init);
--
++#ifdef CONFIG_XEN
++MODULE_LICENSE("Dual BSD/GPL");
++#else
   MODULE_LICENSE("GPL");
++#endif
++#endif
++
++#if defined(CONFIG_XEN) || defined(MODULE)
++
++/*
++ * bus_for_each_dev() callback.  Returns non-zero when @dev is bound to a
++ * driver (the one passed through @data, if any) and has either not reached
++ * XenbusStateConnected yet or is connected but reported as not ready by its
++ * driver's is_ready() hook.
++ */
++static int is_device_connecting(struct device *dev, void *data)
++{
++      struct xenbus_device *xdev = to_xenbus_device(dev);
++      struct device_driver *wanted = data;
++      struct xenbus_driver *xdrv;
++
++      /*
++       * A device without a driver will never connect, and a search that is
++       * limited to one driver ignores devices bound to any other.
++       */
++      if (!dev->driver || (wanted && dev->driver != wanted))
++              return 0;
++
++      /* Still on its way to the connected state. */
++      if (xdev->state < XenbusStateConnected)
++              return 1;
++
++      /* Any state beyond connected leaves nothing to wait for. */
++      if (xdev->state != XenbusStateConnected)
++              return 0;
++
++      /* Connected: let the driver veto via its optional is_ready() hook. */
++      xdrv = to_xenbus_driver(dev->driver);
++      return xdrv->is_ready && !xdrv->is_ready(xdev);
++}
++
++/*
++ * Returns the recorded frontend bus error if there is one; otherwise the
++ * result of running is_device_connecting() over the frontend bus, optionally
++ * restricted to devices bound to @drv.
++ */
++static int exists_connecting_device(struct device_driver *drv)
++{
++      int ret = xenbus_frontend.error;
++
++      if (!ret)
++              ret = bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
++                                     is_device_connecting);
++      return ret;
++}
++
++/*
++ * bus_for_each_dev() callback that logs why @dev is not usable yet: no
++ * driver bound, still short of the connected state, or flagged as not ready
++ * by its driver.  @data optionally limits the report to one driver.  Always
++ * returns 0 so that the bus walk visits every device.
++ */
++static int print_device_status(struct device *dev, void *data)
++{
++      struct xenbus_device *xdev = to_xenbus_device(dev);
++      struct device_driver *wanted = data;
++      struct xenbus_driver *xdrv;
++      enum xenbus_state remote;
++
++      /* Report limited to one driver: skip everything else silently. */
++      if (wanted && dev->driver != wanted)
++              return 0;
++
++      if (dev->driver == NULL) {
++              /* Information only: is this too noisy? */
++              pr_info("XENBUS: Device with no driver: %s\n",
++                      xdev->nodename);
++              return 0;
++      }
++
++      if (xdev->state < XenbusStateConnected) {
++              /* The peer's state can only be read once the peer is known. */
++              remote = XenbusStateUnknown;
++              if (xdev->otherend)
++                      remote = xenbus_read_driver_state(xdev->otherend);
++              pr_warning("XENBUS: Timeout connecting to device: %s"
++                         " (local state %d, remote state %d)\n",
++                         xdev->nodename, xdev->state, remote);
++      }
++
++      xdrv = to_xenbus_driver(dev->driver);
++      if (xdrv->is_ready && !xdrv->is_ready(xdev))
++              pr_warning("XENBUS: Device not ready: %s\n",
++                         xdev->nodename);
++
++      return 0;
++}
++
++/* We only wait for device setup after most initcalls have run. */
++static int ready_to_wait_for_devices;
++
++/*
++ * On a 5-minute timeout, wait for all devices currently configured.  We need
++ * to do this to guarantee that the filesystems and / or network devices
++ * needed for boot are available, before we can allow the boot to proceed.
++ *
++ * This needs to be on a late_initcall, to happen after the frontend device
++ * drivers have been initialised, but before the root fs is mounted.
++ *
++ * A possible improvement here would be to have the tools add a per-device
++ * flag to the store entry, indicating whether it is needed at boot time.
++ * This would allow people who knew what they were doing to accelerate their
++ * boot slightly, but of course needs tools or manual intervention to set up
++ * those flags correctly.
++ *
++ * @xendrv: limit the wait and the final status report to devices bound to
++ *          this driver; NULL covers every device on the frontend bus.
++ *
++ * Returns immediately while ready_to_wait_for_devices is still clear or when
++ * not running on Xen.
++ */
++static void wait_for_devices(struct xenbus_driver *xendrv)
++{
++      unsigned long start = jiffies;
++      struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
++      unsigned int seconds_waited = 0;
++
++      if (!ready_to_wait_for_devices || !is_running_on_xen())
++              return;
++
++      while (exists_connecting_device(drv)) {
++              /* Every 5 seconds append the remaining time to one log line. */
++              if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
++                      if (!seconds_waited)
++                              pr_warning("XENBUS: Waiting for "
++                                         "devices to initialise: ");
++                      seconds_waited += 5;
++                      printk("%us...", 300 - seconds_waited);
++                      /* Give up once the full 300 seconds have elapsed. */
++                      if (seconds_waited == 300)
++                              break;
++              }
++
++              /* Re-check roughly ten times per second. */
++              schedule_timeout_interruptible(HZ/10);
++      }
++
++      /* Terminate the countdown line if one was started. */
++      if (seconds_waited)
++              printk("\n");
++
++      /* Log every device that is still driverless, unconnected or not ready. */
++      bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
++                       print_device_status);
++}
++
++#ifndef MODULE
++/*
++ * late_initcall hook: from here on wait_for_devices() is allowed to block,
++ * and the first wait covers every frontend device.  Skipped when the
++ * frontend bus recorded an error.
++ */
++static int __init boot_wait_for_devices(void)
++{
++#if !defined(CONFIG_XEN) && !defined(MODULE)
++      if (xen_hvm_domain() && !xen_platform_pci_unplug)
++              return -ENODEV;
++#endif
++
++      /* Nothing to wait for when the frontend bus is unusable. */
++      if (xenbus_frontend.error)
++              return 0;
++
++      ready_to_wait_for_devices = 1;
++      wait_for_devices(NULL);
++      return 0;
++}
++
++late_initcall(boot_wait_for_devices);
++#endif
++
++/*
++ * Call @fn(dev, @arg) for each device on the frontend bus.
++ *
++ * Returns the recorded bus error when the frontend bus is not usable (the
++ * same guard exists_connecting_device() applies), otherwise whatever
++ * bus_for_each_dev() returns.
++ */
++int xenbus_for_each_frontend(void *arg, int (*fn)(struct device *, void *))
++{
++      /* Never walk a bus that failed to register or was unregistered. */
++      if (xenbus_frontend.error)
++              return xenbus_frontend.error;
++
++      return bus_for_each_dev(&xenbus_frontend.bus, NULL, arg, fn);
++}
++EXPORT_SYMBOL_GPL(xenbus_for_each_frontend);
++
++#endif /* CONFIG_XEN || MODULE */
diff --cc drivers/xen/xenbus/xenbus_probe.h

index 888b990,888b990..22a3de4
--- 1/drivers/xen/xenbus/xenbus_probe.h
--- 2/drivers/xen/xenbus/xenbus_probe.h
+++ b/drivers/xen/xenbus/xenbus_probe.h
@@@ -34,17 -34,17 +34,47 @@@
   #ifndef _XENBUS_PROBE_H
   #define _XENBUS_PROBE_H
   
++#ifndef BUS_ID_SIZE
   #define XEN_BUS_ID_SIZE                       20
++#else
++#define XEN_BUS_ID_SIZE                       BUS_ID_SIZE
++#endif
++
++#ifdef CONFIG_PARAVIRT_XEN
++#define is_running_on_xen() xen_domain()
++#define is_initial_xendomain() xen_initial_domain()
++#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
++#define dev_name(dev) ((dev)->bus_id)
++#endif
++
++/*
++ * Backend bus hooks.  The real functions are declared when the backend is
++ * configured (built in or as a module); otherwise empty inline stubs let
++ * callers invoke them unconditionally.
++ */
++#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
++extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
++extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
++extern void xenbus_backend_probe_and_watch(void);
++extern void xenbus_backend_bus_register(void);
++extern void xenbus_backend_device_register(void);
++#else
++static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
++static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
++static inline void xenbus_backend_probe_and_watch(void) {}
++static inline void xenbus_backend_bus_register(void) {}
++static inline void xenbus_backend_device_register(void) {}
++#endif
   
   struct xen_bus_type
   {
         char *root;
++      /* Result of registering the bus/device; non-zero means do not use the bus. */
++      int error;
         unsigned int levels;
++      /* Fills bus_id with the device name derived from the xenstore nodename. */
         int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
         int (*probe)(struct xen_bus_type *bus, const char *type,
                      const char *dir);
++#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
         void (*otherend_changed)(struct xenbus_watch *watch, const char **vec,
                                  unsigned int len);
++#else
++      /* Device object registered for this bus via device_register(). */
++      struct device dev;
++#endif
         struct bus_type bus;
   };
   
diff --cc drivers/xen/xenbus/xenbus_probe_backend.c

index 6cf467b,6cf467b..2641774
--- 1/drivers/xen/xenbus/xenbus_probe_backend.c
--- 2/drivers/xen/xenbus/xenbus_probe_backend.c
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@@ -36,23 -36,23 +36,35 @@@
                  __func__, __LINE__, ##args)
   
   #include <linux/kernel.h>
++#include <linux/version.h>
   #include <linux/err.h>
   #include <linux/string.h>
   #include <linux/ctype.h>
   #include <linux/fcntl.h>
   #include <linux/mm.h>
++#include <linux/slab.h>
   #include <linux/notifier.h>
   
   #include <asm/page.h>
   #include <asm/pgtable.h>
++#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
   #include <asm/xen/hypervisor.h>
++#endif
   #include <asm/hypervisor.h>
   #include <xen/xenbus.h>
++#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++#include <xen/xen_proc.h>
++#include <xen/evtchn.h>
++#endif
   #include <xen/features.h>
   
   #include "xenbus_comms.h"
   #include "xenbus_probe.h"
   
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
   /* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
   static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
   {
@@@ -177,11 -177,11 +189,13 @@@ static int xenbus_probe_backend(struct 
         return err;
   }
   
++#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
   static void frontend_changed(struct xenbus_watch *watch,
                             const char **vec, unsigned int len)
   {
++      /* Backend bus .otherend_changed hook: forward the watch event unchanged. */
         xenbus_otherend_changed(watch, vec, len, 0);
   }
++#endif
   
   static struct device_attribute xenbus_backend_dev_attrs[] = {
         __ATTR_NULL
@@@ -192,14 -192,14 +206,23 @@@ static struct xen_bus_type xenbus_backe
         .levels = 3,            /* backend/type/<frontend>/<id> */
         .get_bus_id = backend_bus_id,
         .probe = xenbus_probe_backend,
++#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
         .otherend_changed = frontend_changed,
++#else
++      .dev = {
++              .init_name = "xen-backend",
++      },
++#endif
++      .error = -ENODEV,
         .bus = {
                 .name           = "xen-backend",
                 .match          = xenbus_match,
                 .uevent         = xenbus_uevent_backend,
                 .probe          = xenbus_dev_probe,
                 .remove         = xenbus_dev_remove,
++#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
                 .shutdown       = xenbus_dev_shutdown,
++#endif
                 .dev_attrs      = xenbus_backend_dev_attrs,
         },
   };
@@@ -222,6 -222,6 +245,7 @@@ static int read_frontend_details(struc
         return xenbus_read_otherend_details(xendev, "frontend-id", "frontend");
   }
   
++#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
   int xenbus_dev_is_online(struct xenbus_device *dev)
   {
         int rc, val;
@@@ -233,6 -233,6 +257,7 @@@
         return val;
   }
   EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
++#endif
   
   int __xenbus_register_backend(struct xenbus_driver *drv,
                               struct module *owner, const char *mod_name)
@@@ -244,17 -244,17 +269,43 @@@
   }
   EXPORT_SYMBOL_GPL(__xenbus_register_backend);
   
++#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++
++/*
++ * Run @fn on every device of the backend bus.  Does nothing when the bus
++ * recorded a registration error.
++ */
++void xenbus_backend_suspend(int (*fn)(struct device *, void *))
++{
++      DPRINTK("");
++      if (xenbus_backend.error)
++              return;
++
++      bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
++}
++
++/*
++ * Run @fn on every device of the backend bus.  Does nothing when the bus
++ * recorded a registration error.
++ */
++void xenbus_backend_resume(int (*fn)(struct device *, void *))
++{
++      DPRINTK("");
++      if (xenbus_backend.error)
++              return;
++
++      bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
++}
++
++#endif
++
++/*
++ * Enumerate the backend devices present in xenstore and register be_watch.
++ *
++ * Built in one of two shapes: without CONFIG_XEN / platform-compat it is a
++ * static notifier callback that returns NOTIFY_DONE; otherwise it is the
++ * global xenbus_backend_probe_and_watch() with no arguments or result.
++ *
++ * NOTE(review): the return value of register_xenbus_watch() is not checked.
++ */
++#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
   static int backend_probe_and_watch(struct notifier_block *notifier,
                                    unsigned long event,
                                    void *data)
++#else
++void xenbus_backend_probe_and_watch(void)
++#endif
   {
         /* Enumerate devices in xenstore and watch for changes. */
         xenbus_probe_devices(&xenbus_backend);
         register_xenbus_watch(&be_watch);
   
++#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
         return NOTIFY_DONE;
++#endif
   }
   
++#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
++
   static int __init xenbus_probe_backend_init(void)
   {
         static struct notifier_block xenstore_notifier = {
@@@ -274,3 -274,3 +325,34 @@@
         return 0;
   }
   subsys_initcall(xenbus_probe_backend_init);
++
++#else
++
++/*
++ * Register the backend bus with the driver core and remember the outcome in
++ * xenbus_backend.error; a failure is logged but not propagated.
++ */
++void xenbus_backend_bus_register(void)
++{
++      int rc = bus_register(&xenbus_backend.bus);
++
++      xenbus_backend.error = rc;
++      if (rc)
++              pr_warning("XENBUS: Error registering backend bus: %i\n",
++                         rc);
++}
++
++/*
++ * Register the backend bus' device object.  Skipped when the bus itself is
++ * already marked as failed; on failure the bus is unregistered again and the
++ * error is kept in xenbus_backend.error.
++ */
++void xenbus_backend_device_register(void)
++{
++      int rc;
++
++      if (xenbus_backend.error)
++              return;
++
++      rc = device_register(&xenbus_backend.dev);
++      xenbus_backend.error = rc;
++      if (!rc)
++              return;
++
++      bus_unregister(&xenbus_backend.bus);
++      pr_warning("XENBUS: Error registering backend device: %i\n", rc);
++}
++
++/*
++ * Call @fn(dev, @arg) for each device on the backend bus.
++ *
++ * xenbus_backend.error starts out as -ENODEV and is non-zero whenever the bus
++ * was never registered or has been unregistered again, so it is checked
++ * first, just as xenbus_backend_suspend()/resume() do.  In that case the
++ * stored error is returned; otherwise the result of bus_for_each_dev().
++ */
++int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *))
++{
++      /* Never walk a bus the driver core does not know about. */
++      if (xenbus_backend.error)
++              return xenbus_backend.error;
++
++      return bus_for_each_dev(&xenbus_backend.bus, NULL, arg, fn);
++}
++EXPORT_SYMBOL_GPL(xenbus_for_each_backend);
++
++#endif
diff --cc drivers/xen/xenbus/xenbus_xs.c

index 5534690,5534690..0aaa54a
--- 1/drivers/xen/xenbus/xenbus_xs.c
--- 2/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@@ -47,6 -47,6 +47,14 @@@
   #include <xen/xenbus.h>
   #include "xenbus_comms.h"
   
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++#ifndef PF_NOFREEZE /* Old kernel (pre-2.6.6). */
++#define PF_NOFREEZE   0
++#endif
++
   struct xs_stored_msg {
         struct list_head list;
   
@@@ -118,7 -118,7 +126,7 @@@ static DEFINE_SPINLOCK(watch_events_loc
    * carrying out work.
    */
   static pid_t xenwatch_pid;
--static DEFINE_MUTEX(xenwatch_mutex);
++/* static */ DEFINE_MUTEX(xenwatch_mutex);
   static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
   
   static int get_error(const char *errorstring)
@@@ -127,9 -127,9 +135,8 @@@
   
         for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
                 if (i == ARRAY_SIZE(xsd_errors) - 1) {
--                      printk(KERN_WARNING
--                             "XENBUS xen store gave: unknown error %s",
--                             errorstring);
++                      pr_warning("XENBUS xen store gave: unknown error %s",
++                                 errorstring);
                         return EINVAL;
                 }
         }
@@@ -212,14 -212,14 +219,16 @@@ void *xenbus_dev_request_and_reply(stru
   
         mutex_unlock(&xs_state.request_mutex);
   
--      if ((msg->type == XS_TRANSACTION_END) ||
++      if ((req_msg.type == XS_TRANSACTION_END) ||
             ((req_msg.type == XS_TRANSACTION_START) &&
              (msg->type == XS_ERROR)))
                 transaction_end();
   
         return ret;
   }
++#if !defined(CONFIG_XEN) && !defined(MODULE)
   EXPORT_SYMBOL(xenbus_dev_request_and_reply);
++#endif
   
   /* Send message to xs, get kmalloc'ed reply.  ERR_PTR() on error. */
   static void *xs_talkv(struct xenbus_transaction t,
@@@ -271,9 -271,9 +280,9 @@@
   
         if (msg.type != type) {
                 if (printk_ratelimit())
--                      printk(KERN_WARNING
--                             "XENBUS unexpected type [%d], expected [%d]\n",
--                             msg.type, type);
++                      pr_warning("XENBUS unexpected type [%d],"
++                                 " expected [%d]\n",
++                                 msg.type, type);
                 kfree(ret);
                 return ERR_PTR(-EINVAL);
         }
@@@ -330,7 -330,7 +339,7 @@@ static char **split(char *strings, unsi
         char *p, **ret;
   
         /* Count the strings. */
--      *num = count_strings(strings, len);
++      *num = count_strings(strings, len) + 1;
   
         /* Transfer to one big alloc for easy freeing. */
         ret = kmalloc(*num * sizeof(char *) + len, GFP_NOIO | __GFP_HIGH);
@@@ -344,6 -344,6 +353,7 @@@
         strings = (char *)&ret[*num];
         for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
                 ret[(*num)++] = p;
++      ret[*num] = strings + len;
   
         return ret;
   }
@@@ -531,18 -531,18 +541,15 @@@ int xenbus_printf(struct xenbus_transac
   {
         va_list ap;
         int ret;
--#define PRINTF_BUFFER_SIZE 4096
         char *printf_buffer;
   
--      printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_NOIO | __GFP_HIGH);
--      if (printf_buffer == NULL)
--              return -ENOMEM;
--
         va_start(ap, fmt);
--      ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
++      printf_buffer = kvasprintf(GFP_NOIO | __GFP_HIGH, fmt, ap);
         va_end(ap);
   
--      BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
++      if (!printf_buffer)
++              return -ENOMEM;
++
         ret = xenbus_write(t, dir, node, printf_buffer);
   
         kfree(printf_buffer);
@@@ -657,6 -657,6 +664,10 @@@ void unregister_xenbus_watch(struct xen
         char token[sizeof(watch) * 2 + 1];
         int err;
   
++#if defined(CONFIG_XEN) || defined(MODULE)
++      BUG_ON(watch->flags & XBWF_new_thread);
++#endif
++
         sprintf(token, "%lX", (long)watch);
   
         down_read(&xs_state.watch_mutex);
@@@ -668,9 -668,9 +679,8 @@@
   
         err = xs_unwatch(watch->node, token);
         if (err)
--              printk(KERN_WARNING
--                     "XENBUS Failed to release watch %s: %i\n",
--                     watch->node, err);
++              pr_warning("XENBUS Failed to release watch %s: %i\n",
++                         watch->node, err);
   
         up_read(&xs_state.watch_mutex);
   
@@@ -708,7 -708,7 +718,9 @@@ void xs_resume(void
         struct xenbus_watch *watch;
         char token[sizeof(watch) * 2 + 1];
   
++#if !defined(CONFIG_XEN) && !defined(MODULE)
         xb_init_comms();
++#endif
   
         mutex_unlock(&xs_state.response_mutex);
         mutex_unlock(&xs_state.request_mutex);
@@@ -731,11 -731,11 +743,32 @@@ void xs_suspend_cancel(void
         mutex_unlock(&xs_state.transaction_mutex);
   }
   
++#if defined(CONFIG_XEN) || defined(MODULE)
++/*
++ * Deliver one queued watch event to its handler and free the event.
++ * @data is the struct xs_stored_msg taken off the watch event list.
++ *
++ * Called directly from the xenwatch thread, or used as the entry point of a
++ * thread started for a single callback; in the latter case the thread exits
++ * here once the callback is done.
++ */
++static int xenwatch_handle_callback(void *data)
++{
++      struct xs_stored_msg *event = data;
++
++      event->u.watch.handle->callback(event->u.watch.handle,
++                                      (const char **)event->u.watch.vec,
++                                      event->u.watch.vec_size);
++
++      /* The event is owned by this function from here on. */
++      kfree(event->u.watch.vec);
++      kfree(event);
++
++      /* Any task other than the xenwatch thread was created for this event. */
++      if (current->pid != xenwatch_pid)
++              do_exit(0);
++
++      return 0;
++}
++#endif
++
   static int xenwatch_thread(void *unused)
   {
         struct list_head *ent;
         struct xs_stored_msg *msg;
   
++      current->flags |= PF_NOFREEZE;
         for (;;) {
                 wait_event_interruptible(watch_events_waitq,
                                          !list_empty(&watch_events));
@@@ -751,17 -751,17 +784,39 @@@
                         list_del(ent);
                 spin_unlock(&watch_events_lock);
   
--              if (ent != &watch_events) {
--                      msg = list_entry(ent, struct xs_stored_msg, list);
--                      msg->u.watch.handle->callback(
--                              msg->u.watch.handle,
--                              (const char **)msg->u.watch.vec,
--                              msg->u.watch.vec_size);
--                      kfree(msg->u.watch.vec);
--                      kfree(msg);
++              if (ent == &watch_events) {
++                      mutex_unlock(&xenwatch_mutex);
++                      continue;
                 }
   
++              msg = list_entry(ent, struct xs_stored_msg, list);
++
++#if defined(CONFIG_XEN) || defined(MODULE)
++              /*
++               * Unlock the mutex before running an XBWF_new_thread
++               * handler. kthread_run can block which can deadlock
++               * against unregister_xenbus_watch() if we need to
++               * unregister other watches in order to make
++               * progress. This can occur on resume before the swap
++               * device is attached.
++               */
++              if (msg->u.watch.handle->flags & XBWF_new_thread) {
++                      mutex_unlock(&xenwatch_mutex);
++                      kthread_run(xenwatch_handle_callback,
++                                  msg, "xenwatch_cb");
++              } else {
++                      xenwatch_handle_callback(msg);
++                      mutex_unlock(&xenwatch_mutex);
++              }
++#else
++              msg->u.watch.handle->callback(
++                      msg->u.watch.handle,
++                      (const char **)msg->u.watch.vec,
++                      msg->u.watch.vec_size);
                 mutex_unlock(&xenwatch_mutex);
++              kfree(msg->u.watch.vec);
++              kfree(msg);
++#endif
         }
   
         return 0;
@@@ -855,11 -855,11 +910,12 @@@ static int xenbus_thread(void *unused
   {
         int err;
   
++      current->flags |= PF_NOFREEZE;
         for (;;) {
                 err = process_msg();
                 if (err)
--                      printk(KERN_WARNING "XENBUS error %d while reading "
--                             "message\n", err);
++                      pr_warning("XENBUS error %d while reading "
++                                 "message\n", err);
                 if (kthread_should_stop())
                         break;
         }
@@@ -869,7 -869,7 +925,6 @@@
   
   int xs_init(void)
   {
--      int err;
         struct task_struct *task;
   
         INIT_LIST_HEAD(&xs_state.reply_list);
@@@ -883,11 -883,11 +938,6 @@@
         atomic_set(&xs_state.transaction_count, 0);
         init_waitqueue_head(&xs_state.transaction_wq);
   
--      /* Initialize the shared memory rings to talk to xenstored */
--      err = xb_init_comms();
--      if (err)
--              return err;
--
         task = kthread_run(xenwatch_thread, NULL, "xenwatch");
         if (IS_ERR(task))
                 return PTR_ERR(task);
diff --cc drivers/xen/xenoprof/xenoprofile.c

index 0000000,0000000..c1dc189

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/drivers/xen/xenoprof/xenoprofile.c
@@@ -1,0 -1,0 +1,572 @@@
++/**
++ * @file xenoprofile.c
++ *
++ * @remark Copyright 2002 OProfile authors
++ * @remark Read the file COPYING
++ *
++ * @author John Levon <levon@movementarian.org>
++ *
++ * Modified by Aravind Menon and Jose Renato Santos for Xen
++ * These modifications are:
++ * Copyright (C) 2005 Hewlett-Packard Co.
++ *
++ * Separated out arch-generic part
++ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
++ *                    VA Linux Systems Japan K.K.
++ */
++
++#include <linux/init.h>
++#include <linux/notifier.h>
++#include <linux/smp.h>
++#include <linux/oprofile.h>
++#include <linux/syscore_ops.h>
++#include <linux/slab.h>
++#include <linux/interrupt.h>
++#include <linux/vmalloc.h>
++#include <asm/pgtable.h>
++#include <xen/evtchn.h>
++#include <xen/xenoprof.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/xenoprof.h>
++#include "../../../drivers/oprofile/event_buffer.h"
++
++#define MAX_XENOPROF_SAMPLES 16
++
++/* sample buffers shared with Xen */
++static xenoprof_buf_t **__read_mostly xenoprof_buf;
++/* Shared buffer area */
++static struct xenoprof_shared_buffer shared_buffer;
++
++/* Passive sample buffers shared with Xen */
++static xenoprof_buf_t **__read_mostly p_xenoprof_buf[MAX_OPROF_DOMAINS];
++/* Passive shared buffer area */
++static struct xenoprof_shared_buffer p_shared_buffer[MAX_OPROF_DOMAINS];
++
++static int xenoprof_start(void);
++static void xenoprof_stop(void);
++
++static int xenoprof_enabled = 0;
++static int xenoprof_is_primary = 0;
++static int active_defined;
++
++extern unsigned long oprofile_backtrace_depth;
++
++/* Number of buffers in shared area (one per VCPU) */
++static int nbuf;
++/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */
++static int ovf_irq[NR_CPUS];
++/* cpu model type string - copied from Xen on XENOPROF_init command */
++static char cpu_type[XENOPROF_CPU_TYPE_SIZE];
++
++#ifdef CONFIG_PM_SLEEP
++
++/* syscore suspend hook: stop profiling if it is currently enabled. */
++static int xenoprof_suspend(void)
++{
++      if (xenoprof_enabled != 1)
++              return 0;
++
++      xenoprof_stop();
++      return 0;
++}
++
++
++/*
++ * syscore resume hook: restart profiling if it was enabled.  The result of
++ * xenoprof_start() is dropped because the hook cannot report failure.
++ */
++static void xenoprof_resume(void)
++{
++      if (xenoprof_enabled != 1)
++              return;
++
++      xenoprof_start();
++}
++
++
++/* Suspend/resume hooks registered by init_driverfs(). */
++static struct syscore_ops oprofile_syscore_ops = {
++      .resume         = xenoprof_resume,
++      .suspend        = xenoprof_suspend
++};
++
++
++/* Register the suspend/resume hooks; always returns 0. */
++static int __init init_driverfs(void)
++{
++      register_syscore_ops(&oprofile_syscore_ops);
++      return 0;
++}
++
++
++/* Undo init_driverfs(): remove the suspend/resume hooks again. */
++static void exit_driverfs(void)
++{
++      unregister_syscore_ops(&oprofile_syscore_ops);
++}
++
++#else
++#define init_driverfs() do { } while (0)
++#define exit_driverfs() do { } while (0)
++#endif /* CONFIG_PM_SLEEP */
++
++static unsigned long long oprofile_samples;
++static unsigned long long p_oprofile_samples;
++
++static unsigned int pdomains;
++static struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS];
++
++/* Non-zero when the log entry at index @tail carries the escape code. */
++static int xenoprof_is_escape(xenoprof_buf_t *buf, int tail)
++{
++      return buf->event_log[tail].eip == XENOPROF_ESCAPE_CODE;
++}
++
++/* Fetch the event field of the log entry at index @tail. */
++static uint8_t xenoprof_get_event(xenoprof_buf_t *buf, int tail)
++{
++      return buf->event_log[tail].event;
++}
++
++/*
++ * Drain the ring in @buf from event_tail up to the event_head value read on
++ * entry, handing each entry to oprofile, then publish the new tail.
++ *
++ * A trace-begin escape entry is passed to oprofile_add_mode() and counted as
++ * one sample.  Every other entry goes to oprofile_add_pc() and is counted
++ * only while no trace-begin entry has been seen during this call.
++ * @is_passive selects which of the two sample counters is bumped.
++ */
++static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive)
++{
++      int head = buf->event_head;
++      int tail = buf->event_tail;
++      int size = buf->event_size;
++      int tracing = 0;
++
++      while (tail != head) {
++              int counted;
++
++              if (xenoprof_is_escape(buf, tail) &&
++                  xenoprof_get_event(buf, tail) == XENOPROF_TRACE_BEGIN) {
++                      tracing = 1;
++                      oprofile_add_mode(buf->event_log[tail].mode);
++                      counted = 1;
++              } else {
++                      oprofile_add_pc(buf->event_log[tail].eip,
++                                      buf->event_log[tail].mode,
++                                      buf->event_log[tail].event);
++                      counted = !tracing;
++              }
++
++              if (counted) {
++                      if (is_passive)
++                              p_oprofile_samples++;
++                      else
++                              oprofile_samples++;
++              }
++
++              /* Advance with wrap-around at the end of the ring. */
++              if (++tail == size)
++                      tail = 0;
++      }
++      buf->event_tail = tail;
++}
++
++/*
++ * Drain the sample buffers of all passive domains.  Before the first
++ * non-empty buffer of a domain is processed a domain switch to that domain
++ * is recorded; if recording it fails the scan stops.  When any samples were
++ * processed, a final switch back to COORDINATOR_DOMAIN is recorded.
++ */
++static void xenoprof_handle_passive(void)
++{
++      int dom, idx;
++      int drained_any = 0;
++
++      for (dom = 0; dom < pdomains; dom++) {
++              int announced = 0;
++
++              for (idx = 0; idx < passive_domains[dom].nbuf; idx++) {
++                      xenoprof_buf_t *buf = p_xenoprof_buf[dom][idx];
++
++                      /* Empty ring: nothing to report for this buffer. */
++                      if (buf->event_head == buf->event_tail)
++                              continue;
++
++                      if (!announced) {
++                              if (!oprofile_add_domain_switch(
++                                      passive_domains[dom].domain_id))
++                                      goto done;
++                              announced = 1;
++                      }
++
++                      xenoprof_add_pc(buf, 1);
++                      drained_any = 1;
++              }
++      }
++done:
++      if (drained_any)
++              oprofile_add_domain_switch(COORDINATOR_DOMAIN);
++}
++
++/*
++ * Interrupt handler installed through ovf_action.  Drains the sample buffer
++ * of the CPU it runs on and, when this domain is the primary profiler, also
++ * the buffers of the passive domains.  Always returns IRQ_HANDLED.
++ */
++static irqreturn_t xenoprof_ovf_interrupt(int irq, void *dev_id)
++{
++      struct xenoprof_buf * buf;
++      /* Bit 0 is a try-lock: only one CPU at a time handles passive buffers. */
++      static unsigned long flag;
++
++      buf = xenoprof_buf[smp_processor_id()];
++
++      xenoprof_add_pc(buf, 0);
++
++      /* If another CPU already holds the bit, skip the passive pass. */
++      if (xenoprof_is_primary && !test_and_set_bit(0, &flag)) {
++              xenoprof_handle_passive();
++              /* Order the buffer updates before the bit is released. */
++              smp_mb__before_clear_bit();
++              clear_bit(0, &flag);
++      }
++
++      return IRQ_HANDLED;
++}
++
++/* irqaction shared by bind_virq() and unbind_virq() for every CPU. */
++static struct irqaction ovf_action = {
++      .handler = xenoprof_ovf_interrupt,
++      .flags   = IRQF_DISABLED,
++      .name    = "xenoprof"
++};
++
++/*
++ * Release the profiling irq of every online CPU that has one bound and mark
++ * its slot in ovf_irq[] as unused (-1).
++ */
++static void unbind_virq(void)
++{
++      unsigned int cpu;
++
++      for_each_online_cpu(cpu) {
++              if (ovf_irq[cpu] < 0)
++                      continue;
++
++              unbind_from_per_cpu_irq(ovf_irq[cpu], cpu, &ovf_action);
++              ovf_irq[cpu] = -1;
++      }
++}
++
++
++/*
++ * Bind VIRQ_XENOPROF on every online CPU and record the resulting irq in
++ * ovf_irq[].  On the first failure all bindings are released again and the
++ * negative error is returned; returns 0 on success.
++ */
++static int bind_virq(void)
++{
++      unsigned int cpu;
++
++      for_each_online_cpu(cpu) {
++              int irq = bind_virq_to_irqaction(VIRQ_XENOPROF, cpu,
++                                               &ovf_action);
++
++              if (irq < 0) {
++                      unbind_virq();
++                      return irq;
++              }
++
++              ovf_irq[cpu] = irq;
++      }
++
++      return 0;
++}
++
++/*
++ * Size in bytes of an array of @nbuf sample-buffer pointers.  Used by both
++ * the allocator and the release routine so that they always agree on
++ * whether the array came from kmalloc() or vmalloc().
++ */
++static size_t buffer_array_size(unsigned int nbuf)
++{
++      return (size_t)nbuf * sizeof(xenoprof_buf_t *);
++}
++
++/*
++ * Allocate an uninitialised array of @nbuf pointers to sample buffers.
++ * The array only stores pointers, so it is sized per pointer rather than
++ * per buffer structure.  Arrays of up to one page come from kmalloc(),
++ * larger ones from vmalloc().  Returns NULL on allocation failure; free with
++ * release_buffer_array() using the same @nbuf.
++ */
++static xenoprof_buf_t **get_buffer_array(unsigned int nbuf)
++{
++      size_t size = buffer_array_size(nbuf);
++
++      if (size <= PAGE_SIZE)
++              return kmalloc(size, GFP_KERNEL);
++      return vmalloc(size);
++}
++
++/*
++ * Free an array obtained from get_buffer_array().  @nbuf must be the value
++ * used for the allocation, since it decides between kfree() and vfree().
++ */
++static void release_buffer_array(xenoprof_buf_t **buf, unsigned int nbuf)
++{
++      if (buffer_array_size(nbuf) <= PAGE_SIZE)
++              kfree(buf);
++      else
++              vfree(buf);
++}
++
++
++/*
++ * Unmap the shared buffer and free the pointer array of every passive
++ * domain, then reset the passive domain count to zero.
++ */
++static void unmap_passive_list(void)
++{
++      int dom;
++
++      for (dom = 0; dom < pdomains; dom++) {
++              xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[dom]);
++              release_buffer_array(p_xenoprof_buf[dom],
++                                   passive_domains[dom].nbuf);
++      }
++
++      pdomains = 0;
++}
++
++
++/*
++ * Map the sample buffer area shared with Xen and build xenoprof_buf[], the
++ * table that maps a vcpu id to that vcpu's buffer inside the shared area.
++ *
++ * Does nothing and returns 0 if the shared area is already mapped.  Returns
++ * the error from the arch mapping routine, or -ENOMEM (after unmapping
++ * again) when the table cannot be allocated.
++ */
++static int map_xenoprof_buffer(int max_samples)
++{
++      struct xenoprof_get_buffer request;
++      int rc, idx;
++
++      if (shared_buffer.buffer)
++              return 0;
++
++      request.max_samples = max_samples;
++      rc = xenoprof_arch_map_shared_buffer(&request, &shared_buffer);
++      if (rc)
++              return rc;
++      nbuf = request.nbuf;
++
++      xenoprof_buf = get_buffer_array(nbuf);
++      if (xenoprof_buf == NULL) {
++              xenoprof_arch_unmap_shared_buffer(&shared_buffer);
++              return -ENOMEM;
++      }
++
++      /* Buffers sit back to back, bufsize bytes apart, in the shared area. */
++      for (idx = 0; idx < nbuf; idx++) {
++              struct xenoprof_buf *slot = (struct xenoprof_buf *)
++                      &shared_buffer.buffer[idx * request.bufsize];
++
++              BUG_ON(slot->vcpu_id >= nbuf);
++              xenoprof_buf[slot->vcpu_id] = slot;
++      }
++
++      return 0;
++}
++
++
++/*
++ * Prepare profiling: map the shared sample buffers, bind the per-CPU virq,
++ * configure the hypervisor side when this domain is the primary profiler,
++ * and finally enable virq delivery.
++ *
++ * Returns 0 on success with xenoprof_enabled set.  On failure the irq
++ * bindings, the shared buffer mapping and the buffer pointer array are all
++ * released and the error of the failing step is returned.
++ */
++static int xenoprof_setup(void)
++{
++      int ret;
++
++      ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES);
++      if (ret)
++              return ret;
++
++      /* bind_virq() already drops its partial bindings when it fails. */
++      ret = bind_virq();
++      if (ret)
++              goto err_unmap;
++
++      if (xenoprof_is_primary) {
++              /* Define dom0 as an active domain if not done yet */
++              if (!active_defined) {
++                      domid_t domid;
++                      ret = HYPERVISOR_xenoprof_op(
++                              XENOPROF_reset_active_list, NULL);
++                      if (ret)
++                              goto err_unbind;
++                      domid = 0;
++                      ret = HYPERVISOR_xenoprof_op(
++                              XENOPROF_set_active, &domid);
++                      if (ret)
++                              goto err_unbind;
++                      active_defined = 1;
++              }
++
++              /* A rejected backtrace depth is not fatal: fall back to none. */
++              if (oprofile_backtrace_depth > 0) {
++                      ret = HYPERVISOR_xenoprof_op(XENOPROF_set_backtrace,
++                                                   &oprofile_backtrace_depth);
++                      if (ret)
++                              oprofile_backtrace_depth = 0;
++              }
++
++              ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL);
++              if (ret)
++                      goto err_unbind;
++
++              xenoprof_arch_counter();
++              ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL);
++              if (ret)
++                      goto err_unbind;
++      }
++
++      ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL);
++      if (ret)
++              goto err_unbind;
++
++      xenoprof_enabled = 1;
++      return 0;
++
++ err_unbind:
++      unbind_virq();
++ err_unmap:
++      /*
++       * Drop the shared mapping together with the pointer array, in the
++       * same order as xenoprof_shutdown().  Freeing only the array would
++       * leave shared_buffer.buffer set, so the next map_xenoprof_buffer()
++       * would return early and xenoprof_buf would still point at freed
++       * memory.
++       * NOTE(review): relies on xenoprof_arch_unmap_shared_buffer() clearing
++       * shared_buffer.buffer, as the shutdown/setup cycle already assumes.
++       */
++      xenoprof_arch_unmap_shared_buffer(&shared_buffer);
++      release_buffer_array(xenoprof_buf, nbuf);
++      return ret;
++}
++
++
++/*
++ * oprofile ->shutdown hook: undo xenoprof_setup().
++ *
++ * Clears xenoprof_enabled, disables and unbinds the VIRQ, unmaps the
++ * shared buffer and releases the per-VCPU lookup array.  When this domain
++ * is the primary profiler the counters are released and the passive
++ * domain mappings are dropped as well.  Hypercall failures are only
++ * reported through WARN_ON().
++ */
++static void xenoprof_shutdown(void)
++{
++      xenoprof_enabled = 0;
++
++      WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL));
++
++      if (xenoprof_is_primary) {
++              WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_release_counters,
++                                             NULL));
++              /* makes the next xenoprof_setup() re-declare dom0 as active */
++              active_defined = 0;
++      }
++
++      unbind_virq();
++
++      xenoprof_arch_unmap_shared_buffer(&shared_buffer);
++      if (xenoprof_is_primary)
++              unmap_passive_list();
++      release_buffer_array(xenoprof_buf, nbuf);
++}
++
++
++/*
++ * oprofile ->start hook.  The primary profiler first issues
++ * XENOPROF_start; the architecture start hook runs only if that
++ * succeeded (or was not needed).  Returns 0 or the hypercall error.
++ */
++static int xenoprof_start(void)
++{
++      if (xenoprof_is_primary) {
++              int err = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL);
++
++              if (err)
++                      return err;
++      }
++
++      xenoprof_arch_start();
++      return 0;
++}
++
++
++/*
++ * oprofile ->stop hook.  The primary profiler issues XENOPROF_stop (a
++ * failure is only reported via WARN_ON()); the architecture stop hook is
++ * always called afterwards.
++ */
++static void xenoprof_stop(void)
++{
++      if (xenoprof_is_primary)
++              WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL));
++      xenoprof_arch_stop();
++}
++
++
++/*
++ * Replace the hypervisor's active-domain list with the @adomains ids in
++ * @active_domains, always including dom0.
++ *
++ * A non-primary profiler does nothing and returns 0.  Returns -E2BIG if
++ * more than MAX_OPROF_DOMAINS ids are given, -EINVAL if an id changes
++ * value when stored in a domid_t, or the hypercall error.  On any failure
++ * after the initial reset the list is reset again; active_defined records
++ * whether the call succeeded.
++ */
++static int xenoprof_set_active(int * active_domains,
++                             unsigned int adomains)
++{
++      domid_t domid;
++      int dom0_listed = 0;
++      int idx;
++      int ret;
++
++      if (!xenoprof_is_primary)
++              return 0;
++
++      if (adomains > MAX_OPROF_DOMAINS)
++              return -E2BIG;
++
++      ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
++      if (ret)
++              return ret;
++
++      for (idx = 0; idx < adomains; idx++) {
++              domid = active_domains[idx];
++              /* reject ids that do not round-trip through domid_t */
++              if (domid != active_domains[idx]) {
++                      ret = -EINVAL;
++                      break;
++              }
++
++              ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
++              if (ret)
++                      break;
++
++              if (active_domains[idx] == 0)
++                      dom0_listed = 1;
++      }
++
++      /* dom0 must always be active but may not be in the list */
++      if (!ret && !dom0_listed) {
++              domid = 0;
++              ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
++      }
++
++      if (ret)
++              WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list,
++                                             NULL));
++      active_defined = !ret;
++      return ret;
++}
++
++/*
++ * Replace the passive-domain list with the @pdoms ids in @p_domains.
++ *
++ * For each domain the shared buffer is mapped through
++ * xenoprof_arch_set_passive() and a vcpu_id -> buffer lookup table is
++ * built in p_xenoprof_buf[].  A non-primary profiler does nothing and
++ * returns 0.
++ *
++ * Returns 0 on success, -E2BIG if more than MAX_OPROF_DOMAINS ids are
++ * given, -ENOMEM if a lookup table cannot be obtained, or the error from
++ * the hypercall / arch hook.  On failure every domain that was already
++ * set up by this call is unmapped and its lookup table released.
++ */
++static int xenoprof_set_passive(int * p_domains,
++                                unsigned int pdoms)
++{
++      int ret;
++      unsigned int i, j;
++      struct xenoprof_buf *buf;
++
++      if (!xenoprof_is_primary)
++              return 0;
++
++      if (pdoms > MAX_OPROF_DOMAINS)
++              return -E2BIG;
++
++      ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL);
++      if (ret)
++              return ret;
++      unmap_passive_list();
++
++      for (i = 0; i < pdoms; i++) {
++              passive_domains[i].domain_id = p_domains[i];
++              passive_domains[i].max_samples = 2048;
++              ret = xenoprof_arch_set_passive(&passive_domains[i],
++                                              &p_shared_buffer[i]);
++              if (ret)
++                      goto out;
++
++              p_xenoprof_buf[i] = get_buffer_array(passive_domains[i].nbuf);
++              if (!p_xenoprof_buf[i]) {
++                      /* slot i is already mapped: include it in cleanup */
++                      ++i;
++                      ret = -ENOMEM;
++                      goto out;
++              }
++
++              for (j = 0; j < passive_domains[i].nbuf; j++) {
++                      buf = (struct xenoprof_buf *)
++                              &p_shared_buffer[i].buffer[
++                              j * passive_domains[i].bufsize];
++                      BUG_ON(buf->vcpu_id >= passive_domains[i].nbuf);
++                      p_xenoprof_buf[i][buf->vcpu_id] = buf;
++              }
++      }
++
++      pdomains = pdoms;
++      return 0;
++
++out:
++      /*
++       * Unwind slots [0, i).  The loop must index with j: using i here
++       * would hit the same (possibly never initialised) slot repeatedly
++       * and leave the earlier domains mapped.
++       */
++      for (j = 0; j < i; j++) {
++              xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[j]);
++              release_buffer_array(p_xenoprof_buf[j],
++                                   passive_domains[j].nbuf);
++      }
++
++      return ret;
++}
++
++
++/*
++ * Placeholder ->backtrace callback so that oprofile has one to install.
++ * The real backtrace is done in Xen; reaching this function is a bug.
++ */
++static void xenoprof_dummy_backtrace(struct pt_regs * const regs,
++                                   unsigned int depth)
++{
++      /* this should never be called */
++      BUG();
++}
++
++
++/*
++ * oprofile operations backed by the Xen hypervisor.  xenoprofile_init()
++ * fills in .cpu_type and copies this table into the caller's ops.
++ * .create_files is only wired up when HAVE_XENOPROF_CREATE_FILES is
++ * defined.
++ */
++static struct oprofile_operations xenoprof_ops = {
++#ifdef HAVE_XENOPROF_CREATE_FILES
++      .create_files   = xenoprof_create_files,
++#endif
++      .set_active     = xenoprof_set_active,
++      .set_passive    = xenoprof_set_passive,
++      .setup          = xenoprof_setup,
++      .shutdown       = xenoprof_shutdown,
++      .start          = xenoprof_start,
++      .stop           = xenoprof_stop,
++      .backtrace      = xenoprof_dummy_backtrace
++};
++
++
++/* in order to get driverfs right */
++static int using_xenoprof;
++
++/*
++ * Ask the hypervisor to initialise xenoprof and, on success, hand the
++ * Xen-backed operations to oprofile through @ops.
++ *
++ * On success this records whether this domain is the primary profiler,
++ * copies the CPU type string reported by Xen, registers the driverfs
++ * entries, marks every ovf_irq[] slot as unbound (-1) and clears
++ * active_defined.
++ *
++ * Returns the result of the XENOPROF_init hypercall (0 on success).
++ */
++int __init xenoprofile_init(struct oprofile_operations * ops)
++{
++      /*
++       * Start from a zeroed structure: the summary message below prints
++       * init.num_events even when the hypercall failed, and must not
++       * read an indeterminate stack value in that case.
++       */
++      struct xenoprof_init init = { .num_events = 0 };
++      unsigned int i;
++      int ret;
++
++      ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
++      if (!ret) {
++              xenoprof_arch_init_counter(&init);
++              xenoprof_is_primary = init.is_primary;
++
++              /*
++               * cpu_type is detected by Xen.  The terminator is written
++               * first and strncpy() is limited to SIZE - 1 bytes, so the
++               * copy can never overwrite it.
++               */
++              cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0;
++              strncpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE - 1);
++              xenoprof_ops.cpu_type = cpu_type;
++
++              init_driverfs();
++              using_xenoprof = 1;
++              *ops = xenoprof_ops;
++
++              for (i = 0; i < NR_CPUS; i++)
++                      ovf_irq[i] = -1;
++
++              active_defined = 0;
++      }
++
++      pr_info("%s: ret %d, events %d, xenoprof_is_primary %d\n",
++              __func__, ret, init.num_events, xenoprof_is_primary);
++      return ret;
++}
++
++
++/*
++ * Module teardown: remove the driverfs entries if they were registered,
++ * unmap the shared buffer and, on the primary profiler, drop the passive
++ * domain mappings and tell the hypervisor to shut xenoprof down.
++ */
++void xenoprofile_exit(void)
++{
++      if (using_xenoprof)
++              exit_driverfs();
++
++      xenoprof_arch_unmap_shared_buffer(&shared_buffer);
++
++      if (xenoprof_is_primary) {
++              unmap_passive_list();
++              WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL));
++      }
++}
diff --cc fs/Kconfig

index bdd20ed,f3aa9b0..8d4cc1a
--- 1/fs/Kconfig
--- 2/fs/Kconfig
+++ b/fs/Kconfig
@@@ -158,6 -138,6 +141,7 @@@ config HUGETLBF
         bool "HugeTLB file system support"
         depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
                    SYS_SUPPORTS_HUGETLBFS || BROKEN
++      depends on !XEN
         help
           hugetlbfs is a filesystem backing for HugeTLB pages, based on
           ramfs. For architectures that support it, say Y here and read
diff --cc fs/aio.c

index e29ec48,e29ec48..719c807
--- 1/fs/aio.c
--- 2/fs/aio.c
+++ b/fs/aio.c
@@@ -39,6 -39,6 +39,11 @@@
   #include <asm/kmap_types.h>
   #include <asm/uaccess.h>
   
++#ifdef CONFIG_EPOLL
++#include <linux/poll.h>
++#include <linux/anon_inodes.h>
++#endif
++
   #if DEBUG > 1
   #define dprintk               printk
   #else
@@@ -991,6 -991,6 +996,11 @@@ put_rq
         if (waitqueue_active(&ctx->wait))
                 wake_up(&ctx->wait);
   
++#ifdef CONFIG_EPOLL
++      if (ctx->file && waitqueue_active(&ctx->poll_wait))
++              wake_up(&ctx->poll_wait);
++#endif
++
         spin_unlock_irqrestore(&ctx->ctx_lock, flags);
         return ret;
   }
@@@ -999,6 -999,6 +1009,8 @@@ EXPORT_SYMBOL(aio_complete)
   /* aio_read_evt
    *    Pull an event off of the ioctx's event ring.  Returns the number of 
    *    events fetched (0 or 1 ;-)
++ *    If ent parameter is 0, just returns the number of events that would
++ *    be fetched.
    *    FIXME: make this use cmpxchg.
    *    TODO: make the ringbuffer user mmap()able (requires FIXME).
    */
@@@ -1021,13 -1021,13 +1033,18 @@@ static int aio_read_evt(struct kioctx *
   
         head = ring->head % info->nr;
         if (head != ring->tail) {
--              struct io_event *evp = aio_ring_event(info, head, KM_USER1);
--              *ent = *evp;
--              head = (head + 1) % info->nr;
--              smp_mb(); /* finish reading the event before updatng the head */
--              ring->head = head;
--              ret = 1;
--              put_aio_ring_event(evp, KM_USER1);
++              if (ent) { /* event requested */
++                      struct io_event *evp =
++                              aio_ring_event(info, head, KM_USER1);
++                      *ent = *evp;
++                      head = (head + 1) % info->nr;
++                      /* finish reading the event before updatng the head */
++                      smp_mb();
++                      ring->head = head;
++                      ret = 1;
++                      put_aio_ring_event(evp, KM_USER1);
++              } else /* only need to know availability */
++                      ret = 1;
         }
         spin_unlock(&info->ring_lock);
   
@@@ -1212,6 -1212,6 +1229,14 @@@ static void io_destroy(struct kioctx *i
   
         aio_cancel_all(ioctx);
         wait_for_all_aios(ioctx);
++#ifdef CONFIG_EPOLL
++      /*
++       * Forget the poll file, but it's up to the user to close it.
++       * Detach the file from the context before dropping our reference:
++       * fput() may release the last reference, after which the file must
++       * not be touched any more, and its ->release handler must not find
++       * a context pointer in private_data and drop the reference again.
++       */
++      if (ioctx->file) {
++              struct file *file = ioctx->file;
++
++              ioctx->file = NULL;
++              file->private_data = NULL;
++              fput(file);
++      }
++#endif
   
         /*
          * Wake up any waiters.  The setting of ctx->dead must be seen
@@@ -1222,6 -1222,6 +1247,70 @@@
         put_ioctx(ioctx);       /* once for the lookup */
   }
   
++#ifdef CONFIG_EPOLL
++
++/*
++ * ->release for the aio queue fd.  If the file is still attached to an
++ * io context, detach both directions (file->private_data and, under
++ * ctx_lock, ioctx->file) and drop a file reference.  Always returns 0.
++ *
++ * NOTE(review): make_aio_fd() takes the reference stored in ioctx->file
++ * with fget(); whether a ->release handler can legitimately still see a
++ * non-NULL private_data and call fput() on the file being released should
++ * be confirmed against io_destroy().
++ */
++static int aio_queue_fd_close(struct inode *inode, struct file *file)
++{
++      struct kioctx *ioctx = file->private_data;
++      if (ioctx) {
++              file->private_data = 0;
++              spin_lock_irq(&ioctx->ctx_lock);
++              ioctx->file = 0;
++              spin_unlock_irq(&ioctx->ctx_lock);
++              fput(file);
++      }
++      return 0;
++}
++
++/*
++ * ->poll for the aio queue fd.  Registers the caller on the context's
++ * poll wait queue and reports POLLIN | POLLRDNORM when the event ring
++ * holds at least one event.  Returns 0 if the file is no longer attached
++ * to an io context.
++ */
++static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait)
++{
++      unsigned int pollflags = 0;
++      struct kioctx *ioctx = file->private_data;
++
++      if (ioctx) {
++              /*
++               * Insert inside our poll wait queue.  This is done before
++               * taking ctx_lock: poll_wait() calls the poll table's
++               * queueing callback, which may allocate memory and sleep,
++               * so it must not run under a spinlock with interrupts off.
++               * Registering first and checking afterwards cannot miss a
++               * wakeup.
++               */
++              poll_wait(file, &ioctx->poll_wait, wait);
++
++              spin_lock_irq(&ioctx->ctx_lock);
++              /* Check our condition; a 0 event pointer only probes */
++              if (aio_read_evt(ioctx, 0))
++                      pollflags = POLLIN | POLLRDNORM;
++              spin_unlock_irq(&ioctx->ctx_lock);
++      }
++
++      return pollflags;
++}
++
++/* File operations of the pollable aio queue fd created by make_aio_fd(). */
++static const struct file_operations aioq_fops = {
++      .release        = aio_queue_fd_close,
++      .poll           = aio_queue_fd_poll
++};
++
++/* make_aio_fd:
++ *  Create a file descriptor that can be used to poll the event queue.
++ *  Based on the excellent epoll code.
++ *
++ *  Returns the new fd, the error from anon_inode_getfd(), or -EBADF if
++ *  the freshly created file cannot be looked up again.  On success the
++ *  context holds its own reference on the file in ioctx->file.
++ */
++
++static int make_aio_fd(struct kioctx *ioctx)
++{
++      int fd;
++      struct file *file;
++
++      /*
++       * The wait queue must be usable before the fd becomes visible to
++       * user space and before ioctx->file is set: both ->poll and the
++       * completion path touch ioctx->poll_wait as soon as they can see
++       * the file.
++       */
++      init_waitqueue_head(&ioctx->poll_wait);
++
++      fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
++      if (fd < 0)
++              return fd;
++
++      /* associate the file with the IO context */
++      file = fget(fd);
++      if (!file)
++              return -EBADF;
++      file->private_data = ioctx;
++      ioctx->file = file;
++      return fd;
++}
++#endif
++
   /* sys_io_setup:
    *    Create an aio_context capable of receiving at least nr_events.
    *    ctxp must not point to an aio_context that already exists, and
@@@ -1234,18 -1234,18 +1323,30 @@@
    *    resources are available.  May fail with -EFAULT if an invalid
    *    pointer is passed for ctxp.  Will fail with -ENOSYS if not
    *    implemented.
++ *
++ *    To request a selectable fd, the user context has to be initialized
++ *    to 1, instead of 0, and the return value is the fd.
++ *    This keeps the system call compatible, since a non-zero value
++ *    was not allowed so far.
    */
   SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
   {
         struct kioctx *ioctx = NULL;
         unsigned long ctx;
         long ret;
++      int make_fd = 0;
   
         ret = get_user(ctx, ctxp);
         if (unlikely(ret))
                 goto out;
   
         ret = -EINVAL;
++#ifdef CONFIG_EPOLL
++      if (ctx == 1) {
++              make_fd = 1;
++              ctx = 0;
++      }
++#endif
         if (unlikely(ctx || nr_events == 0)) {
                 pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
                          ctx, nr_events);
@@@ -1256,8 -1256,8 +1357,12 @@@
         ret = PTR_ERR(ioctx);
         if (!IS_ERR(ioctx)) {
                 ret = put_user(ioctx->user_id, ctxp);
--              if (!ret)
--                      return 0;
++#ifdef CONFIG_EPOLL
++              if (make_fd && ret >= 0)
++                      ret = make_aio_fd(ioctx);
++#endif
++              if (ret >= 0)
++                      return ret;
   
                 get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
                 io_destroy(ioctx);
diff --cc fs/btrfs/extent_io.c

index c5d9fbb,ba41da5..21d65ad
--- 1/fs/btrfs/extent_io.c
--- 2/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@@ -10,8 -10,6 +10,7 @@@
   #include <linux/swap.h>
   #include <linux/writeback.h>
   #include <linux/pagevec.h>
- #include <linux/prefetch.h>
- #include <linux/cleancache.h>
++#include <linux/precache.h>
   #include "extent_io.h"
   #include "extent_map.h"
   #include "compat.h"
@@@ -1971,13 -2015,6 +2016,13 @@@ static int __extent_read_full_page(stru
   
         set_page_extent_mapped(page);
   
+ +      if (!PageUptodate(page)) {
-               if (cleancache_get_page(page) == 0) {
++              if (precache_get(page->mapping, page->index, page) == 1) {
+ +                      BUG_ON(blocksize != PAGE_SIZE);
+ +                      goto out;
+ +              }
+ +      }
+ +
         end = page_end;
         while (1) {
                 lock_extent(tree, start, end, GFP_NOFS);
@@@ -2109,9 -2146,8 +2154,9 @@@
                 if (ret)
                         SetPageError(page);
                 cur = cur + iosize;
-               pg_offset += iosize;
+               page_offset += iosize;
         }
+ +out:
         if (!nr) {
                 if (!PageError(page))
                         SetPageUptodate(page);
diff --cc fs/btrfs/super.c

index 9b2e7e5,0ac712e..b810ee3
--- 1/fs/btrfs/super.c
--- 2/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@@ -39,9 -39,7 +39,8 @@@
   #include <linux/miscdevice.h>
   #include <linux/magic.h>
   #include <linux/slab.h>
- #include <linux/cleancache.h>
++#include <linux/precache.h>
   #include "compat.h"
- #include "delayed-inode.h"
   #include "ctree.h"
   #include "disk-io.h"
   #include "transaction.h"
@@@ -633,7 -624,6 +625,7 @@@ static int btrfs_fill_super(struct supe
         sb->s_root = root_dentry;
   
         save_mount_options(sb, data);
-       cleancache_init_fs(sb);
++      precache_init(sb);
         return 0;
   
   fail_close:
diff --cc fs/buffer.c

index 49c9aad,a08bb8e..bbe6bea
--- 1/fs/buffer.c
--- 2/fs/buffer.c
+++ b/fs/buffer.c
@@@ -41,7 -41,6 +41,7 @@@
   #include <linux/bitops.h>
   #include <linux/mpage.h>
   #include <linux/bit_spinlock.h>
- #include <linux/cleancache.h>
++#include <linux/precache.h>
   
   static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
   
@@@ -270,10 -269,6 +270,11 @@@ void invalidate_bdev(struct block_devic
         invalidate_bh_lrus();
         lru_add_drain_all();    /* make sure all lru add caches are flushed */
         invalidate_mapping_pages(mapping, 0, -1);
-       /* 99% of the time, we don't need to flush the cleancache on the bdev.
++
++      /* 99% of the time, we don't need to flush the precache on the bdev.
+ +       * But, for the strange corners, lets be cautious
+ +       */
-       cleancache_flush_inode(mapping);
++      precache_flush_inode(mapping);
   }
   EXPORT_SYMBOL(invalidate_bdev);
   
diff --cc fs/compat_ioctl.c

index 61abb63,61abb63..217d14f
--- 1/fs/compat_ioctl.c
--- 2/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@@ -114,6 -114,6 +114,13 @@@
   #include <asm/fbio.h>
   #endif
   
++#ifdef CONFIG_XEN
++#include <xen/interface/xen.h>
++#include <xen/public/evtchn.h>
++#include <xen/public/privcmd.h>
++#include <xen/compat_ioctl.h>
++#endif
++
   static int w_long(unsigned int fd, unsigned int cmd,
                 compat_ulong_t __user *argp)
   {
@@@ -1408,6 -1408,6 +1415,16 @@@ IGNORE_IOCTL(FBIOGETCMAP32
   IGNORE_IOCTL(FBIOSCURSOR32)
   IGNORE_IOCTL(FBIOGCURSOR32)
   #endif
++
++#ifdef CONFIG_XEN
++COMPATIBLE_IOCTL(IOCTL_PRIVCMD_HYPERCALL)
++COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_VIRQ)
++COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_INTERDOMAIN)
++COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_UNBOUND_PORT)
++COMPATIBLE_IOCTL(IOCTL_EVTCHN_UNBIND)
++COMPATIBLE_IOCTL(IOCTL_EVTCHN_NOTIFY)
++COMPATIBLE_IOCTL(IOCTL_EVTCHN_RESET)
++#endif
   };
   
   /*
@@@ -1464,6 -1464,6 +1481,12 @@@ static long do_ioctl_trans(int fd, unsi
                 return do_video_stillpicture(fd, cmd, argp);
         case VIDEO_SET_SPU_PALETTE:
                 return do_video_set_spu_palette(fd, cmd, argp);
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++      case IOCTL_PRIVCMD_MMAP_32:
++      case IOCTL_PRIVCMD_MMAPBATCH_32:
++      case IOCTL_PRIVCMD_MMAPBATCH_V2_32:
++              return privcmd_ioctl_32(fd, cmd, argp);
++#endif
         }
   
         /*
diff --cc fs/ext3/super.c

index 65dec9e,3c6a9e0..c97482e
--- 1/fs/ext3/super.c
--- 2/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@@ -36,7 -36,6 +36,7 @@@
   #include <linux/quotaops.h>
   #include <linux/seq_file.h>
   #include <linux/log2.h>
- #include <linux/cleancache.h>
++#include <linux/precache.h>
   
   #include <asm/uaccess.h>
   
@@@ -1368,7 -1367,6 +1368,7 @@@ static int ext3_setup_super(struct supe
         } else {
                 ext3_msg(sb, KERN_INFO, "using internal journal");
         }
-       cleancache_init_fs(sb);
++      precache_init(sb);
         return res;
   }
   
diff --cc fs/ext4/Makefile
Simple merge
diff --cc fs/ext4/ext4.h
Simple merge
diff --cc fs/ext4/file.c
Simple merge
diff --cc fs/ext4/inode.c
Simple merge
diff --cc fs/ext4/namei.c
Simple merge
diff --cc fs/ext4/super.c

index db13f00,8553dfb..9e42e0d
--- 1/fs/ext4/super.c
--- 2/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@@ -38,7 -38,6 +38,7 @@@
   #include <linux/ctype.h>
   #include <linux/log2.h>
   #include <linux/crc16.h>
- #include <linux/cleancache.h>
++#include <linux/precache.h>
   #include <asm/uaccess.h>
   
   #include <linux/kthread.h>
@@@ -1972,7 -1932,6 +1956,8 @@@ static int ext4_setup_super(struct supe
                         EXT4_INODES_PER_GROUP(sb),
                         sbi->s_mount_opt, sbi->s_mount_opt2);
   
-       cleancache_init_fs(sb);
++      precache_init(sb);
++
         return res;
   }
   
diff --cc fs/ext4/xattr.c
Simple merge
diff --cc fs/mpage.c

index fdfae9f,0afc809..4d4d585
--- 1/fs/mpage.c
--- 2/fs/mpage.c
+++ b/fs/mpage.c
@@@ -27,7 -27,6 +27,7 @@@
   #include <linux/writeback.h>
   #include <linux/backing-dev.h>
   #include <linux/pagevec.h>
- #include <linux/cleancache.h>
++#include <linux/precache.h>
   
   /*
    * I/O completion handler for multipage BIOs.
@@@ -272,12 -271,6 +272,12 @@@ do_mpage_readpage(struct bio *bio, stru
                 SetPageMappedToDisk(page);
         }
   
+ +      if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
-           cleancache_get_page(page) == 0) {
++          precache_get(page->mapping, page->index, page) == 1) {
+ +              SetPageUptodate(page);
+ +              goto confused;
+ +      }
+ +
         /*
          * This page will go to BIO.  Do we need to send this BIO off first?
          */
diff --cc fs/namei.c
Simple merge
diff --cc fs/nfs/dir.c
Simple merge
diff --cc fs/nfs/inode.c
Simple merge
diff --cc fs/novfs/inode.c

index d753792,0000000..0f2b043

mode 100644,000000..100644
--- 1/fs/novfs/inode.c
--- /dev/null
+++ b/fs/novfs/inode.c
@@@ -1,4072 -1,0 +1,4104 @@@
+ +/*
+ + * Novell NCP Redirector for Linux
+ + * Author: James Turner
+ + *
+ + * This file contains functions used to control access to the Linux file
+ + * system.
+ + *
+ + * Copyright (C) 2005 Novell, Inc.
+ + *
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + */
+ +
+ +#include <linux/module.h>
+ +#include <linux/init.h>
+ +#include <linux/fs.h>
+ +#include <linux/dcache.h>
+ +#include <linux/mount.h>
+ +#include <linux/pagemap.h>
+ +#include <linux/string.h>
+ +#include <linux/slab.h>
+ +#include <linux/unistd.h>
+ +#include <asm/statfs.h>
+ +#include <asm/uaccess.h>
+ +#include <linux/ctype.h>
+ +#include <linux/statfs.h>
+ +#include <linux/pagevec.h>
+ +#include <linux/writeback.h>
+ +#include <linux/backing-dev.h>
+ +#include <linux/mm.h>
+ +#include <linux/file.h>
+ +
+ +/*===[ Include files specific to this module ]============================*/
+ +#include "vfs.h"
+ +
 + +/*
 + + * Per-inode private data kept by novfs.
 + + * NOTE(review): Name[1] is the old-style trailing variable-length buffer;
 + + * presumably instances are allocated with extra room for the full name --
 + + * confirm at the allocation sites before changing its declaration.
 + + */
 + +struct inode_data {
 + +      void *Scope;
 + +      unsigned long Flags;
 + +      struct list_head IList;         /* presumably linkage on InodeList -- TODO confirm */
 + +      struct inode *Inode;
 + +      unsigned long cntDC;
 + +      struct list_head DirCache;
 + +      struct mutex DirCacheLock;
 + +      void *FileHandle;
 + +      int CacheFlag;
 + +      char Name[1];           /* Needs to be last entry */
 + +};
+ +
+ +#define FILE_UPDATE_TIMEOUT   2
+ +
+ +/*===[ Function prototypes ]=============================================*/
+ +
+ +static unsigned long novfs_internal_hash(struct qstr *name);
+ +static int novfs_d_add(struct dentry *p, struct dentry *d, struct inode *i, int add);
+ +
+ +static void novfs_kill_sb(struct super_block *SB);
+ +
+ +/*
+ + * Declared dentry_operations
+ + */
+ +int novfs_d_revalidate(struct dentry *, struct nameidata *);
+ +int novfs_d_hash(const struct dentry *, const struct inode *, struct qstr *);
+ +int novfs_d_compare(const struct dentry *, const struct inode *,
+ +                  const struct dentry *, const struct inode *,
+ +                  unsigned int, const char *, const struct qstr *);
+ +int novfs_d_delete(struct dentry *dentry);
+ +void novfs_d_release(struct dentry *dentry);
+ +void novfs_d_iput(struct dentry *dentry, struct inode *inode);
+ +
+ +/*
+ + * Declared directory operations
+ + */
+ +int novfs_dir_open(struct inode *inode, struct file *file);
+ +int novfs_dir_release(struct inode *inode, struct file *file);
+ +loff_t novfs_dir_lseek(struct file *file, loff_t offset, int origin);
+ +ssize_t novfs_dir_read(struct file *file, char *buf, size_t len, loff_t * off);
+ +void addtodentry(struct dentry *Parent, unsigned char *List, int Level);
+ +int novfs_filldir(void *data, const char *name, int namelen, loff_t off, ino_t ino, unsigned ftype);
+ +int novfs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir);
+ +int novfs_dir_fsync(struct file *file, int datasync);
+ +
+ +/*
+ + * Declared address space operations
+ + */
+ +int novfs_a_writepage(struct page *page, struct writeback_control *wbc);
+ +int novfs_a_writepages(struct address_space *mapping, struct writeback_control *wbc);
+ +int novfs_a_write_begin(struct file *file, struct address_space *mapping,
+ +                      loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata);
+ +int novfs_a_write_end(struct file *file, struct address_space *mapping,
+ +                    loff_t pos, unsigned len, unsigned copied, struct page *pagep, void *fsdata);
+ +int novfs_a_readpage(struct file *file, struct page *page);
+ +int novfs_a_readpages(struct file *file, struct address_space *mapping, struct list_head *page_lst, unsigned nr_pages);
+ +ssize_t novfs_a_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
+ +
+ +/*
+ + * Declared file_operations
+ + */
+ +ssize_t novfs_f_read(struct file *, char *, size_t, loff_t *);
+ +ssize_t novfs_f_write(struct file *, const char *, size_t, loff_t *);
+ +int novfs_f_readdir(struct file *, void *, filldir_t);
+ +int novfs_f_mmap(struct file *file, struct vm_area_struct *vma);
+ +int novfs_f_open(struct inode *, struct file *);
+ +int novfs_f_flush(struct file *, fl_owner_t);
+ +int novfs_f_release(struct inode *, struct file *);
+ +int novfs_f_fsync(struct file *, int datasync);
+ +int novfs_f_lock(struct file *, int, struct file_lock *);
+ +
+ +/*
+ + * Declared inode_operations
+ + */
+ +int novfs_i_create(struct inode *, struct dentry *, int, struct nameidata *);
+ +struct dentry *novfs_i_lookup(struct inode *, struct dentry *, struct nameidata *);
+ +int novfs_i_mkdir(struct inode *, struct dentry *, int);
+ +int novfs_i_unlink(struct inode *dir, struct dentry *dentry);
+ +int novfs_i_rmdir(struct inode *, struct dentry *);
+ +int novfs_i_mknod(struct inode *, struct dentry *, int, dev_t);
+ +int novfs_i_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
+ +int novfs_i_setattr(struct dentry *, struct iattr *);
+ +int novfs_i_getattr(struct vfsmount *mnt, struct dentry *, struct kstat *);
+ +int novfs_i_revalidate(struct dentry *dentry);
+ +
+ +/*
+ + * Extended attributes operations
+ + */
+ +
+ +ssize_t novfs_i_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size);
+ +int novfs_i_setxattr(struct dentry *dentry, const char *name, const void *value, size_t value_size, int flags);
+ +ssize_t novfs_i_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
+ +
+ +void update_inode(struct inode *Inode, struct novfs_entry_info *Info);
+ +
+ +/*
+ + * Declared super_operations
+ + */
+ +void novfs_read_inode(struct inode *inode);
+ +void novfs_write_inode(struct inode *inode);
+ +int novfs_notify_change(struct dentry *dentry, struct iattr *attr);
+ +void novfs_evict_inode(struct inode *inode);
+ +int novfs_show_options(struct seq_file *s, struct vfsmount *m);
+ +
+ +int novfs_statfs(struct dentry *de, struct kstatfs *buf);
+ +
+ +/*
+ + * Declared control interface functions
+ + */
+ +ssize_t novfs_control_Read(struct file *file, char *buf, size_t nbytes, loff_t * ppos);
+ +
+ +ssize_t novfs_control_write(struct file *file, const char *buf, size_t nbytes, loff_t * ppos);
+ +
+ +int __init init_novfs(void);
+ +void __exit exit_novfs(void);
+ +
+ +int novfs_lock_inode_cache(struct inode *i);
+ +void novfs_unlock_inode_cache(struct inode *i);
+ +int novfs_enumerate_inode_cache(struct inode *i, struct list_head **iteration, ino_t * ino, struct novfs_entry_info *info);
+ +int novfs_get_entry(struct inode *i, struct qstr *name, ino_t * ino, struct novfs_entry_info *info);
+ +int novfs_get_entry_by_pos(struct inode *i, loff_t pos, ino_t * ino, struct novfs_entry_info *info);
+ +int novfs_get_entry_time(struct inode *i, struct qstr *name, ino_t * ino, struct novfs_entry_info *info, u64 * EntryTime);
+ +int novfs_get_remove_entry(struct inode *i, ino_t * ino, struct novfs_entry_info *info);
+ +void novfs_invalidate_inode_cache(struct inode *i);
+ +struct novfs_dir_cache *novfs_lookup_inode_cache(struct inode *i, struct qstr *name, ino_t ino);
+ +int novfs_lookup_validate(struct inode *i, struct qstr *name, ino_t ino);
+ +int novfs_add_inode_entry(struct inode *i, struct qstr *name, ino_t ino, struct novfs_entry_info *info);
+ +int novfs_update_entry(struct inode *i, struct qstr *name, ino_t ino, struct novfs_entry_info *info);
+ +void novfs_remove_inode_entry(struct inode *i, struct qstr *name, ino_t ino);
+ +void novfs_free_invalid_entries(struct inode *i);
+ +void novfs_free_inode_cache(struct inode *i);
+ +
+ +/*===[ Global variables ]=================================================*/
 + +/* Dentry callbacks for novfs; d_delete is left unset (entry commented out). */
 + +struct dentry_operations novfs_dentry_operations = {
 + +      .d_revalidate = novfs_d_revalidate,
 + +      .d_hash = novfs_d_hash,
 + +      .d_compare = novfs_d_compare,
 + +      //.d_delete      = novfs_d_delete,
 + +      .d_release = novfs_d_release,
 + +      .d_iput = novfs_d_iput,
 + +};
+ +
 + +/* File operations table built from the novfs_dir_* handlers. */
 + +struct file_operations novfs_dir_operations = {
 + +      .owner = THIS_MODULE,
 + +      .open = novfs_dir_open,
 + +      .release = novfs_dir_release,
 + +      .llseek = novfs_dir_lseek,
 + +      .read = novfs_dir_read,
 + +      .readdir = novfs_dir_readdir,
 + +      .fsync = novfs_dir_fsync,
 + +};
+ +
 + +/*
 + + * File operations table built from the novfs_f_* handlers; seeking uses
 + + * the generic generic_file_llseek() helper.
 + + */
 + +static struct file_operations novfs_file_operations = {
 + +      .owner = THIS_MODULE,
 + +      .read = novfs_f_read,
 + +      .write = novfs_f_write,
 + +      .readdir = novfs_f_readdir,
 + +      .mmap = novfs_f_mmap,
 + +      .open = novfs_f_open,
 + +      .flush = novfs_f_flush,
 + +      .release = novfs_f_release,
 + +      .fsync = novfs_f_fsync,
 + +      .llseek = generic_file_llseek,
 + +      .lock = novfs_f_lock,
 + +};
+ +
 + +/*
 + + * Minimal address-space operations: only readpage is provided.
 + + * NOTE(review): presumably selected when novfs_page_cache is off -- confirm.
 + + */
 + +static struct address_space_operations novfs_nocache_aops = {
 + +      .readpage = novfs_a_readpage,
 + +};
+ +
 + +/*
 + + * Backing device description for novfs mappings: read-ahead window of
 + + * VM_MAX_READAHEAD * 1024 bytes expressed in PAGE_CACHE_SIZE units, with
 + + * the no-writeback and map-copy capability flags set.
 + + */
 + +struct backing_dev_info novfs_backing_dev_info = {
 + +      .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
 + +      .state = 0,
 + +      .capabilities = BDI_CAP_NO_WRITEBACK | BDI_CAP_MAP_COPY,
 + +};
+ +
 + +/*
 + + * Full address-space operations: page read/write, write_begin/write_end
 + + * and direct I/O go to the novfs_a_* handlers; dirtying uses the generic
 + + * __set_page_dirty_nobuffers() helper.
 + + */
 + +static struct address_space_operations novfs_aops = {
 + +      .readpage = novfs_a_readpage,
 + +      .readpages = novfs_a_readpages,
 + +      .writepage = novfs_a_writepage,
 + +      .writepages = novfs_a_writepages,
 + +      .write_begin = novfs_a_write_begin,
 + +      .write_end = novfs_a_write_end,
 + +      .set_page_dirty = __set_page_dirty_nobuffers,
 + +      .direct_IO = novfs_a_direct_IO,
 + +};
+ +
 + +/*
 + + * Inode operations with namespace callbacks (create, lookup, unlink,
 + + * mkdir, rmdir, mknod, rename) in addition to the attribute and extended
 + + * attribute handlers.
 + + */
 + +static struct inode_operations novfs_inode_operations = {
 + +      .create = novfs_i_create,
 + +      .lookup = novfs_i_lookup,
 + +      .unlink = novfs_i_unlink,
 + +      .mkdir = novfs_i_mkdir,
 + +      .rmdir = novfs_i_rmdir,
 + +      .mknod = novfs_i_mknod,
 + +      .rename = novfs_i_rename,
 + +      .setattr = novfs_i_setattr,
 + +      .getattr = novfs_i_getattr,
 + +      .getxattr = novfs_i_getxattr,
 + +      .setxattr = novfs_i_setxattr,
 + +      .listxattr = novfs_i_listxattr,
 + +};
+ +
 + +/*
 + + * Inode operations limited to the attribute and extended attribute
 + + * handlers (no namespace callbacks).
 + + */
 + +static struct inode_operations novfs_file_inode_operations = {
 + +      .setattr = novfs_i_setattr,
 + +      .getattr = novfs_i_getattr,
 + +      .getxattr = novfs_i_getxattr,
 + +      .setxattr = novfs_i_setxattr,
 + +      .listxattr = novfs_i_listxattr,
 + +};
+ +
 + +/*
 + + * Superblock operations: statfs, inode eviction and mount-option display
 + + * are novfs handlers; drop_inode uses generic_delete_inode().
 + + */
 + +static struct super_operations novfs_ops = {
 + +      .statfs = novfs_statfs,
 + +      .evict_inode = novfs_evict_inode,
 + +      .drop_inode = generic_delete_inode,
 + +      .show_options = novfs_show_options,
 + +
 + +};
+ +
++/* Not currently used
++static struct file_operations novfs_Control_operations = {
++   .read    = novfs_Control_read,
++   .write   = novfs_Control_write,
++};
++*/
++
+ +static atomic_t novfs_Inode_Number = ATOMIC_INIT(0); /* source of new inode numbers: callers take atomic_inc_return() of this counter */
+ +
+ +struct dentry *novfs_root = NULL; /* dentry under which novfs_add_to_root()/novfs_remove_from_root() operate */
+ +char *novfs_current_mnt = NULL; /* not referenced in this part of the file */
+ +
+ +DEFINE_MUTEX(InodeList_lock); /* presumably protects InodeList (inferred from the name - confirm) */
+ +
+ +LIST_HEAD(InodeList);
+ +
+ +DEFINE_MUTEX(TimeDir_Lock); /* held by novfs_dir_readdir() while it reads and updates inHAX / inHAXTime */
+ +uint64_t lastTime; /* not referenced in this part of the file */
+ +char lastDir[PATH_MAX]; /* not referenced in this part of the file */
+ +
+ +uint64_t inHAXTime; /* get_nanosecond_time() value recorded by the "xover" hack in novfs_dir_readdir() */
+ +int inHAX; /* non-zero while the "xover" hack in novfs_dir_readdir() is active */
+ +
+ +unsigned long InodeCount = 0, DCCount = 0; /* not referenced in this part of the file */
+ +unsigned long novfs_update_timeout = FILE_UPDATE_TIMEOUT; /* multiplied by HZ when computing dentry->d_time, i.e. expressed in seconds */
+ +int novfs_page_cache = 0; /* not referenced in this part of the file */
+ +
+ +struct file_private { /* per-open directory state, stored in file->private_data by novfs_dir_open() */
+ +      int listedall; /* non-zero makes novfs_dir_readdir() return 0 once "." and ".." have been emitted */
+ +      void *enumHandle; /* handed to novfs_end_directory_enumerate(); NULL and (void *)-1 are treated as "nothing to end" */
+ +};
+ +
+ +/*
+ + * Dump the fields of dentry @d to the debug log, prefixed with label @s.
+ + *
+ + * Embedded structures (d_lock, d_hash) are reported by address, and every
+ + * conversion specifier matches its argument type: %p for pointers, %lx for
+ + * the unsigned long d_time.  The previous version passed whole structures
+ + * and pointers to %x, which is undefined for a varargs call.
+ + *
+ + * @d must not be NULL and must still be referenced by the caller.
+ + */
+ +static void PRINT_DENTRY(const char *s, struct dentry *d)
+ +{
+ +      __DbgPrint("%s: 0x%p\n", s, d);
+ +      __DbgPrint("   d_count:      0x%x\n", d->d_count);
+ +      __DbgPrint("   d_lock:       0x%p\n", &d->d_lock);
+ +      __DbgPrint("   d_inode:      0x%p\n", d->d_inode);
+ +      __DbgPrint("   d_lru:        0x%p\n"
+ +                 "      next:      0x%p\n" "      prev:      0x%p\n", &d->d_lru, d->d_lru.next, d->d_lru.prev);
+ +      __DbgPrint("   d_child:      0x%p\n" "      next:      0x%p\n"
+ +                 "      prev:      0x%p\n", &d->d_u.d_child, d->d_u.d_child.next, d->d_u.d_child.prev);
+ +      __DbgPrint("   d_subdirs:    0x%p\n" "      next:      0x%p\n"
+ +                 "      prev:      0x%p\n", &d->d_subdirs, d->d_subdirs.next, d->d_subdirs.prev);
+ +      __DbgPrint("   d_alias:      0x%p\n" "      next:      0x%p\n"
+ +                 "      prev:      0x%p\n", &d->d_alias, d->d_alias.next, d->d_alias.prev);
+ +      __DbgPrint("   d_time:       0x%lx\n", d->d_time);
+ +      __DbgPrint("   d_op:         0x%p\n", d->d_op);
+ +      __DbgPrint("   d_sb:         0x%p\n", d->d_sb);
+ +      __DbgPrint("   d_flags:      0x%x\n", d->d_flags);
+ +      __DbgPrint("   d_fsdata:     0x%p\n", d->d_fsdata);
+ +      __DbgPrint("   d_parent:     0x%p\n", d->d_parent);
+ +      /* %.*s takes an int precision, so the unsigned length is cast. */
+ +      __DbgPrint("   d_name:       0x%p %.*s\n", &d->d_name, (int)d->d_name.len, d->d_name.name);
+ +      __DbgPrint("      name:      0x%p\n" "      len:       %u\n"
+ +                 "      hash:      0x%x\n", d->d_name.name, d->d_name.len, d->d_name.hash);
+ +      __DbgPrint("   d_hash:       0x%p\n" "      next:      0x%p\n"
+ +                 "      pprev:     0x%p\n", &d->d_hash, d->d_hash.next, d->d_hash.pprev);
+ +}
+ +
+ +/*++======================================================================*/
+ +/*
+ + * Remove the entry called @RemoveName from the root directory's inode cache.
+ + *
+ + * If a dentry of that name is currently cached under novfs_root and it has
+ + * novfs inode data attached, that data's Scope pointer is cleared first.
+ + * Always returns 0.
+ + */
+ +int novfs_remove_from_root(char *RemoveName)
+ +{
+ +      struct inode *root_inode;
+ +      struct dentry *found;
+ +      struct qstr qname;
+ +
+ +      DbgPrint("%s", RemoveName);
+ +      qname.len = strlen(RemoveName);
+ +      qname.name = RemoveName;
+ +      novfs_d_hash(novfs_root, novfs_root->d_inode, &qname);
+ +
+ +      found = d_lookup(novfs_root, &qname);
+ +      if (found != NULL) {
+ +              struct inode *cached = found->d_inode;
+ +
+ +              /* Detach the scope before the cache entry goes away. */
+ +              if (cached != NULL && cached->i_private != NULL)
+ +                      ((struct inode_data *)cached->i_private)->Scope = NULL;
+ +
+ +              /* Drop the reference taken by d_lookup(). */
+ +              dput(found);
+ +      }
+ +
+ +      root_inode = novfs_root->d_inode;
+ +
+ +      novfs_lock_inode_cache(root_inode);
+ +      novfs_remove_inode_entry(root_inode, &qname, 0);
+ +      novfs_unlock_inode_cache(root_inode);
+ +
+ +      return 0;
+ +}
+ +
+ +/*++======================================================================*/
+ +/*
+ + * Make sure the root directory's inode cache has an entry called @AddName.
+ + *
+ + * When the name is not cached yet, a directory entry (mode S_IFDIR | 0700,
+ + * size 0, all timestamps set to the current time) is added under a freshly
+ + * allocated inode number.  Always returns 0.
+ + */
+ +int novfs_add_to_root(char *AddName)
+ +{
+ +      struct novfs_entry_info entry;
+ +      struct inode *root_inode;
+ +      struct qstr qname;
+ +
+ +      DbgPrint("%s", AddName);
+ +      qname.len = strlen(AddName);
+ +      qname.name = AddName;
+ +      novfs_d_hash(novfs_root, novfs_root->d_inode, &qname);
+ +
+ +      root_inode = novfs_root->d_inode;
+ +
+ +      novfs_lock_inode_cache(root_inode);
+ +
+ +      if (!novfs_lookup_inode_cache(root_inode, &qname, 0)) {
+ +              ino_t fresh_ino;
+ +
+ +              entry.mode = S_IFDIR | 0700;
+ +              entry.size = 0;
+ +              entry.atime = entry.ctime = entry.mtime = CURRENT_TIME;
+ +
+ +              fresh_ino = (ino_t) atomic_inc_return(&novfs_Inode_Number);
+ +              novfs_add_inode_entry(root_inode, &qname, fresh_ino, &entry);
+ +      }
+ +
+ +      novfs_unlock_inode_cache(root_inode);
+ +
+ +      return 0;
+ +}
+ +
+ +/*++======================================================================*/
+ +/*
+ + * Create a dentry and inode called @AddName directly under novfs_root,
+ + * unless a dentry of that name is already cached there.
+ + *
+ + * The new inode is a directory (S_IFDIR | 0700) owned by the uid of the
+ + * current scope, and its novfs inode data is flagged USER_INODE.
+ + * Always returns 0, including when an allocation fails.
+ + *
+ + * Fixes over the previous version:
+ + *  - an existing dentry is dumped before its reference is dropped, not
+ + *    after dput(), when it may already have been freed;
+ + *  - the dentry from d_alloc() is released when no inode can be obtained,
+ + *    instead of being leaked.
+ + */
+ +int novfs_Add_to_Root2(char *AddName)
+ +{
+ +      struct dentry *entry;
+ +      struct qstr name;
+ +      struct inode *inode;
+ +      void *scope;
+ +
+ +      DbgPrint("%s", AddName);
+ +      name.len = strlen(AddName);
+ +      name.name = AddName;
+ +
+ +      novfs_d_hash(novfs_root, novfs_root->d_inode, &name);
+ +
+ +      entry = d_lookup(novfs_root, &name);
+ +      DbgPrint("novfs_d_lookup 0x%p", entry);
+ +      if (entry) {
+ +              /* Already present: dump it while the reference is still held. */
+ +              PRINT_DENTRY("novfs_Add_to_Root: Before dput Dentry", entry);
+ +              dput(entry);
+ +              return (0);
+ +      }
+ +
+ +      scope = novfs_scope_lookup();
+ +
+ +      entry = d_alloc(novfs_root, &name);
+ +      DbgPrint("d_alloc 0x%p", entry);
+ +      if (!entry)
+ +              return (0);
+ +
+ +      entry->d_op = &novfs_dentry_operations;
+ +      entry->d_time = jiffies + (novfs_update_timeout * HZ);
+ +      /*
+ +       * entry->d_fsdata is left alone here; the original note says it is
+ +       * handled in novfs_d_add now.
+ +       */
+ +      inode = novfs_get_inode(novfs_root->d_sb, S_IFDIR | 0700, 0, novfs_scope_get_uid(scope), 0, &name);
+ +      DbgPrint("Inode=0x%p", inode);
+ +      if (!inode) {
+ +              /* Nothing to attach: release the dentry allocated above. */
+ +              dput(entry);
+ +              return (0);
+ +      }
+ +
+ +      inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ +      if (!novfs_d_add(novfs_root, entry, inode, 1)) {
+ +              if (inode->i_private) {
+ +                      struct inode_data *n_inode = inode->i_private;
+ +                      n_inode->Flags = USER_INODE;
+ +              }
+ +              PRINT_DENTRY("After novfs_d_add", entry);
+ +      } else {
+ +              dput(entry);
+ +              iput(inode);
+ +      }
+ +
+ +      return (0);
+ +}
+ +
+ +/*
+ + * Build the backslash-separated path of @Dentry at the END of @Buf.
+ + *
+ + * The path is assembled right-to-left while walking d_parent links.  The
+ + * walk stops at the dentry whose parent is the root, and that dentry's own
+ + * name is not emitted, so the result is relative to the first level below
+ + * the root.  For the root itself, or a direct child of it, the result is
+ + * the single string "\".
+ + *
+ + * @Dentry: dentry to describe (must not be NULL).
+ + * @Buf:    scratch buffer of @Buflen bytes.
+ + * @Buflen: size of @Buf in bytes.
+ + *
+ + * Returns a pointer INTO @Buf at the first character of the NUL-terminated
+ + * path, or NULL when the path does not fit.  Callers must free @Buf, not
+ + * the returned pointer.
+ + *
+ + * Fixes over the previous version: a NULL or zero-length buffer is rejected
+ + * instead of being written before its start, and a buffer that runs out
+ + * exactly on a component boundary now yields NULL rather than a silently
+ + * truncated path.
+ + */
+ +char *novfs_dget_path(struct dentry *Dentry, char *Buf, unsigned int Buflen)
+ +{
+ +      char *retval;
+ +      struct dentry *p = Dentry;
+ +
+ +      if (!Buf || !Buflen)
+ +              return (NULL);
+ +
+ +      retval = &Buf[Buflen];
+ +      *(--retval) = '\0';
+ +      Buflen--;
+ +
+ +      if (IS_ROOT(p) || IS_ROOT(p->d_parent)) {
+ +              /* Root or first-level entry: the path is just "\". */
+ +              if (!Buflen)
+ +                      return (NULL);
+ +              *(--retval) = '\\';
+ +      } else {
+ +              while (!IS_ROOT(p) && !IS_ROOT(p->d_parent)) {
+ +                      /* Need room for the name plus its leading '\'. */
+ +                      if (Buflen <= p->d_name.len) {
+ +                              retval = NULL;
+ +                              break;
+ +                      }
+ +                      retval -= p->d_name.len;
+ +                      Buflen -= p->d_name.len;
+ +                      memcpy(retval, p->d_name.name, p->d_name.len);
+ +                      *(--retval) = '\\';
+ +                      Buflen--;
+ +                      p = p->d_parent;
+ +              }
+ +      }
+ +
+ +      if (retval)
+ +              DbgPrint("%s", retval);
+ +      return (retval);
+ +}
+ +
+ +/*
+ + * Bring the parent directory's inode-cache entry for @dentry up to date.
+ + *
+ + * Fast path: if the parent's cache already has an entry for the name and
+ + * either @Flags is non-zero or the entry is younger than
+ + * novfs_update_timeout seconds, return 0 without further work.
+ + *
+ + * Slow path:
+ + *  - parent is the root: the cache is invalidated and rebuilt from the
+ + *    list returned by novfs_get_scopeusers();
+ + *  - otherwise: novfs_get_file_info() is called for the dentry's path.
+ + *    On success the cache entry and the inode are refreshed; on any
+ + *    failure other than -EINTR the cache entry is removed, the inode is
+ + *    marked S_DEAD and the dentry is deleted.
+ + *
+ + * Returns 0 on success (including "entry no longer exists"), -ENOENT when
+ + * @dentry is NULL or its parent has no novfs inode data, -ENOMEM,
+ + * -ENAMETOOLONG, or -EINTR passed through from novfs_get_file_info().
+ + *
+ + * Changes over the previous version: @dentry is tested for NULL before
+ + * IS_ROOT() dereferences it, a dead store before the early return is gone,
+ + * and kfree() is called without redundant NULL guards.
+ + */
+ +int verify_dentry(struct dentry *dentry, int Flags)
+ +{
+ +      int retVal = -ENOENT;
+ +      struct inode *dir;
+ +      struct novfs_entry_info *info = NULL;
+ +      struct inode_data *id;
+ +      struct novfs_schandle session;
+ +      char *path, *list = NULL, *cp;
+ +      ino_t ino = 0;
+ +      struct qstr name;
+ +      int iLock = 0;
+ +      struct dentry *parent = NULL;
+ +      u64 ctime;
+ +      struct inode *inode;
+ +
+ +      /* IS_ROOT() dereferences its argument, so reject NULL first. */
+ +      if (!dentry)
+ +              return (retVal);
+ +
+ +      if (IS_ROOT(dentry)) {
+ +              DbgPrint("Root entry");
+ +              return (0);
+ +      }
+ +
+ +      if (dentry->d_parent && (dir = dentry->d_parent->d_inode) && (id = dir->i_private)) {
+ +              parent = dget_parent(dentry);
+ +
+ +              info = kmalloc(sizeof(struct novfs_entry_info) + PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +
+ +              if (info) {
+ +                      if (novfs_lock_inode_cache(dir)) {
+ +                              name.len = dentry->d_name.len;
+ +                              name.name = dentry->d_name.name;
+ +                              name.hash = novfs_internal_hash(&name);
+ +                              if (!novfs_get_entry_time(dir, &name, &ino, info, &ctime)) {
+ +                                      inode = dentry->d_inode;
+ +                                      if (inode && inode->i_private &&
+ +                                          ((inode->i_size != info->size) || (inode->i_mtime.tv_sec != info->mtime.tv_sec)
+ +                                           || (inode->i_mtime.tv_nsec != info->mtime.tv_nsec))) {
+ +                                              /*
+ +                                               * Values don't match so update.
+ +                                               */
+ +                                              struct inode_data *n_inode = inode->i_private;
+ +                                              n_inode->Flags |= UPDATE_INODE;
+ +                                      }
+ +
+ +                                      /* ctime becomes the age of the cache entry in jiffies. */
+ +                                      ctime = get_jiffies_64() - ctime;
+ +                                      if (Flags || ctime < (u64) (novfs_update_timeout * HZ)) {
+ +                                              /* Entry is fresh enough: done. */
+ +                                              novfs_unlock_inode_cache(dir);
+ +                                              dput(parent);
+ +                                              kfree(info);
+ +                                              return (0);
+ +                                      }
+ +                              }
+ +                              novfs_unlock_inode_cache(dir);
+ +                      }
+ +
+ +                      if (IS_ROOT(dentry->d_parent)) {
+ +                              session = novfs_scope_get_sessionId(novfs_get_scope_from_name(&dentry->d_name));
+ +                      } else
+ +                              session = novfs_scope_get_sessionId(id->Scope);
+ +
+ +                      if (!SC_PRESENT(session)) {
+ +                              id->Scope = novfs_get_scope(dentry);
+ +                              session = novfs_scope_get_sessionId(id->Scope);
+ +                      }
+ +
+ +                      ino = 0;
+ +                      retVal = 0;
+ +
+ +                      if (IS_ROOT(dentry->d_parent)) {
+ +                              DbgPrint("parent is Root directory");
+ +                              list = novfs_get_scopeusers();
+ +
+ +                              iLock = novfs_lock_inode_cache(dir);
+ +                              novfs_invalidate_inode_cache(dir);
+ +
+ +                              if (list) {
+ +                                      /* list is walked as consecutive NUL-terminated names ending at an empty one. */
+ +                                      cp = list;
+ +                                      while (*cp) {
+ +                                              name.name = cp;
+ +                                              name.len = strlen(cp);
+ +                                              name.hash = novfs_internal_hash(&name);
+ +                                              cp += (name.len + 1);
+ +                                              ino = 0;
+ +                                              if (novfs_get_entry(dir, &name, &ino, info)) {
+ +                                                      info->mode = S_IFDIR | 0700;
+ +                                                      info->size = 0;
+ +                                                      info->atime = info->ctime = info->mtime = CURRENT_TIME;
+ +                                                      ino = (ino_t) atomic_inc_return(&novfs_Inode_Number);
+ +                                                      novfs_add_inode_entry(dir, &name, ino, info);
+ +                                              }
+ +                                      }
+ +                              }
+ +                              novfs_free_invalid_entries(dir);
+ +                      } else {
+ +
+ +                              path = novfs_dget_path(dentry, info->name, PATH_LENGTH_BUFFER);
+ +                              if (path) {
+ +                                      if (dentry->d_name.len <= NW_MAX_PATH_LENGTH) {
+ +                                              name.hash = novfs_internal_hash(&dentry->d_name);
+ +                                              name.len = dentry->d_name.len;
+ +                                              name.name = dentry->d_name.name;
+ +
+ +                                              retVal = novfs_get_file_info(path, info, session);
+ +                                              if (0 == retVal) {
+ +                                                      dentry->d_time = jiffies + (novfs_update_timeout * HZ);
+ +                                                      iLock = novfs_lock_inode_cache(dir);
+ +                                                      if (novfs_update_entry(dir, &name, 0, info)) {
+ +                                                              /* Update failed: add the entry instead. */
+ +                                                              if (dentry->d_inode) {
+ +                                                                      ino = dentry->d_inode->i_ino;
+ +                                                              } else {
+ +                                                                      ino = (ino_t) atomic_inc_return(&novfs_Inode_Number);
+ +                                                              }
+ +                                                              novfs_add_inode_entry(dir, &name, ino, info);
+ +                                                      }
+ +                                                      if (dentry->d_inode) {
+ +                                                              update_inode(dentry->d_inode, info);
+ +                                                              /*
+ +                                                               * NOTE(review): this clears UPDATE_INODE on the
+ +                                                               * PARENT's inode data (id), while the flag is set
+ +                                                               * above on the dentry's own inode data - confirm
+ +                                                               * which one is intended.
+ +                                                               */
+ +                                                              id->Flags &= ~UPDATE_INODE;
+ +
+ +                                                              dentry->d_inode->i_flags &= ~S_DEAD;
+ +                                                              if (dentry->d_inode->i_private) {
+ +                                                                      ((struct inode_data *)dentry->d_inode->i_private)->Scope =
+ +                                                                          id->Scope;
+ +                                                              }
+ +                                                      }
+ +                                              } else if (-EINTR != retVal) {
+ +                                                      /* Lookup failed: treat the object as gone. */
+ +                                                      retVal = 0;
+ +                                                      iLock = novfs_lock_inode_cache(dir);
+ +                                                      novfs_remove_inode_entry(dir, &name, 0);
+ +                                                      if (dentry->d_inode && !(dentry->d_inode->i_flags & S_DEAD)) {
+ +                                                              dentry->d_inode->i_flags |= S_DEAD;
+ +                                                              dentry->d_inode->i_size = 0;
+ +                                                              dentry->d_inode->i_atime.tv_sec =
+ +                                                                  dentry->d_inode->i_atime.tv_nsec =
+ +                                                                  dentry->d_inode->i_ctime.tv_sec =
+ +                                                                  dentry->d_inode->i_ctime.tv_nsec =
+ +                                                                  dentry->d_inode->i_mtime.tv_sec =
+ +                                                                  dentry->d_inode->i_mtime.tv_nsec = 0;
+ +                                                              dentry->d_inode->i_blocks = 0;
+ +                                                              d_delete(dentry);       /* Remove from cache */
+ +                                                      }
+ +                                              }
+ +                                      } else {
+ +                                              retVal = -ENAMETOOLONG;
+ +                                      }
+ +                              }
+ +                      }
+ +              } else {
+ +                      retVal = -ENOMEM;
+ +              }
+ +              if (iLock) {
+ +                      novfs_unlock_inode_cache(dir);
+ +              }
+ +              dput(parent);
+ +      }
+ +
+ +      /* kfree(NULL) is a no-op, so no guards are needed. */
+ +      kfree(list);
+ +      kfree(info);
+ +
+ +      DbgPrint("return=0x%x", retVal);
+ +
+ +      return (retVal);
+ +}
+ +
+ +/*
+ + * Attach inode @i to dentry @d and stamp the dentry's expiry time.
+ + *
+ + * The inode's novfs data inherits the Scope of @Parent's inode when one is
+ + * set; otherwise the scope is obtained with novfs_get_scope(@d).
+ + *
+ + * @Parent: parent dentry, may be NULL.
+ + * @d:      dentry to populate.
+ + * @i:      inode to attach.
+ + * @a:      non-zero to use d_add(), zero to use d_instantiate().
+ + *
+ + * Always returns 0.
+ + *
+ + * Fixes over the previous version: the Scope is stored only when @i has
+ + * inode data attached, because the visible caller treats i_private as
+ + * possibly NULL and the old code dereferenced it unconditionally.  i_ino
+ + * is also logged with %lu rather than %d.
+ + */
+ +static int novfs_d_add(struct dentry *Parent, struct dentry *d, struct inode *i, int a)
+ +{
+ +      void *scope;
+ +      struct inode_data *id = NULL;
+ +      char *path, *buf;
+ +
+ +      /* Debug trace only; skipped silently when memory is short. */
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (buf) {
+ +              path = novfs_dget_path(d, buf, PATH_LENGTH_BUFFER);
+ +              if (path) {
+ +                      DbgPrint("inode=0x%p ino=%lu path %s", i, i ? i->i_ino : 0UL, path);
+ +              }
+ +              kfree(buf);
+ +      }
+ +
+ +      if (Parent && Parent->d_inode && Parent->d_inode->i_private) {
+ +              id = (struct inode_data *)Parent->d_inode->i_private;
+ +      }
+ +
+ +      if (id && id->Scope) {
+ +              scope = id->Scope;
+ +      } else {
+ +              scope = novfs_get_scope(d);
+ +      }
+ +
+ +      if (i && i->i_private)
+ +              ((struct inode_data *)i->i_private)->Scope = scope;
+ +
+ +      d->d_time = jiffies + (novfs_update_timeout * HZ);
+ +      if (a) {
+ +              d_add(d, i);
+ +      } else {
+ +              d_instantiate(d, i);
+ +      }
+ +
+ +      return (0);
+ +}
+ +
+ +/*
+ + * Decide whether a cached dentry can still be trusted.
+ + *
+ + * Returns 1 when the dentry is valid and 0 when it is not.  The root is
+ + * always valid.  Any other dentry is valid while its d_time deadline has
+ + * not passed; after that it is re-checked with verify_dentry() and counts
+ + * as valid only if the parent's inode cache still holds the name.
+ + *
+ + * Fixes over the previous version: the deadline test uses time_after() so
+ + * it survives jiffies wrap-around (a raw "jiffies > d_time" can report a
+ + * stale dentry as valid for a long time after a wrap).  d_time == 0 is
+ + * written elsewhere in this file as an "already expired" marker, so it is
+ + * tested explicitly to keep that meaning.  The empty "entry has become
+ + * invalid" branch and an unused local are removed.
+ + *
+ + * NOTE(review): this path can sleep (GFP_KERNEL allocation and cache lock
+ + * inside verify_dentry()).  If the target kernel calls d_revalidate in
+ + * RCU-walk mode, an early -ECHILD return may be required - confirm against
+ + * the kernel version this patch targets.
+ + */
+ +int novfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+ +{
+ +      int retCode = 0;
+ +      struct inode *dir;
+ +      struct qstr name;
+ +
+ +      __DbgPrint("%s: 0x%p %.*s\n"
+ +                 "   d_count: %d\n"
+ +                 "   d_inode: 0x%p\n", __func__,
+ +                 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_count, dentry->d_inode);
+ +
+ +      if (IS_ROOT(dentry)) {
+ +              retCode = 1;
+ +      } else if (dentry->d_inode && dentry->d_parent && (dir = dentry->d_parent->d_inode) && dir->i_private) {
+ +              /*
+ +               * Check timer to see if in valid time limit
+ +               */
+ +              if (dentry->d_time != 0 && !time_after(jiffies, dentry->d_time)) {
+ +                      retCode = 1;
+ +              } else {
+ +                      /*
+ +                       * Revalidate entry
+ +                       */
+ +                      name.len = dentry->d_name.len;
+ +                      name.name = dentry->d_name.name;
+ +                      name.hash = novfs_internal_hash(&dentry->d_name);
+ +                      dentry->d_time = 0;
+ +
+ +                      if (0 == verify_dentry(dentry, 0)) {
+ +                              if (novfs_lock_inode_cache(dir)) {
+ +                                      if (novfs_lookup_inode_cache(dir, &name, 0)) {
+ +                                              dentry->d_time = jiffies + (novfs_update_timeout * HZ);
+ +                                              retCode = 1;
+ +                                      }
+ +                                      novfs_unlock_inode_cache(dir);
+ +                              }
+ +                      }
+ +              }
+ +      }
+ +
+ +      DbgPrint("return 0x%x %.*s", retCode, dentry->d_name.len, dentry->d_name.name);
+ +
+ +      return (retCode);
+ +}
+ +
+ +/*
+ + * Case-insensitive name hash: feeds the lower-cased bytes of @name, in
+ + * order, through partial_name_hash() starting from 0 and returns the
+ + * accumulated value.  The caller stores the result in qstr.hash.
+ + */
+ +static unsigned long novfs_internal_hash(struct qstr *name)
+ +{
+ +      unsigned char *cur = (unsigned char *)name->name;
+ +      unsigned int remaining;
+ +      unsigned long acc = 0;
+ +
+ +      for (remaining = name->len; remaining != 0; remaining--, cur++)
+ +              acc = partial_name_hash(tolower(*cur), acc);
+ +
+ +      return acc;
+ +}
+ +
+ +/*
+ + * Store the case-insensitive novfs hash of @name in name->hash.
+ + * @dentry and @inode are not used.  Always returns 0.
+ + */
+ +int novfs_d_hash(const struct dentry *dentry, const struct inode *inode,
+ +               struct qstr *name)
+ +{
+ +      unsigned long hashed;
+ +
+ +      DbgPrint("%.*s", name->len, name->name);
+ +
+ +      hashed = novfs_internal_hash(name);
+ +      name->hash = hashed;
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Case-insensitive comparison of the @len bytes at @str1 with qstr @s2.
+ + *
+ + * Returns 0 when both have the same non-zero length and every byte pair
+ + * matches after tolower(); returns 1 otherwise, including when @len is 0.
+ + */
+ +static int novfs_d_strcmp(const char *str1, unsigned int len,
+ +                        const struct qstr *s2)
+ +{
+ +      const unsigned char *rhs = s2->name;
+ +      int differ = 1;
+ +
+ +      DbgPrint("s1=%.*s s2=%.*s", len, str1, s2->len, s2->name);
+ +
+ +      if (len != 0 && len == s2->len) {
+ +              unsigned int idx;
+ +
+ +              differ = 0;
+ +              for (idx = 0; idx < len; idx++) {
+ +                      /* Identical bytes need no case folding. */
+ +                      if (str1[idx] == rhs[idx])
+ +                              continue;
+ +                      if (tolower(str1[idx]) != tolower(rhs[idx])) {
+ +                              differ = 1;
+ +                              break;
+ +                      }
+ +              }
+ +      }
+ +
+ +      DbgPrint("retCode=0x%x", differ);
+ +      return differ;
+ +}
+ +
+ +/*
+ + * Case-insensitive name comparison: compares the @len bytes at @s1 with
+ + * qstr @s2 through novfs_d_strcmp().  The dentry and inode arguments are
+ + * not used.  Returns 0 on a match and 1 otherwise.
+ + */
+ +int novfs_d_compare(const struct dentry *parent,
+ +                  const struct inode *parent_inode,
+ +                  const struct dentry *dentry, const struct inode *inode,
+ +                  unsigned int len, const char *s1, const struct qstr *s2)
+ +{
+ +      const int mismatch = novfs_d_strcmp(s1, len, s2);
+ +
+ +      DbgPrint("retCode=0x%x", mismatch);
+ +      return mismatch;
+ +}
+ +
+ +/*
+ + * Returns 1 when @dentry has an inode flagged S_DEAD and 0 otherwise.
+ + * In both cases d_time is reset to 0.
+ + */
+ +int novfs_d_delete(struct dentry *dentry)
+ +{
+ +      int is_dead;
+ +
+ +      DbgPrint("0x%p %.*s; d_count: %d; d_inode: 0x%p",
+ +               dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_count, dentry->d_inode);
+ +
+ +      is_dead = (dentry->d_inode != NULL) && ((dentry->d_inode->i_flags & S_DEAD) != 0);
+ +
+ +      dentry->d_time = 0;
+ +
+ +      return is_dead;
+ +}
+ +
+ +void novfs_d_release(struct dentry *dentry) /* only logs the dentry pointer and name; nothing is freed here */
+ +{
+ +      DbgPrint("0x%p %.*s", dentry, dentry->d_name.len, dentry->d_name.name);
+ +}
+ +
+ +/*
+ + * Log @inode and @dentry, then drop one reference on @inode with iput().
+ + *
+ + * i_ino and i_state are unsigned long, so they are logged with %lu; the
+ + * previous %d did not match the argument type.
+ + */
+ +void novfs_d_iput(struct dentry *dentry, struct inode *inode)
+ +{
+ +      DbgPrint("Inode=0x%p Ino=%lu Dentry=0x%p i_state=%lu Name=%.*s",
+ +               inode, inode->i_ino, dentry, inode->i_state, dentry->d_name.len, dentry->d_name.name);
+ +
+ +      iput(inode);
+ +}
+ +
+ +/*
+ + * Open handler for directories: allocates the per-open struct file_private
+ + * and stores it in file->private_data.
+ + *
+ + * Returns 0 on success or -ENOMEM when the private state cannot be
+ + * allocated.  The previous version wrote through the kmalloc() result
+ + * without checking it.
+ + */
+ +int novfs_dir_open(struct inode *dir, struct file *file)
+ +{
+ +      char *path, *buf;
+ +      struct file_private *file_private;
+ +
+ +      DbgPrint("Inode 0x%p %lu Name %.*s", dir, dir->i_ino, file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +
+ +      /* Debug trace of the path only; failure here is not an error. */
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (buf) {
+ +              path = novfs_dget_path(file->f_dentry, buf, PATH_LENGTH_BUFFER);
+ +              if (path) {
+ +                      DbgPrint("path %s", path);
+ +              }
+ +              kfree(buf);
+ +      }
+ +
+ +      file_private = kmalloc(sizeof(*file_private), GFP_KERNEL);
+ +      if (!file_private)
+ +              return (-ENOMEM);
+ +
+ +      file_private->listedall = 0;
+ +      file_private->enumHandle = NULL;
+ +
+ +      file->private_data = file_private;
+ +
+ +      return (0);
+ +}
+ +
+ +/*
+ + * Release handler for directories: ends any directory enumeration still
+ + * open on this file, then frees the per-open state.  Always returns 0.
+ + */
+ +int novfs_dir_release(struct inode *dir, struct file *file)
+ +{
+ +      struct file_private *priv = file->private_data;
+ +      struct inode *inode = file->f_dentry->d_inode;
+ +      struct novfs_schandle sessionId;
+ +
+ +      DbgPrint("Inode 0x%p %d Name %.*s", dir, dir->i_ino, file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +
+ +      if (priv == NULL)
+ +              return 0;
+ +
+ +      /* NULL and (void *)-1 both mean there is no enumeration to end. */
+ +      if (priv->enumHandle != NULL && priv->enumHandle != (void *)-1) {
+ +              sessionId = novfs_scope_get_sessionId(((struct inode_data *)inode->i_private)->Scope);
+ +              if (SC_PRESENT(sessionId) == 0) {
+ +                      /* No session on the cached scope: look the scope up again. */
+ +                      ((struct inode_data *)inode->i_private)->Scope = novfs_get_scope(file->f_dentry);
+ +                      sessionId = novfs_scope_get_sessionId(((struct inode_data *)inode->i_private)->Scope);
+ +              }
+ +              novfs_end_directory_enumerate(priv->enumHandle, sessionId);
+ +      }
+ +
+ +      kfree(priv);
+ +      file->private_data = NULL;
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Seek handler for directories.  Only a rewind is supported: any non-zero
+ + * @offset fails with -ESPIPE, and @origin is not examined.
+ + *
+ + * A rewind resets f_pos, clears the "listed all" flag and ends any
+ + * enumeration in progress, so the next readdir starts from the beginning.
+ + * Returns 0 on success.
+ + *
+ + * Unlike the previous version, a NULL file->private_data is tolerated,
+ + * matching the check in novfs_dir_release().
+ + */
+ +loff_t novfs_dir_lseek(struct file * file, loff_t offset, int origin)
+ +{
+ +      struct file_private *file_private;
+ +
+ +      DbgPrint("offset %lld %d Name %.*s", offset, origin, file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +
+ +      if (0 != offset) {
+ +              return -ESPIPE;
+ +      }
+ +
+ +      file->f_pos = 0;
+ +
+ +      file_private = (struct file_private *)file->private_data;
+ +      if (!file_private)
+ +              return 0;
+ +
+ +      file_private->listedall = 0;
+ +      if (file_private->enumHandle && (file_private->enumHandle != ((void *)-1))) {
+ +              struct novfs_schandle sessionId;
+ +              struct inode *inode = file->f_dentry->d_inode;
+ +
+ +              sessionId = novfs_scope_get_sessionId(((struct inode_data *)inode->i_private)->Scope);
+ +              if (SC_PRESENT(sessionId) == 0) {
+ +                      /* No session on the cached scope: look the scope up again. */
+ +                      ((struct inode_data *)inode->i_private)->Scope = novfs_get_scope(file->f_dentry);
+ +                      sessionId = novfs_scope_get_sessionId(((struct inode_data *)inode->i_private)->Scope);
+ +              }
+ +              novfs_end_directory_enumerate(file_private->enumHandle, sessionId);
+ +      }
+ +      file_private->enumHandle = NULL;
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * read() handler for directories: logs the request and passes it straight
+ + * to generic_read_dir(), returning whatever that returns.
+ + */
+ +ssize_t novfs_dir_read(struct file * file, char *buf, size_t len, loff_t * off)
+ +{
+ +      ssize_t result;
+ +
+ +      DbgPrint("%lld %d Name %.*s", *off, len, file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +
+ +      result = generic_read_dir(file, buf, len, off);
+ +      return result;
+ +}
+ +
+ +/*
+ + * Write the fields of @info to the debug log.  A NULL @info is reported
+ + * and otherwise ignored.  The name is copied into a local 512-byte buffer
+ + * (at most 511 bytes) so it can be printed NUL-terminated.
+ + */
+ +static void novfs_Dump_Info(struct novfs_entry_info *info)
+ +{
+ +      char namebuf[512];
+ +      char atime_buf[32], mtime_buf[32], ctime_buf[32];
+ +      int copy_len;
+ +
+ +      if (info == NULL) {
+ +              DbgPrint("Dump_Info info == NULL");
+ +              return;
+ +      }
+ +
+ +      /* Clamp so the terminator always fits in namebuf. */
+ +      copy_len = (info->namelength >= 512) ? 511 : info->namelength;
+ +
+ +      memcpy(namebuf, info->name, copy_len);
+ +      namebuf[copy_len] = '\0';
+ +
+ +      ctime_r(&info->atime.tv_sec, atime_buf);
+ +      ctime_r(&info->mtime.tv_sec, mtime_buf);
+ +      ctime_r(&info->ctime.tv_sec, ctime_buf);
+ +
+ +      DbgPrint("type = %i", info->type);
+ +      DbgPrint("mode = %x", info->mode);
+ +      DbgPrint("uid = %d", info->uid);
+ +      DbgPrint("gid = %d", info->gid);
+ +      DbgPrint("size = %i", info->size);
+ +      DbgPrint("atime = %s", atime_buf);
+ +      DbgPrint("mtime = %s", mtime_buf);
+ +      DbgPrint("ctime = %s", ctime_buf);
+ +      DbgPrint("namelength = %i", info->namelength);
+ +      DbgPrint("name = %s", namebuf);
+ +}
+ +
+ +/*
+ + * Emit one directory entry per name in @list through @filldir.
+ + *
+ + * @list is walked as consecutive NUL-terminated names ending at an empty
+ + * name.  Each name is reported as a directory (S_IFDIR | 0700) and
+ + * file->f_pos is advanced by one per entry.  @type and @SessionId are not
+ + * used.  A NULL @list, or failure to allocate the scratch entry, produces
+ + * no entries.
+ + *
+ + * Fixes over the previous version:
+ + *  - the scratch entry is allocated once, checked for NULL and zeroed, so
+ + *    an allocation failure no longer crashes and novfs_Dump_Info() no
+ + *    longer prints uninitialized fields;
+ + *  - names that do not fit the PATH_LENGTH_BUFFER bytes reserved behind the
+ + *    entry are skipped instead of overflowing it;
+ + *  - the directory-path buffer is gone: its result was never used, and
+ + *    building it involved an overlapping strcpy() and a strlen() on
+ + *    uninitialized memory when novfs_dget_path() failed.
+ + */
+ +void processList(struct file *file, void *dirent, filldir_t filldir, char *list, int type, struct novfs_schandle SessionId)
+ +{
+ +      char *cp;
+ +      struct qstr name;
+ +      struct novfs_entry_info *pinfo;
+ +
+ +      if (!list)
+ +              return;
+ +
+ +      pinfo = kzalloc(sizeof(struct novfs_entry_info) + PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!pinfo)
+ +              return;
+ +
+ +      for (cp = list; *cp; cp += (name.len + 1)) {
+ +              name.name = cp;
+ +              DbgPrint("name.name = %s", name.name);
+ +              name.len = strlen(cp);
+ +              name.hash = novfs_internal_hash(&name);
+ +
+ +              /* Name plus terminator must fit behind the entry header. */
+ +              if (name.len >= PATH_LENGTH_BUFFER)
+ +                      continue;
+ +
+ +              pinfo->mode = S_IFDIR | 0700;
+ +              pinfo->size = 0;
+ +              pinfo->atime = pinfo->ctime = pinfo->mtime = CURRENT_TIME;
+ +              memcpy(pinfo->name, cp, name.len + 1);
+ +              pinfo->namelength = name.len;
+ +
+ +              novfs_Dump_Info(pinfo);
+ +
+ +              filldir(dirent, pinfo->name, pinfo->namelength, file->f_pos, file->f_pos, pinfo->mode >> 12);
+ +              file->f_pos += 1;
+ +      }
+ +
+ +      kfree(pinfo);
+ +}
+ +
+ +/*
+ + * Fetch one batch of directory entries for @file with
+ + * novfs_get_dir_listex() and pass each of them to @filldir, advancing
+ + * file->f_pos by one per entry.
+ + *
+ + * Returns 0 when a batch was processed, -ENOMEM when the path buffer cannot
+ + * be allocated or the path cannot be built, and -1 when the lookup fails or
+ + * reports a count of 0 or -1.
+ + */
+ +int processEntries(struct file *file, void *dirent, filldir_t filldir, void **enumHandle, struct novfs_schandle sessionId)
+ +{
+ +      struct novfs_entry_info *cursor = NULL;
+ +      struct novfs_entry_info *batch;
+ +      unsigned char *pathbuf, *dirpath;
+ +      int remaining = 0;
+ +      int rc;
+ +
+ +      pathbuf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (pathbuf == NULL)
+ +              return -ENOMEM;
+ +
+ +      dirpath = novfs_dget_path(file->f_dentry, pathbuf, PATH_LENGTH_BUFFER);
+ +      if (dirpath == NULL) {
+ +              kfree(pathbuf);
+ +              return -ENOMEM;
+ +      }
+ +
+ +      rc = novfs_get_dir_listex(dirpath, enumHandle, &remaining, &cursor, sessionId);
+ +      /* Remember the start of the batch; cursor moves during the walk. */
+ +      batch = cursor;
+ +
+ +      if (rc != 0 || remaining == 0 || remaining == -1) {
+ +              rc = -1;
+ +              goto out;
+ +      }
+ +
+ +      /* Each record is followed directly by the next, after its name bytes. */
+ +      for (; cursor != NULL && remaining != 0; remaining--) {
+ +              filldir(dirent, cursor->name, cursor->namelength, file->f_pos, file->f_pos, cursor->mode >> 12);
+ +              file->f_pos += 1;
+ +
+ +              cursor = (struct novfs_entry_info *)(cursor->name + cursor->namelength);
+ +      }
+ +      rc = 0;
+ +
+ +out:
+ +      kfree(batch);
+ +      kfree(pathbuf);
+ +      return rc;
+ +}
+ +
+ +int novfs_dir_readdir(struct file *file, void *dirent, filldir_t filldir)
+ +{     /*
+ +       * readdir handler for novfs directories.
+ +       *
+ +       * Each call emits at most one batch through filldir():
+ +       *   - f_pos == 0: the "." entry, then returns 1;
+ +       *   - f_pos == 1: the ".." entry, then returns 1;
+ +       *   - afterwards, if file_private->listedall is set, returns 0;
+ +       *   - otherwise either a scope-user / server / volume list (when the
+ +       *     dentry, its parent or its grandparent is a root dentry) or one
+ +       *     processEntries() batch, then returns 1.
+ +       *
+ +       * file_private->listedall is set once a list has been processed or
+ +       * processEntries() returned non-zero.  Unless SKIP_CROSSOVER_HACK is
+ +       * defined, the "crossover" hack below can also set it early and adds
+ +       * two synthetic entries for callers whose comm ends in ".EXE".
+ +       */
+ +      unsigned char *list = NULL;
+ +      int status = 0;         //-ENOMEM;
+ +      struct inode *inode = file->f_dentry->d_inode;
+ +      struct novfs_schandle sessionId;
+ +      uid_t uid;      /* assigned below but never read in this function */
+ +      int type = 0;
+ +      struct file_private *file_private = NULL;
+ +      int lComm;
+ +
+ +      file_private = (struct file_private *)file->private_data;
+ +      DbgPrint("Name %.*s", file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +
+ +      //printk("<1> file = %.*s\n", file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +
+ +// Use this hack by default
+ +#ifndef SKIP_CROSSOVER_HACK
+ +      // Hack for crossover - begin
+ +      mutex_lock(&TimeDir_Lock);      /* held while inHAX / inHAXTime are read and updated */
+ +      if ((file->f_dentry->d_name.len == 7) &&
+ +          ((0 == strncmp(file->f_dentry->d_name.name, " !xover", 7)) ||
+ +           (0 == strncmp(file->f_dentry->d_name.name, "z!xover", 7)))) {
+ +              //printk("<1> xoverhack: we are in xoverHack\n");
+ +
+ +              inHAX = 1;
+ +              inHAXTime = get_nanosecond_time();
+ +              //up( &TimeDir_Lock );
+ +              //return 0;
+ +              file_private->listedall = 1;    /* a hack directory is reported as having no further entries */
+ +      } else {
+ +              if (inHAX) {
+ +                      if (get_nanosecond_time() - inHAXTime > 100 * 1000 * 1000) {    /* 100,000,000 time units since the last hit; presumably ns (100 ms) - confirm */
+ +                              //printk("<1> xoverhack: it was long, long, long ago...\n");
+ +                              inHAX = 0;
+ +                      } else {
+ +                              //printk("<1> xoverhack: word gotcha in xoverHack...\n");
+ +                              inHAXTime = get_nanosecond_time();
+ +                              //up( &TimeDir_Lock );
+ +                              //return 0;
+ +                              file_private->listedall = 1;    /* still inside the hack window: suppress the listing of this directory too */
+ +                      }
+ +              }
+ +      }
+ +
+ +      mutex_unlock(&TimeDir_Lock);
+ +      // Hack for crossover - end
+ +#endif
+ +
+ +      if (file->f_pos == 0) {
+ +              if (filldir(dirent, ".", 1, file->f_pos, inode->i_ino, DT_DIR) < 0)
+ +                      return 1;
+ +              file->f_pos++;
+ +              return 1;
+ +      }
+ +
+ +      if (file->f_pos == 1) {
+ +              if (filldir(dirent, "..", 2, file->f_pos, file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0)
+ +                      return 1;
+ +              file->f_pos++;
+ +              return 1;
+ +      }
+ +
+ +      if (file_private->listedall != 0) {
+ +              return 0;
+ +      }
+ +
+ +      inode = file->f_dentry->d_inode;
+ +      if (inode && inode->i_private) {
+ +              sessionId = novfs_scope_get_sessionId(((struct inode_data *)inode->i_private)->Scope);
+ +              if (0 == SC_PRESENT(sessionId)) {       /* no session on the cached scope: look the scope up again and retry */
+ +                      ((struct inode_data *)inode->i_private)->Scope = novfs_get_scope(file->f_dentry);
+ +                      sessionId = novfs_scope_get_sessionId(((struct inode_data *)inode->i_private)->Scope);
+ +              }
+ +              uid = novfs_scope_get_uid(((struct inode_data *)inode->i_private)->Scope);
+ +      } else {
+ +              SC_INITIALIZE(sessionId);
+ +              uid = current_euid();
+ +      }
+ +
+ +      if (IS_ROOT(file->f_dentry) ||  // Root
+ +          IS_ROOT(file->f_dentry->d_parent) ||        // User
+ +          IS_ROOT(file->f_dentry->d_parent->d_parent))        // Server
+ +      {
+ +              if (IS_ROOT(file->f_dentry)) {
+ +                      DbgPrint("Root directory");
+ +                      list = novfs_get_scopeusers();
+ +                      type = USER_LIST;
+ +              } else if (IS_ROOT(file->f_dentry->d_parent)) {
+ +                      DbgPrint("Parent is Root directory");
+ +                      novfs_get_servers(&list, sessionId);
+ +                      type = SERVER_LIST;
+ +              } else {
+ +                      DbgPrint("Parent-Parent is Root directory");
+ +                      novfs_get_vols(&file->f_dentry->d_name, &list, sessionId);
+ +                      type = VOLUME_LIST;
+ +              }
+ +
+ +              processList(file, dirent, filldir, list, type, sessionId);
+ +              file_private->listedall = 1;
+ +      } else {
+ +              status = processEntries(file, dirent, filldir, &file_private->enumHandle, sessionId);
+ +
+ +              if (status != 0) {      /* any non-zero status ends the listing */
+ +                      file_private->listedall = 1;
+ +#ifndef SKIP_CROSSOVER_HACK
+ +                      // Hack for crossover part 2 - begin
+ +                      lComm = strlen(current->comm);
+ +                      if ((lComm > 4)
+ +                          && (0 == strcmp(current->comm + lComm - 4, ".EXE"))) {
+ +                              if (filldir(dirent, " !xover", 7, file->f_pos, inode->i_ino, DT_DIR) < 0)
+ +                                      return 1;
+ +                              if (filldir(dirent, "z!xover", 7, file->f_pos, inode->i_ino, DT_DIR) < 0)
+ +                                      return 1;
+ +                              file->f_pos += 2;       /* both synthetic entries were reported with the same f_pos value */
+ +                      }
+ +                      // Hack for crossover part2 - end
+ +#endif
+ +              }
+ +      }
+ +
+ +      file->private_data = file_private;      /* stores back the pointer read at the top of the function */
+ +      return 1;
+ +}
+ +
+ +/*
+ + * fsync handler for directories: logs the directory name and hands the
+ + * request to generic_file_fsync(), whose result is returned unchanged.
+ + */
+ +int novfs_dir_fsync(struct file *file, int datasync)
+ +{
+ +      struct dentry *dentry = file->f_dentry;
+ +
+ +      DbgPrint("Name %.*s", dentry->d_name.len, dentry->d_name.name);
+ +
+ +      return generic_file_fsync(file, datasync);
+ +}
+ +
+ +/*
+ + * Read handler.
+ + *
+ + * When the page cache is enabled (novfs_page_cache), the file was not opened
+ + * O_DIRECT and the inode has CacheFlag set, the request is served by
+ + * do_sync_read().  Otherwise the data is fetched in chunks through
+ + * novfs_read_file() until "len" bytes were read, the offset reaches the
+ + * inode size, or a chunk fails or comes back empty; *off is then updated.
+ + *
+ + * Returns the do_sync_read() result or the number of bytes read, the
+ + * novfs_read_file() status if a chunk failed, and 0 when the file has no
+ + * dentry, inode or inode data.
+ + */
+ +ssize_t novfs_f_read(struct file * file, char *buf, size_t len, loff_t * off)
+ +{
+ +      size_t chunk, total = 0;
+ +      loff_t pos = *off;
+ +      struct inode *inode;
+ +      struct novfs_schandle session;
+ +      struct inode_data *id;
+ +      int rc;
+ +
+ +      if (!file->f_dentry)
+ +              goto out;
+ +      inode = file->f_dentry->d_inode;
+ +      if (!inode)
+ +              goto out;
+ +      id = (struct inode_data *)inode->i_private;
+ +      if (!id)
+ +              goto out;
+ +
+ +      DbgPrint("(0x%p 0x%p %d %lld %.*s)",
+ +               file->private_data, buf, len, pos, file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +
+ +      /* Cached path: let the generic code do the work. */
+ +      if (novfs_page_cache && !(file->f_flags & O_DIRECT) && id->CacheFlag) {
+ +              total = do_sync_read(file, buf, len, off);
+ +              goto out;
+ +      }
+ +
+ +      /* Uncached path: make sure we have a session, then read chunk by chunk. */
+ +      session = novfs_scope_get_sessionId(id->Scope);
+ +      if (0 == SC_PRESENT(session)) {
+ +              id->Scope = novfs_get_scope(file->f_dentry);
+ +              session = novfs_scope_get_sessionId(id->Scope);
+ +      }
+ +
+ +      while (len > 0 && (pos < i_size_read(inode))) {
+ +              chunk = len;
+ +              rc = novfs_read_file(file->private_data, buf, &chunk, &pos, session);
+ +              if (rc) {
+ +                      /* a failing chunk replaces the byte count with its status */
+ +                      total = rc;
+ +                      break;
+ +              }
+ +              if (!chunk)
+ +                      break;
+ +
+ +              DbgPrint("thisread = 0x%x", chunk);
+ +              len -= chunk;
+ +              buf += chunk;
+ +              pos += chunk;
+ +              total += chunk;
+ +      }
+ +      *off = pos;
+ +
+ +out:
+ +      DbgPrint("return = %d", total);
+ +
+ +      return (total);
+ +}
+ +
+ +/*
+ + * Write handler.
+ + *
+ + * When the page cache is enabled (novfs_page_cache), the file was not opened
+ + * O_DIRECT or O_WRONLY and the inode has CacheFlag set, the request is
+ + * served by do_sync_write().  Otherwise the data is sent in chunks through
+ + * novfs_write_file(); with O_APPEND the starting offset is replaced by the
+ + * current inode size.  After every chunk the inode size/blocks are grown if
+ + * the offset moved past the old size, mtime/atime are stamped and the inode
+ + * is flagged UPDATE_INODE.  *off is updated on the chunked path.
+ + *
+ + * Returns the do_sync_write() result or the number of bytes written, the
+ + * novfs_write_file() status if a chunk failed, and 0 when the file has no
+ + * dentry, inode or inode data.  A chunk that reports success but writes
+ + * nothing ends the loop and the bytes written so far are returned.
+ + */
+ +ssize_t novfs_f_write(struct file * file, const char *buf, size_t len, loff_t * off)
+ +{
+ +      ssize_t thiswrite, totalwrite = 0;
+ +      loff_t offset = *off;
+ +      struct novfs_schandle session;
+ +      struct inode *inode;
+ +      int status;
+ +      struct inode_data *id;
+ +
+ +      if (file->f_dentry && (inode = file->f_dentry->d_inode) && (id = file->f_dentry->d_inode->i_private)) {
+ +              DbgPrint("(0x%p 0x%p 0x%p %d %lld %.*s)",
+ +                       file->private_data, inode, id->FileHandle, len, offset,
+ +                       file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +
+ +              if (novfs_page_cache && !(file->f_flags & O_DIRECT) && id->CacheFlag && !(file->f_flags & O_WRONLY)) {
+ +                      totalwrite = do_sync_write(file, buf, len, off);
+ +              } else {
+ +                      if (file->f_flags & O_APPEND) {
+ +                              offset = i_size_read(inode);
+ +                              DbgPrint("appending to end %lld %.*s",
+ +                                       offset, file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +                      }
+ +
+ +                      session = novfs_scope_get_sessionId(id->Scope);
+ +                      if (0 == SC_PRESENT(session)) {
+ +                              id->Scope = novfs_get_scope(file->f_dentry);
+ +                              session = novfs_scope_get_sessionId(id->Scope);
+ +                      }
+ +
+ +                      while (len > 0) {
+ +                              thiswrite = len;
+ +                              status = novfs_write_file(file->private_data,
+ +                                                        (unsigned char *)buf, &thiswrite, &offset, session);
+ +                              if (status) {
+ +                                      /* a failing chunk replaces the byte count with its status */
+ +                                      totalwrite = status;
+ +                                      break;
+ +                              }
+ +                              if (!thiswrite) {
+ +                                      /*
+ +                                       * No progress but no error either: stop and
+ +                                       * keep the count of bytes already written
+ +                                       * (it used to be reset to 0 here).
+ +                                       */
+ +                                      break;
+ +                              }
+ +                              DbgPrint("thiswrite = 0x%x", thiswrite);
+ +                              len -= thiswrite;
+ +                              buf += thiswrite;
+ +                              offset += thiswrite;
+ +                              totalwrite += thiswrite;
+ +                              if (offset > i_size_read(inode)) {
+ +                                      /* the file grew: publish the new size and block count */
+ +                                      i_size_write(inode, offset);
+ +                                      inode->i_blocks = (offset + inode->i_sb->s_blocksize - 1) >> inode->i_blkbits;
+ +                              }
+ +                              inode->i_mtime = inode->i_atime = CURRENT_TIME;
+ +                              id->Flags |= UPDATE_INODE;
+ +
+ +                      }
+ +                      *off = offset;
+ +              }
+ +      }
+ +      DbgPrint("return = 0x%x", totalwrite);
+ +
+ +      return (totalwrite);
+ +}
+ +
+ +/*
+ + * readdir handler that never lists anything: it ignores all of its
+ + * arguments and unconditionally reports -EISDIR.
+ + */
+ +int novfs_f_readdir(struct file *file, void *data, filldir_t fill)
+ +{
+ +      return (-EISDIR);
+ +}
+ +
+ +/*
+ + * mmap handler: logs the file name, delegates to generic_file_mmap() and
+ + * returns its result unchanged.
+ + */
+ +int novfs_f_mmap(struct file *file, struct vm_area_struct *vma)
+ +{
+ +      int rc;
+ +
+ +      DbgPrint("file=0x%p %.*s", file, file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +
+ +      rc = generic_file_mmap(file, vma);
+ +      DbgPrint("retCode=0x%x", rc);
+ +
+ +      return rc;
+ +}
+ +
+ +/*
+ + * Open handler.
+ + *
+ + * Builds the path of the dentry, opens the file through novfs_open_file()
+ + * (with O_EXCL masked out) and stores the resulting handle in
+ + * file->private_data.  O_TRUNC is cleared from file->f_flags beforehand when
+ + * the file info cannot be fetched or reports size 0 (bug #275366).  After a
+ + * successful open the inode's CacheFlag is refreshed, the inode is updated
+ + * from fresh file info when that is available, and the inode is flagged
+ + * UPDATE_INODE if novfs_get_entry() on the parent directory returns non-zero.
+ + *
+ + * Returns the novfs_open_file() status, -ENOMEM if the scratch buffer
+ + * cannot be allocated, and -ENOENT when the inode carries no private data
+ + * or the path cannot be built.
+ + */
+ +int novfs_f_open(struct inode *inode, struct file *file)
+ +{
+ +      struct novfs_entry_info *info = NULL;
+ +      int retCode = -ENOENT;
+ +      struct novfs_schandle session;
+ +      char *path;
+ +      struct dentry *parent;
+ +      ino_t ino;
+ +      struct inode_data *id;
+ +      int errInfo;
+ +
+ +      DbgPrint("inode=0x%p file=0x%p dentry=0x%p dentry->d_inode=0x%p %.*s",
+ +               inode, file, file->f_dentry, file->f_dentry->d_inode, file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +      if (file->f_dentry) {
+ +              DbgPrint("%.*s f_flags=0%o f_mode=0%o i_mode=0%o",
+ +                       file->f_dentry->d_name.len, file->f_dentry->d_name.name, file->f_flags, file->f_mode, inode->i_mode);
+ +      }
+ +
+ +      if (!inode || !inode->i_private)
+ +              goto out;
+ +
+ +      id = (struct inode_data *)file->f_dentry->d_inode->i_private;
+ +      session = novfs_scope_get_sessionId(id->Scope);
+ +      if (0 == SC_PRESENT(session)) {
+ +              id->Scope = novfs_get_scope(file->f_dentry);
+ +              session = novfs_scope_get_sessionId(id->Scope);
+ +      }
+ +
+ +      /* One allocation holds the entry info and, in info->name, the path buffer. */
+ +      info = kmalloc(sizeof(struct novfs_entry_info) + PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!info) {
+ +              /* report the allocation failure as such instead of -ENOENT */
+ +              retCode = -ENOMEM;
+ +              goto out;
+ +      }
+ +
+ +      path = novfs_dget_path(file->f_dentry, info->name, PATH_LENGTH_BUFFER);
+ +      if (!path)
+ +              goto out_free;
+ +
+ +      if (file->f_flags & O_TRUNC) {
+ +              errInfo = novfs_get_file_info(path, info, session);
+ +
+ +              if (errInfo || info->size == 0) {
+ +                      // clear O_TRUNC flag, bug #275366
+ +                      file->f_flags = file->f_flags & (~O_TRUNC);
+ +              }
+ +      }
+ +
+ +      DbgPrint("%s", path);
+ +      retCode = novfs_open_file(path, file->f_flags & ~O_EXCL, info, &file->private_data, session);
+ +
+ +      DbgPrint("0x%x 0x%p", retCode, file->private_data);
+ +      if (retCode)
+ +              goto out_free;
+ +
+ +      id->CacheFlag = novfs_get_file_cache_flag(path, session);
+ +
+ +      if (!novfs_get_file_info(path, info, session)) {
+ +              update_inode(inode, info);
+ +      }
+ +
+ +      parent = dget_parent(file->f_dentry);
+ +
+ +      if (parent && parent->d_inode) {
+ +              struct inode *dir = parent->d_inode;
+ +
+ +              novfs_lock_inode_cache(dir);
+ +              ino = 0;
+ +              if (novfs_get_entry(dir, &file->f_dentry->d_name, &ino, info)) {
+ +                      ((struct inode_data *)inode->i_private)->Flags |= UPDATE_INODE;
+ +              }
+ +
+ +              novfs_unlock_inode_cache(dir);
+ +      }
+ +      dput(parent);
+ +
+ +out_free:
+ +      kfree(info);
+ +out:
+ +      DbgPrint("retCode=0x%x", retCode);
+ +      return (retCode);
+ +}
+ +
+ +int novfs_flush_mapping(void *Handle, struct address_space *mapping, struct novfs_schandle Session)
+ +{     /*
+ +       * Writes the dirty pages of "mapping" out through novfs_write_page(),
+ +       * one pagevec batch at a time, starting at page index 0.
+ +       *
+ +       * Handle and Session are passed to novfs_write_page() unchanged.
+ +       * The loop ends when a lookup finds no more dirty-tagged pages or
+ +       * when novfs_write_page() returns non-zero.
+ +       *
+ +       * Returns 0, or the first non-zero novfs_write_page() result.
+ +       */
+ +      struct pagevec pagevec;
+ +      unsigned nrpages;
+ +      pgoff_t index = 0;
+ +      int done, rc = 0;
+ +
+ +      pagevec_init(&pagevec, 0);
+ +
+ +      do {
+ +              done = 1;       /* stays 1 only if the lookup below finds nothing */
+ +              nrpages = pagevec_lookup_tag(&pagevec, mapping, &index, PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE);
+ +
+ +              if (nrpages) {
+ +                      struct page *page;
+ +                      int i;
+ +
+ +                      DbgPrint("%u", nrpages);
+ +
+ +                      done = 0;
+ +                      for (i = 0; !rc && (i < nrpages); i++) {        /* a write error stops the batch; remaining pages are left untouched */
+ +                              page = pagevec.pages[i];
+ +
+ +                              DbgPrint("page 0x%p %lu", page, page->index);
+ +
+ +                              lock_page(page);
+ +                              page_cache_get(page);
+ +                              if (page->mapping == mapping) { /* checked under the page lock; pages belonging to another mapping are skipped */
+ +                                      if (clear_page_dirty_for_io(page)) {
+ +                                              rc = novfs_write_page(Handle, page, Session);
+ +                                              if (!rc) {
+ +                                                      //ClearPageDirty(page);
+ +                                                      /* NOTE(review): the dirty tag is cleared without mapping->tree_lock being taken in this function - confirm this is safe */
+ +                                                      radix_tree_tag_clear
+ +                                                          (&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY);
+ +                                              }
+ +                                      }
+ +                              }
+ +
+ +                              page_cache_release(page);
+ +                              unlock_page(page);
+ +                      }
+ +                      pagevec_release(&pagevec);
+ +              }
+ +      } while (!rc && !done);
+ +
+ +      DbgPrint("return %d", rc);
+ +
+ +      return (rc);
+ +}
+ +
+ +int novfs_f_flush(struct file *file, fl_owner_t ownid)
+ +{     /*
+ +       * flush handler.
+ +       *
+ +       * Unless FLUSH is defined at build time, the whole body is compiled
+ +       * out and the function just returns 0.  With FLUSH defined, a file
+ +       * that was not opened O_RDONLY and whose mapping holds pages is
+ +       * written back: through filemap_fdatawrite() when the mapping has a
+ +       * writepage operation, otherwise through novfs_flush_mapping() with
+ +       * the handle stored in file->private_data.
+ +       *
+ +       * Returns 0 or the result of the write-back call.  "ownid" is unused.
+ +       */
+ +
+ +      int rc = 0;
+ +#ifdef FLUSH
+ +      struct inode *inode;
+ +      struct novfs_schandle session;
+ +      struct inode_data *id;
+ +
+ +      DbgPrint("Called from 0x%p", __builtin_return_address(0));
+ +      if (file->f_dentry && (inode = file->f_dentry->d_inode)
+ +          && (id = file->f_dentry->d_inode->i_private)) {
+ +
+ +              if ((file->f_flags & O_ACCMODE) != O_RDONLY) {
+ +                      inode = file->f_dentry->d_inode;
+ +                      DbgPrint("%.*s f_flags=0%o f_mode=0%o i_mode=0%o",
+ +                               file->f_dentry->d_name.len,
+ +                               file->f_dentry->d_name.name, file->f_flags, file->f_mode, inode->i_mode);
+ +
+ +                      session = novfs_scope_get_sessionId(id->Scope);
+ +                      if (0 == SC_PRESENT(session)) { /* no session on the cached scope: look the scope up again and retry */
+ +                              id->Scope = novfs_get_scope(file->f_dentry);
+ +                              session = novfs_scope_get_sessionId(id->Scope);
+ +                      }
+ +
+ +                      if (inode && inode->i_mapping && inode->i_mapping->nrpages) {
+ +
+ +                              DbgPrint("%.*s pages=%lu",
+ +                                       file->f_dentry->d_name.len, file->f_dentry->d_name.name, inode->i_mapping->nrpages);
+ +
+ +                              if (file->f_dentry &&
+ +                                  file->f_dentry->d_inode &&
+ +                                  file->f_dentry->d_inode->i_mapping &&
+ +                                  file->f_dentry->d_inode->i_mapping->a_ops &&
+ +                                  file->f_dentry->d_inode->i_mapping->a_ops->writepage) {
+ +                                      rc = filemap_fdatawrite(file->f_dentry->d_inode->i_mapping);
+ +                              } else {
+ +                                      rc = novfs_flush_mapping(file->private_data, file->f_dentry->d_inode->i_mapping, session);
+ +                              }
+ +                      }
+ +              }
+ +      }
+ +#endif
+ +      return (rc);
+ +}
+ +
+ +/*
+ + * Release handler, run when the file is closed.
+ + *
+ + * For files not opened O_RDONLY whose mapping holds pages, the pages are
+ + * written back first: via filemap_fdatawrite() when the mapping has a
+ + * writepage operation, otherwise via novfs_flush_mapping().  The results of
+ + * those calls are ignored.  The inode's cached pages are then invalidated
+ + * and the handle in file->private_data is closed.
+ + *
+ + * Returns the novfs_close_file() status, or -EACCES when the inode is NULL
+ + * or carries no private data.
+ + */
+ +int novfs_f_release(struct inode *inode, struct file *file)
+ +{
+ +      struct novfs_schandle session;
+ +      struct inode_data *id;
+ +
+ +      DbgPrint("path=%.*s handle=%p", file->f_dentry->d_name.len, file->f_dentry->d_name.name, file->private_data);
+ +
+ +      if (!inode)
+ +              return -EACCES;
+ +      id = inode->i_private;
+ +      if (!id)
+ +              return -EACCES;
+ +
+ +      session = novfs_scope_get_sessionId(id->Scope);
+ +      if (0 == SC_PRESENT(session)) {
+ +              id->Scope = novfs_get_scope(file->f_dentry);
+ +              session = novfs_scope_get_sessionId(id->Scope);
+ +      }
+ +
+ +      if ((file->f_flags & O_ACCMODE) != O_RDONLY) {
+ +              DbgPrint("%.*s f_flags=0%o f_mode=0%o i_mode=0%o",
+ +                       file->f_dentry->d_name.len,
+ +                       file->f_dentry->d_name.name, file->f_flags, file->f_mode, inode->i_mode);
+ +
+ +              if (inode->i_mapping && inode->i_mapping->nrpages) {
+ +                      DbgPrint("%.*s pages=%lu",
+ +                               file->f_dentry->d_name.len, file->f_dentry->d_name.name, inode->i_mapping->nrpages);
+ +
+ +                      /* pick the write-back path depending on whether writepage is provided */
+ +                      if (inode->i_mapping->a_ops && inode->i_mapping->a_ops->writepage)
+ +                              filemap_fdatawrite(file->f_dentry->d_inode->i_mapping);
+ +                      else
+ +                              novfs_flush_mapping(file->private_data, file->f_dentry->d_inode->i_mapping, session);
+ +              }
+ +      }
+ +
+ +      if (file->f_dentry && file->f_dentry->d_inode)
+ +              invalidate_remote_inode(file->f_dentry->d_inode);
+ +
+ +      return novfs_close_file(file->private_data, session);
+ +}
+ +
+ +/*
+ + * fsync handler that performs no work: both arguments are ignored and
+ + * success (0) is always reported.
+ + */
+ +int novfs_f_fsync(struct file *file, int datasync)
+ +{
+ +      return (0);
+ +}
+ +
+ +/*
+ + * llseek handler: logs the request and delegates to generic_file_llseek().
+ + *
+ + * NOTE(review): generic_file_llseek() yields a loff_t but this function is
+ + * declared to return int, so the result is narrowed on return - confirm
+ + * against the prototype and the file_operations table before relying on
+ + * offsets that do not fit in an int.
+ + */
+ +int novfs_f_llseek(struct file *file, loff_t offset, int origin)
+ +{
+ +      struct dentry *dentry = file->f_dentry;
+ +
+ +      DbgPrint("File=0x%p Name=%.*s offset=%lld origin=%d",
+ +               file, dentry->d_name.len, dentry->d_name.name, offset, origin);
+ +
+ +      return generic_file_llseek(file, offset, origin);
+ +}
+ +
+ +/*++======================================================================*/
+ +int novfs_f_lock(struct file *file, int cmd, struct file_lock *lock)
+ +/*
+ + *  Arguments:
+ + *      "file" - pointer to file structure - contains file handle in "file->private_data"
+ + *
+ + *      "cmd" could be F_SETLK, F_SETLKW, F_GETLK (and their *64 variants
+ + *      when F_GETLK64 is defined)
+ + *      F_SETLK/F_SETLKW are for setting/unsetting file lock; both are
+ + *      handled by the same novfs_set_file_lock() call
+ + *      F_GETLK is for getting information about region - is it locked, or not
+ + *
+ + *      "lock" structure - contains "start" and "end" of locking region
+ + *
+ + *  Returns:
+ + *      0 on success
+ + *      -ENOSYS on F_GETLK cmd. It's not implemented.
+ + *      -EINVAL if (lock->fl_start > lock->fl_end)
+ + *      -1 if the file has no dentry, inode or inode data, if "cmd" is not
+ + *         one of the handled commands, or if novfs_set_file_lock() itself
+ + *         returned -1
+ + *      -EAGAIN on all other errors
+ + *  Abstract:
+ + *      Translates the [fl_start, fl_end] range into a start/length pair
+ + *      (length 0 stands for the range 0..OFFSET_MAX) and passes it, with
+ + *      lock->fl_type, to novfs_set_file_lock().
+ + *
+ + *  Notes:
+ + *      "lock->fl_start" and "lock->fl_end" are of type "long long",
+ + *      but xtier functions in novfsd "NCFsdLockFile" and "NCFsdUnlockFile"
+ + *      receive arguments in u64 type.
+ + *
+ + *
+ + *========================================================================*/
+ +{
+ +      int err_code;
+ +
+ +      struct inode *inode;
+ +      struct novfs_schandle session;
+ +      struct inode_data *id;
+ +      loff_t len;
+ +
+ +      DbgPrint("(0x%p): begin in novfs_f_lock 0x%p", __builtin_return_address(0), file->private_data);
+ +      DbgPrint("cmd = %d, F_GETLK = %d, F_SETLK = %d, F_SETLKW = %d", cmd, F_GETLK, F_SETLK, F_SETLKW);
+ +      DbgPrint("lock->fl_start = 0x%llX, lock->fl_end = 0x%llX", lock->fl_start, lock->fl_end);
+ +
+ +      err_code = -1;  /* default result when nothing below assigns one */
+ +      if (lock->fl_start <= lock->fl_end) {
+ +              /* Get len from "start" and "end" */
+ +              len = lock->fl_end - lock->fl_start + 1;
+ +              if ((0 == lock->fl_start) && (OFFSET_MAX == lock->fl_end)) {
+ +                      len = 0;        /* the full range 0..OFFSET_MAX is passed on as length 0 */
+ +              }
+ +
+ +              if (file->f_dentry && (inode = file->f_dentry->d_inode) && (id = (struct inode_data *)inode->i_private)) {
+ +                      DbgPrint("(0x%p 0x%p %.*s)",
+ +                               file->private_data, inode, file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ +
+ +                      session = novfs_scope_get_sessionId(id->Scope);
+ +                      if (0 == SC_PRESENT(session)) {
+ +                              id->Scope = novfs_get_scope(file->f_dentry);
+ +                              session = novfs_scope_get_sessionId(id->Scope);
+ +                      }
+ +
+ +                      /* fl_type = F_RDLCK, F_WRLCK, F_UNLCK */
+ +                      switch (cmd) {
+ +                      case F_SETLK:
+ +#ifdef F_GETLK64
+ +                      case F_SETLK64:
+ +#endif
+ +
+ +                              err_code = novfs_set_file_lock(session, file->private_data, lock->fl_type, lock->fl_start, len);
+ +                              break;
+ +
+ +                      case F_SETLKW:
+ +#ifdef F_GETLK64
+ +                      case F_SETLKW64:
+ +#endif
+ +                              err_code = novfs_set_file_lock(session, file->private_data, lock->fl_type, lock->fl_start, len);
+ +                              break;
+ +
+ +                      case F_GETLK:
+ +#ifdef F_GETLK64
+ +                      case F_GETLK64:
+ +#endif
+ +                              err_code = -ENOSYS;
+ +                              /*
+ +                               * Not implemented. We don't have an appropriate xtier function.
+ +                               * */
+ +                              break;
+ +
+ +                      default:
+ +                              printk("<1> novfs in novfs_f_lock, not implemented cmd = %d\n", cmd);
+ +                              DbgPrint("novfs in novfs_f_lock, not implemented cmd = %d", cmd);
+ +                              break;
+ +                      }
+ +              }
+ +
+ +              DbgPrint("lock->fl_type = %u, err_code 0x%X", lock->fl_type, err_code);
+ +
+ +              /* 0, -1 and -ENOSYS pass through; every other code is folded into -EAGAIN */
+ +              if ((err_code != 0) && (err_code != -1)
+ +                  && (err_code != -ENOSYS)) {
+ +                      err_code = -EAGAIN;
+ +              }
+ +      } else {
+ +              err_code = -EINVAL;
+ +      }
+ +
+ +      return (err_code);
+ +}
+ +
+ +/*++======================================================================
+ + *  novfs_copy_cache_pages
+ + *
+ + *  Copies "bytes_read" bytes from "data" into pages taken one at a time
+ + *  from the tail of the "pages" list, adding each page to the page cache
+ + *  of "mapping" at its own page->index.
+ + *
+ + *  A page that cannot be added to the page cache is released and its
+ + *  PAGE_CACHE_SIZE slice of "data" is skipped.  A final partial page is
+ + *  zero-filled after the copied bytes.  Every filled page is marked
+ + *  up to date, unlocked and added to "plru_pvec"; when pagevec_add()
+ + *  returns 0 the pagevec is handed to __pagevec_lru_add_file().
+ + *
+ + *  The loop stops when all bytes are consumed or the list is empty.
+ + *========================================================================*/
+ +static void novfs_copy_cache_pages(struct address_space *mapping,
+ +                                 struct list_head *pages, int bytes_read, char *data, struct pagevec *plru_pvec)
+ +{
+ +      struct page *page;
+ +      char *target;
+ +
+ +      while (bytes_read > 0) {
+ +              if (list_empty(pages))
+ +                      break;
+ +
+ +              page = list_entry(pages->prev, struct page, lru);       /* last page on the list */
+ +              list_del(&page->lru);
+ +
+ +              if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
+ +                      page_cache_release(page);
+ +                      data += PAGE_CACHE_SIZE;        /* skip the data slice meant for this page */
+ +                      bytes_read -= PAGE_CACHE_SIZE;
+ +                      continue;
+ +              }
+ +
+ +              target = kmap_atomic(page, KM_USER0);
+ +
+ +              if (PAGE_CACHE_SIZE > bytes_read) {
+ +                      memcpy(target, data, bytes_read);
+ +                      /* zero the tail end of this partial page */
+ +                      memset(target + bytes_read, 0, PAGE_CACHE_SIZE - bytes_read);
+ +                      bytes_read = 0;
+ +              } else {
+ +                      memcpy(target, data, PAGE_CACHE_SIZE);
+ +                      bytes_read -= PAGE_CACHE_SIZE;
+ +              }
+ +              kunmap_atomic(target, KM_USER0);
+ +
+ +              flush_dcache_page(page);
+ +              SetPageUptodate(page);
+ +              unlock_page(page);
+ +              if (!pagevec_add(plru_pvec, page))
+ +                      __pagevec_lru_add_file(plru_pvec);
+ +              data += PAGE_CACHE_SIZE;
+ +      }
+ +      return;
+ +}
+ +
+ +/*
+ + * writepage handler: sends one page to the server via novfs_write_pages().
+ + *
+ + * The page is described by the second element of a two-entry data list; the
+ + * first element is left for the reply header.  The byte count passed to
+ + * novfs_write_pages() is cut back when the page extends past the current
+ + * inode size, while the list entry itself keeps the full PAGE_CACHE_SIZE
+ + * length.  On success the page is marked up to date.  The page is unlocked
+ + * and the extra reference taken here is dropped on every path.
+ + *
+ + * Returns the novfs_write_pages() status.  "wbc" is not used.
+ + */
+ +int novfs_a_writepage(struct page *page, struct writeback_control *wbc)
+ +{
+ +      struct inode *inode = page->mapping->host;
+ +      struct inode_data *id = inode->i_private;
+ +      struct novfs_schandle session;
+ +      struct novfs_data_list dlst[2];
+ +      size_t nbytes = PAGE_CACHE_SIZE;
+ +      loff_t start;
+ +      int rc;
+ +
+ +      session = novfs_scope_get_sessionId(id->Scope);
+ +
+ +      page_cache_get(page);
+ +
+ +      start = ((loff_t) page->index << PAGE_CACHE_SHIFT);
+ +
+ +      /*
+ +       * Leave first dlst entry for reply header.
+ +       */
+ +      dlst[1].page = page;
+ +      dlst[1].offset = NULL;
+ +      dlst[1].len = nbytes;
+ +      dlst[1].rwflag = DLREAD;
+ +
+ +      /*
+ +       * Check size so we don't write past end of file.
+ +       */
+ +      if ((start + (loff_t) nbytes) > i_size_read(inode))
+ +              nbytes = (size_t) (i_size_read(inode) - start);
+ +
+ +      rc = novfs_write_pages(id->FileHandle, dlst, 2, nbytes, start, session);
+ +      if (!rc)
+ +              SetPageUptodate(page);
+ +
+ +      unlock_page(page);
+ +      page_cache_release(page);
+ +
+ +      return rc;
+ +}
+ +
+ +int novfs_a_writepages(struct address_space *mapping, struct writeback_control *wbc)
+ +{
+ +      int retCode = 0;
+ +      struct inode *inode = mapping->host;
+ +      struct novfs_schandle session;
+ +      void *fh = NULL;
+ +      struct inode_data *id = NULL;
+ +
+ +      int max_page_lookup = novfs_max_iosize / PAGE_CACHE_SIZE;
+ +
+ +      struct novfs_data_list *dlist, *dlptr;
+ +      struct page **pages;
+ +
+ +      int dlist_idx, i = 0;
+ +      pgoff_t index, next_index = 0;
+ +      loff_t pos = 0;
+ +      size_t tsize;
+ +
+ +      SC_INITIALIZE(session);
+ +      DbgPrint("inode=0x%p mapping=0x%p wbc=0x%p nr_to_write=%d", inode, mapping, wbc, wbc->nr_to_write);
+ +
+ +      if (inode) {
+ +              DbgPrint("Inode=0x%p Ino=%d Id=0x%p", inode, inode->i_ino, inode->i_private);
+ +
+ +              if (NULL != (id = inode->i_private)) {
+ +                      session = novfs_scope_get_sessionId(((struct inode_data *)inode->i_private)->Scope);
+ +                      fh = ((struct inode_data *)inode->i_private)->FileHandle;
+ +              }
+ +      }
+ +
+ +      dlist = kmalloc(sizeof(struct novfs_data_list) * max_page_lookup, GFP_KERNEL);
+ +      pages = kmalloc(sizeof(struct page *) * max_page_lookup, GFP_KERNEL);
+ +
+ +      if (id)
+ +              DbgPrint("inode=0x%p fh=0x%p dlist=0x%p pages=0x%p %s", inode, fh, dlist, pages, id->Name);
+ +      else
+ +              DbgPrint("inode=0x%p fh=0x%p dlist=0x%p pages=0x%p", inode, fh, dlist, pages);
+ +
+ +      if (dlist && pages) {
+ +              struct backing_dev_info *bdi = mapping->backing_dev_info;
+ +              int done = 0;
+ +              int nr_pages = 0;
+ +              int scanned = 0;
+ +
+ +              if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ +                      wbc->encountered_congestion = 1;
+ +                      return 0;
+ +              }
+ +
+ +              if (wbc->sync_mode == WB_SYNC_NONE) {
+ +                      index = mapping->writeback_index;       /* Start from prev offset */
+ +              } else {
+ +                      index = 0;      /* whole-file sweep */
+ +                      scanned = 1;
+ +              }
+ +
+ +              next_index = index;
+ +
+ +              while (!done && (wbc->nr_to_write > 0)) {
+ +                      dlist_idx = 0;
+ +                      dlptr = &dlist[1];
+ +
+ +                      DbgPrint("nr_pages=%d", nr_pages);
+ +                      if (!nr_pages) {
+ +                              memset(pages, 0, sizeof(struct page *) * max_page_lookup);
+ +
+ +                              spin_lock_irq(&mapping->tree_lock);
+ +
+ +                              /*
+ +                               * Need to ask for one less then max_page_lookup or we
+ +                               * will overflow the request buffer.  This also frees
+ +                               * the first entry for the reply buffer.
+ +                               */
+ +                              nr_pages =
+ +                                  radix_tree_gang_lookup_tag(&mapping->page_tree,
+ +                                                             (void **)pages, index, max_page_lookup - 1, PAGECACHE_TAG_DIRTY);
+ +
+ +                              DbgPrint("2; nr_pages=%d\n", nr_pages);
+ +                              /*
+ +                               * Check to see if there are dirty pages and there is a valid
+ +                               * file handle.
+ +                               */
+ +                              if (nr_pages && !fh) {
+ +                                      set_bit(AS_EIO, &mapping->flags);
+ +                                      done = 1;
+ +                                      DbgPrint("set_bit AS_EIO");
+ +                                      break;
+ +                              }
+ +
+ +                              for (i = 0; i < nr_pages; i++) {
+ +                                      page_cache_get(pages[i]);
+ +                              }
+ +
+ +                              spin_unlock_irq(&mapping->tree_lock);
+ +
+ +                              if (nr_pages) {
+ +                                      index = pages[nr_pages - 1]->index + 1;
+ +                                      pos = (loff_t) pages[0]->index << PAGE_CACHE_SHIFT;
+ +                              }
+ +
+ +                              if (!nr_pages) {
+ +                                      if (scanned) {
+ +                                              index = 0;
+ +                                              scanned = 0;
+ +                                              continue;
+ +                                      }
+ +                                      done = 1;
+ +                              } else {
+ +                                      next_index = pages[0]->index;
+ +                                      i = 0;
+ +                              }
+ +                      } else {
+ +                              if (pages[i]) {
+ +                                      pos = (loff_t) pages[i]->index << PAGE_CACHE_SHIFT;
+ +                              }
+ +                      }
+ +
+ +                      for (; i < nr_pages; i++) {
+ +                              struct page *page = pages[i];
+ +
+ +                              /*
+ +                               * At this point we hold neither mapping->tree_lock nor
+ +                               * lock on the page itself: the page may be truncated or
+ +                               * invalidated (changing page->mapping to NULL), or even
+ +                               * swizzled back from swapper_space to tmpfs file
+ +                               * mapping
+ +                               */
+ +
+ +                              DbgPrint
+ +                                  ("novfs_a_writepages: pos=0x%llx index=%d page->index=%d next_index=%d\n",
+ +                                   pos, index, page->index, next_index);
+ +
+ +                              if (page->index != next_index) {
+ +                                      next_index = page->index;
+ +                                      break;
+ +                              }
+ +                              next_index = page->index + 1;
+ +
+ +                              lock_page(page);
+ +
+ +                              if (wbc->sync_mode != WB_SYNC_NONE)
+ +                                      wait_on_page_writeback(page);
+ +
+ +                              if (page->mapping != mapping || PageWriteback(page)
+ +                                  || !clear_page_dirty_for_io(page)) {
+ +                                      unlock_page(page);
+ +                                      continue;
+ +                              }
+ +
+ +                              dlptr[dlist_idx].page = page;
+ +                              dlptr[dlist_idx].offset = NULL;
+ +                              dlptr[dlist_idx].len = PAGE_CACHE_SIZE;
+ +                              dlptr[dlist_idx].rwflag = DLREAD;
+ +                              dlist_idx++;
+ +                              DbgPrint("Add page=0x%p index=0x%lx", page, page->index);
+ +                      }
+ +
+ +                      DbgPrint("dlist_idx=%d", dlist_idx);
+ +                      if (dlist_idx) {
+ +                              tsize = dlist_idx * PAGE_CACHE_SIZE;
+ +                              /*
+ +                               * Check size so we don't write past the end of the file.
+ +                               */
+ +                              if ((pos + tsize) > i_size_read(inode)) {
+ +                                      tsize = (size_t) (i_size_read(inode) - pos);
+ +                              }
+ +
+ +                              retCode = novfs_write_pages(fh, dlist, dlist_idx + 1, tsize, pos, session);
+ +                              switch (retCode) {
+ +                              case 0:
+ +                                      wbc->nr_to_write -= dlist_idx;
+ +                                      break;
+ +
+ +                              case -ENOSPC:
+ +                                      set_bit(AS_ENOSPC, &mapping->flags);
+ +                                      done = 1;
+ +                                      break;
+ +
+ +                              default:
+ +                                      set_bit(AS_EIO, &mapping->flags);
+ +                                      done = 1;
+ +                                      break;
+ +                              }
+ +
+ +                              do {
+ +                                      unlock_page((struct page *)
+ +                                                  dlptr[dlist_idx - 1].page);
+ +                                      page_cache_release((struct page *)
+ +                                                         dlptr[dlist_idx - 1].page);
+ +                                      DbgPrint("release page=0x%p index=0x%lx", dlptr[dlist_idx - 1].page, ((struct page *)
+ +                                                                                                            dlptr[dlist_idx -
+ +                                                                                                                  1].page)->
+ +                                               index);
+ +                                      if (!retCode) {
+ +                                              wbc->nr_to_write--;
+ +                                      }
+ +                              } while (--dlist_idx);
+ +                      }
+ +
+ +                      if (i >= nr_pages) {
+ +                              nr_pages = 0;
+ +                      }
+ +              }
+ +
+ +              mapping->writeback_index = index;
+ +
+ +      } else {
+ +              DbgPrint("set_bit AS_EIO");
+ +              set_bit(AS_EIO, &mapping->flags);
+ +      }
+ +      if (dlist)
+ +              kfree(dlist);
+ +      if (pages)
+ +              kfree(pages);
+ +
+ +      DbgPrint("retCode=%d", retCode);
+ +      return (0);
+ +
+ +}
+ +
+ +/*
+ + * novfs_a_readpage - read one page of a file into the page cache.
+ + *
+ + * Resolves the session from the inode's private data (re-acquiring the
+ + * scope when no session is present) and, if the page is not already up to
+ + * date, asks novfs_read_pages() to fill it.  A short, non-empty read has
+ + * its tail zeroed.  The page is unlocked before returning.
+ + *
+ + * Returns the result of novfs_read_pages(), or 0 when the page was
+ + * already up to date.
+ + *
+ + * NOTE(review): the page is marked up to date even when the read returns
+ + * an error, and a zero-length read leaves the page contents untouched --
+ + * confirm whether that is intended.
+ + */
+ +int novfs_a_readpage(struct file *file, struct page *page)
+ +{
+ +      int retCode = 0;
+ +      void *pbuf;
+ +      struct inode *inode = NULL;
+ +      struct dentry *dentry = NULL;
+ +      loff_t offset;
+ +      size_t len;
+ +      struct novfs_schandle session;
+ +
+ +      SC_INITIALIZE(session);
+ +      DbgPrint("File=0x%p Name=%.*s Page=0x%p", file, file->f_dentry->d_name.len, file->f_dentry->d_name.name, page);
+ +
+ +      dentry = file->f_dentry;
+ +
+ +      if (dentry) {
+ +              DbgPrint("Dentry=0x%p Name=%.*s", dentry, dentry->d_name.len, dentry->d_name.name);
+ +              if (dentry->d_inode) {
+ +                      inode = dentry->d_inode;
+ +              }
+ +      }
+ +
+ +      if (inode) {
+ +              DbgPrint("Inode=0x%p Ino=%d", inode, inode->i_ino);
+ +
+ +              if (inode->i_private) {
+ +                      session = novfs_scope_get_sessionId(((struct inode_data *)inode->i_private)->Scope);
+ +                      if (0 == SC_PRESENT(session)) {
+ +                              /* No session yet: look the scope up again and retry. */
+ +                              ((struct inode_data *)inode->i_private)->Scope = novfs_get_scope(file->f_dentry);
+ +                              session = novfs_scope_get_sessionId(((struct inode_data *)inode->i_private)->Scope);
+ +                      }
+ +              }
+ +      }
+ +
+ +      if (!PageUptodate(page)) {
+ +              struct novfs_data_list dlst[2];
+ +
+ +              /*
+ +               * Widen the index before shifting, as novfs_a_readpages does;
+ +               * shifting the page index in its native width truncates the
+ +               * byte offset where that type is only 32 bits wide.
+ +               */
+ +              offset = (loff_t) page->index << PAGE_CACHE_SHIFT;
+ +              len = PAGE_CACHE_SIZE;
+ +
+ +              /*
+ +               * Save the first entry for the reply header.
+ +               */
+ +              dlst[1].page = page;
+ +              dlst[1].offset = NULL;
+ +              dlst[1].len = PAGE_CACHE_SIZE;
+ +              dlst[1].rwflag = DLWRITE;
+ +
+ +              DbgPrint("calling= novfs_Read_Pages %lld", offset);
+ +              retCode = novfs_read_pages(file->private_data, dlst, 2, &len, &offset, session);
+ +              if (len && (len < PAGE_CACHE_SIZE)) {
+ +                      /* Short read: clear the part of the page that was not filled. */
+ +                      pbuf = kmap_atomic(page, KM_USER0);
+ +                      memset(&((char *)pbuf)[len], 0, PAGE_CACHE_SIZE - len);
+ +                      kunmap_atomic(pbuf, KM_USER0);
+ +              }
+ +
+ +              flush_dcache_page(page);
+ +              SetPageUptodate(page);
+ +      }
+ +      unlock_page(page);
+ +
+ +      DbgPrint("retCode=%d", retCode);
+ +      return (retCode);
+ +
+ +}
+ +
+ +/*
+ + * novfs_a_readpages - read a list of pages in contiguous batches.
+ + *
+ + * Walks page_lst from its tail, groups pages with consecutive indexes
+ + * into runs bounded by novfs_max_iosize, reads each run into a bounce
+ + * buffer with novfs_read_pages() and hands the data to
+ + * novfs_copy_cache_pages().  Pages still on the list afterwards are
+ + * unlinked and released.
+ + *
+ + * Returns 0, or -ENOMEM when the bounce buffer cannot be allocated; a
+ + * failed read stops the loop but is not reported to the caller.
+ + */
+ +int novfs_a_readpages(struct file *file, struct address_space *mapping, struct list_head *page_lst, unsigned nr_pages)
+ +{
+ +      struct novfs_schandle session;
+ +      struct pagevec lru_pvec;
+ +      struct inode_data *idata;
+ +      struct inode *inode = NULL;
+ +      struct dentry *dentry;
+ +      unsigned handled = 0;
+ +      int finished = 0;
+ +      int retCode = 0;
+ +      char *rbuf;
+ +
+ +      SC_INITIALIZE(session);
+ +
+ +      DbgPrint("File=0x%p Name=%.*s Pages=%d", file, file->f_dentry->d_name.len, file->f_dentry->d_name.name, nr_pages);
+ +
+ +      dentry = file->f_dentry;
+ +      if (dentry) {
+ +              DbgPrint("Dentry=0x%p Name=%.*s", dentry, dentry->d_name.len, dentry->d_name.name);
+ +              inode = dentry->d_inode;
+ +      }
+ +
+ +      if (inode) {
+ +              DbgPrint("Inode=0x%p Ino=%d", inode, inode->i_ino);
+ +
+ +              idata = inode->i_private;
+ +              if (idata) {
+ +                      session = novfs_scope_get_sessionId(idata->Scope);
+ +                      if (0 == SC_PRESENT(session)) {
+ +                              /* No session yet: look the scope up again and retry. */
+ +                              idata->Scope = novfs_get_scope(file->f_dentry);
+ +                              session = novfs_scope_get_sessionId(idata->Scope);
+ +                      }
+ +              }
+ +      }
+ +
+ +      rbuf = kmalloc(novfs_max_iosize, GFP_KERNEL);
+ +      if (!rbuf) {
+ +              retCode = -ENOMEM;
+ +              goto out;
+ +      }
+ +
+ +      pagevec_init(&lru_pvec, 0);
+ +      while (handled < nr_pages && !finished) {
+ +              struct novfs_data_list dllst[2];
+ +              struct page *first, *cur;
+ +              pgoff_t expected;
+ +              loff_t offset;
+ +              size_t len = 0;
+ +
+ +              if (list_empty(page_lst))
+ +                      break;
+ +
+ +              first = list_entry(page_lst->prev, struct page, lru);
+ +              expected = first->index;
+ +              offset = (loff_t) first->index << PAGE_CACHE_SHIFT;
+ +
+ +              /*
+ +               * Measure the run of consecutive page indexes starting at the
+ +               * list tail, stopping before it would exceed the I/O size.
+ +               */
+ +              list_for_each_entry_reverse(cur, page_lst, lru) {
+ +                      if (expected != cur->index)
+ +                              break;
+ +                      if (len >= novfs_max_iosize - PAGE_SIZE)
+ +                              break;
+ +                      len += PAGE_SIZE;
+ +                      expected++;
+ +              }
+ +
+ +              if (!len)
+ +                      continue;
+ +
+ +              /* Entry 0 is left for novfs_read_pages(); entry 1 is the bounce buffer. */
+ +              dllst[1].page = NULL;
+ +              dllst[1].offset = rbuf;
+ +              dllst[1].len = len;
+ +              dllst[1].rwflag = DLWRITE;
+ +
+ +              DbgPrint("calling novfs_Read_Pages %lld", offset);
+ +              if (novfs_read_pages(file->private_data, dllst, 2, &len, &offset, session)) {
+ +                      finished = 1;
+ +                      continue;
+ +              }
+ +
+ +              novfs_copy_cache_pages(mapping, page_lst, len, rbuf, &lru_pvec);
+ +
+ +              /* Count whole pages read, plus one for a trailing partial page. */
+ +              handled += len >> PAGE_CACHE_SHIFT;
+ +              if ((int)(len & PAGE_CACHE_MASK) != len)
+ +                      handled++;
+ +              if (len == 0)
+ +                      finished = 1;
+ +      }
+ +
+ +      /*
+ +       * Free any remaining pages.
+ +       */
+ +      while (!list_empty(page_lst)) {
+ +              struct page *left = list_entry(page_lst->prev, struct page, lru);
+ +
+ +              list_del(&left->lru);
+ +              page_cache_release(left);
+ +      }
+ +
+ +      pagevec_lru_add_file(&lru_pvec);
+ +      kfree(rbuf);
+ +
+ +out:
+ +      DbgPrint("retCode=%d", retCode);
+ +      return (retCode);
+ +
+ +}
+ +
+ +/*
+ + * novfs_a_write_begin - prepare a page-cache page for a buffered write.
+ + *
+ + * Grabs (and locks) the page covering pos and stores it in *pagep.  If the
+ + * page is not up to date it is prepared in one of three ways:
+ + *   - the write covers the whole page: nothing to preload;
+ + *   - the file is readable: the page is read from the server, starting at
+ + *     the page-aligned file offset, and the unread tail is zeroed;
+ + *   - the file is write-only: the bytes outside [from, to) are zeroed.
+ + * In every case the page is then marked up to date.
+ + *
+ + * Returns 0, or -ENOMEM if no page could be obtained.
+ + *
+ + * NOTE(review): the result of novfs_read_pages() is ignored, so a failed
+ + * read still leaves the page marked up to date -- confirm intended.
+ + */
+ +int novfs_a_write_begin(struct file *file, struct address_space *mapping,
+ +                      loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata)
+ +{
+ +      int retVal = 0;
+ +      loff_t offset;
+ +      struct novfs_schandle session;
+ +      struct novfs_data_list dllst[2];
+ +      struct inode *inode = file->f_dentry->d_inode;
+ +      struct page *page;
+ +      pgoff_t index;
+ +      unsigned from, to;
+ +      SC_INITIALIZE(session);
+ +
+ +      index = pos >> PAGE_CACHE_SHIFT;
+ +      from = pos & (PAGE_CACHE_SIZE - 1);
+ +      to = from + len;
+ +
+ +      /*
+ +       * File offset of the first byte of the page.  The page is filled
+ +       * from its byte 0, so the read must start here and not at pos.
+ +       */
+ +      offset = (loff_t) index << PAGE_CACHE_SHIFT;
+ +
+ +      page = grab_cache_page_write_begin(mapping, index, flags);
+ +      if (!page)
+ +              return -ENOMEM;
+ +
+ +      *pagep = page;
+ +
+ +      DbgPrint("File=0x%p Page=0x%p offset=0x%llx From=%u To=%u "
+ +               "filesize=%lld\n", file, page, pos, from, to, i_size_read(file->f_dentry->d_inode));
+ +      if (!PageUptodate(page)) {
+ +              /*
+ +               * Check to see if whole page
+ +               */
+ +              if ((to == PAGE_CACHE_SIZE) && (from == 0)) {
+ +                      SetPageUptodate(page);
+ +              }
+ +
+ +              /*
+ +               * Check to see if we can read page.
+ +               */
+ +              else if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+ +                      /*
+ +                       * A size_t of our own: novfs_read_pages() is handed a
+ +                       * size_t pointer by its other callers, so passing the
+ +                       * address of the unsigned 'len' parameter would let
+ +                       * it write past that variable on 64-bit builds.
+ +                       */
+ +                      size_t rlen;
+ +                      loff_t remain;
+ +
+ +                      /*
+ +                       * Get session.
+ +                       */
+ +                      if (file->f_dentry && file->f_dentry->d_inode) {
+ +                              if (file->f_dentry->d_inode->i_private) {
+ +                                      session = novfs_scope_get_sessionId(((struct inode_data *)
+ +                                                                           inode->i_private)->Scope);
+ +                                      if (0 == SC_PRESENT(session)) {
+ +                                              ((struct inode_data *)inode->i_private)->Scope = novfs_get_scope(file->f_dentry);
+ +                                              session = novfs_scope_get_sessionId(((struct inode_data *)inode->i_private)->Scope);
+ +                                      }
+ +                              }
+ +                      }
+ +
+ +                      /*
+ +                       * No extra page reference is taken here: the page is
+ +                       * already pinned by grab_cache_page_write_begin(), and
+ +                       * an additional page_cache_get() had no matching
+ +                       * release in this function.
+ +                       */
+ +
+ +                      /*
+ +                       * Bytes of file data that fall inside this page.  Do
+ +                       * the arithmetic signed so a page lying entirely
+ +                       * beyond end of file yields 0, not a wrapped value.
+ +                       */
+ +                      remain = i_size_read(inode) - offset;
+ +                      if (remain < 0) {
+ +                              remain = 0;
+ +                      }
+ +                      if (remain > PAGE_CACHE_SIZE) {
+ +                              remain = PAGE_CACHE_SIZE;
+ +                      }
+ +                      rlen = remain;
+ +
+ +                      if (rlen) {
+ +                              /*
+ +                               * Read page from server.
+ +                               */
+ +
+ +                              dllst[1].page = page;
+ +                              dllst[1].offset = 0;
+ +                              dllst[1].len = rlen;
+ +                              dllst[1].rwflag = DLWRITE;
+ +
+ +                              DbgPrint("calling novfs_Read_Pages %lld", offset);
+ +                              novfs_read_pages(file->private_data, dllst, 2, &rlen, &offset, session);
+ +                      }
+ +
+ +                      /*
+ +                       * Zero the unused part of the page.
+ +                       */
+ +                      if (rlen < PAGE_CACHE_SIZE) {
+ +                              char *adr = kmap_atomic(page, KM_USER0);
+ +                              memset(adr + rlen, 0, PAGE_CACHE_SIZE - rlen);
+ +                              kunmap_atomic(adr, KM_USER0);
+ +                      }
+ +              } else {
+ +                      /*
+ +                       * Zero the sections of the page that are not going to
+ +                       * be written.
+ +                       */
+ +                      char *adr = kmap_atomic(page, KM_USER0);
+ +                      memset(adr, 0, from);
+ +                      memset(adr + to, 0, PAGE_CACHE_SIZE - to);
+ +                      kunmap_atomic(adr, KM_USER0);
+ +
+ +                      DbgPrint("memset 0x%p", adr);
+ +              }
+ +              flush_dcache_page(page);
+ +              SetPageUptodate(page);
+ +      }
+ +      return (retVal);
+ +}
+ +
+ +/*
+ + * novfs_a_write_end - finish a buffered write started by write_begin.
+ + *
+ + * Records the file handle in the inode's private data, extends the inode
+ + * size to cover the bytes just copied, and then either marks the page
+ + * dirty (page up to date) or writes the modified byte range straight to
+ + * the server with novfs_write_pages() (page not up to date).
+ + *
+ + * Returns 0, or the result of novfs_write_pages().
+ + *
+ + * NOTE(review): this function neither unlocks nor releases the page, and
+ + * returns 0 instead of the number of bytes copied.  The generic VFS
+ + * write path expects a write_end implementation to do all three --
+ + * confirm against the callers before relying on this through
+ + * generic_file_aio_write().
+ + */
+ +int novfs_a_write_end(struct file *file, struct address_space *mapping,
+ +                    loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
+ +{
+ +      int retCode = 0;
+ +      struct inode *inode = page->mapping->host;
+ +      struct novfs_schandle session;
+ +      struct inode_data *id;
+ +      struct novfs_data_list dlst[1];
+ +      unsigned from, to;
+ +      SC_INITIALIZE(session);
+ +
+ +      /* Byte range [from, to) of the write inside the page. */
+ +      from = pos & (PAGE_CACHE_SIZE - 1);
+ +      to = from + len;
+ +
+ +      DbgPrint("File=0x%p Page=0x%p offset=0x%llx To=%u filesize=%lld",
+ +               file, page, pos, to, i_size_read(file->f_dentry->d_inode));
+ +      if (file->f_dentry->d_inode && (id = file->f_dentry->d_inode->i_private)) {
+ +              session = novfs_scope_get_sessionId(id->Scope);
+ +              if (0 == SC_PRESENT(session)) {
+ +                      id->Scope = novfs_get_scope(file->f_dentry);
+ +                      session = novfs_scope_get_sessionId(id->Scope);
+ +              }
+ +
+ +              /*
+ +               * Setup file handle
+ +               */
+ +              id->FileHandle = file->private_data;
+ +
+ +              /*
+ +               * Grow the file to the END of the data just copied; comparing
+ +               * only the start position would leave the new bytes beyond
+ +               * i_size.
+ +               */
+ +              if (pos + copied > inode->i_size) {
+ +                      i_size_write(inode, pos + copied);
+ +              }
+ +
+ +              if (!PageUptodate(page)) {
+ +                      /*
+ +                       * File position of the write: start of the page plus
+ +                       * the offset INSIDE the page.  Adding the absolute
+ +                       * position here would count the page start twice.
+ +                       */
+ +                      loff_t wpos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + from;
+ +
+ +                      if (to < from) {
+ +                              return (retCode);
+ +                      }
+ +                      dlst[0].page = page;
+ +                      /* Offset of the data within the page, carried in a pointer field. */
+ +                      dlst[0].offset = (void *)(unsigned long)from;
+ +                      dlst[0].len = len;
+ +                      dlst[0].rwflag = DLREAD;
+ +
+ +                      retCode = novfs_write_pages(id->FileHandle, dlst, 1, len, wpos, session);
+ +
+ +              } else {
+ +                      set_page_dirty(page);
+ +              }
+ +      }
+ +
+ +      return (retCode);
+ +}
+ +
+ +/*++======================================================================*/
+ +/*
+ + *
+ + *  Notes:        Placeholder direct-IO hook.  It exists only so that a
+ + *                file can get the direct IO flag set; novfs_f_read and
+ + *                novfs_f_write do the actual work.  Maybe not the best
+ + *                way to do it, but it was the easiest to implement.
+ + *
+ + *  Returns:      Always -EIO.
+ + *
+ + *========================================================================*/
+ +ssize_t novfs_a_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ +{
+ +      return -EIO;
+ +}
+ +
+ +/*++======================================================================*/
+ +/*
+ + * novfs_i_create - create a regular file below a volume.
+ + *
+ + * Entries at the root, user, server and volume levels of the tree are
+ + * refused with -EACCES.  Otherwise the file is created on the server by
+ + * opening it (with the caller's open flags plus O_RDWR) and closing it
+ + * again, after which novfs_i_mknod() sets up the local inode, which is
+ + * then flagged UPDATE_INODE.
+ + *
+ + * Returns 0 on success, -EACCES when the directory has no private data
+ + * or the path cannot be built, -ENOMEM when the path buffer cannot be
+ + * allocated, otherwise the result of novfs_open_file() or
+ + * novfs_i_mknod().
+ + *
+ + * NOTE(review): the original guarded the body with "mode | S_IFREG",
+ + * which is always true; the test is dropped here with no change in
+ + * behaviour.  If "S_ISREG(mode)" was intended, confirm with the callers.
+ + * NOTE(review): nd is dereferenced unconditionally -- confirm that no
+ + * caller passes a NULL nameidata.
+ + */
+ +int novfs_i_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
+ +{
+ +      char *path, *buf;
+ +      struct novfs_entry_info info;
+ +      void *handle;
+ +      struct novfs_schandle session;
+ +      int retCode = -EACCES;
+ +
+ +      DbgPrint("mode=0%o flags=0%o %.*s", mode, nd->NDOPENFLAGS, dentry->d_name.len, dentry->d_name.name);
+ +
+ +      if (IS_ROOT(dentry) ||  /* Root */
+ +          IS_ROOT(dentry->d_parent) ||        /* User */
+ +          IS_ROOT(dentry->d_parent->d_parent) ||      /* Server */
+ +          IS_ROOT(dentry->d_parent->d_parent->d_parent)) {    /* Volume */
+ +              return (-EACCES);
+ +      }
+ +
+ +      if (!dir->i_private) {
+ +              return (retCode);
+ +      }
+ +
+ +      session = novfs_scope_get_sessionId(((struct inode_data *)dir->i_private)->Scope);
+ +      if (0 == SC_PRESENT(session)) {
+ +              ((struct inode_data *)dir->i_private)->Scope = novfs_get_scope(dentry);
+ +              session = novfs_scope_get_sessionId(((struct inode_data *)dir->i_private)->Scope);
+ +      }
+ +
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!buf) {
+ +              /* Report the allocation failure as such, not as -EACCES. */
+ +              return (-ENOMEM);
+ +      }
+ +
+ +      path = novfs_dget_path(dentry, buf, PATH_LENGTH_BUFFER);
+ +      if (path) {
+ +              retCode = novfs_open_file(path, nd->NDOPENFLAGS | O_RDWR, &info, &handle, session);
+ +              if (!retCode && handle) {
+ +                      /* The open only served to create the file on the server. */
+ +                      novfs_close_file(handle, session);
+ +
+ +                      /*
+ +                       * Propagate a failure to set up the local inode; it
+ +                       * must not be reported to the caller as success.
+ +                       */
+ +                      retCode = novfs_i_mknod(dir, dentry, mode | S_IFREG, 0);
+ +                      if (!retCode && dentry->d_inode && dentry->d_inode->i_private) {
+ +                              ((struct inode_data *)
+ +                               dentry->d_inode->i_private)->Flags |= UPDATE_INODE;
+ +                      }
+ +              }
+ +      }
+ +      kfree(buf);
+ +
+ +      return (retCode);
+ +}
+ +
+ +/*
+ + * update_inode - refresh a cached inode from server-supplied entry info.
+ + *
+ + * If the size or modification time differs from what the inode currently
+ + * holds, the inode's cached pages are invalidated.  Mode, size and the
+ + * three timestamps are then copied from Info, and for a non-empty file
+ + * i_blocks / i_bytes are recomputed.
+ + *
+ + * Does nothing when either argument is NULL.
+ + */
+ +void update_inode(struct inode *Inode, struct novfs_entry_info *Info)
+ +{
+ +      /* Scratch buffer for formatting timestamps in the debug output only. */
+ +      static char dbuf[128];
+ +
+ +      /*
+ +       * Check once, up front: the body dereferences both pointers
+ +       * unconditionally, so later NULL tests could never help.
+ +       */
+ +      if (!Inode || !Info)
+ +              return;
+ +
+ +      DbgPrint("Inode=0x%p I_ino=%lu", Inode, (unsigned long)Inode->i_ino);
+ +
+ +      DbgPrint("atime=%s", ctime_r(&Info->atime.tv_sec, dbuf));
+ +      DbgPrint("ctime=%s", ctime_r(&Info->ctime.tv_sec, dbuf));
+ +      DbgPrint("mtime=%s %ld", ctime_r(&Info->mtime.tv_sec, dbuf), (long)Info->mtime.tv_nsec);
+ +      DbgPrint("size=%lld", Info->size);
+ +      DbgPrint("mode=0%o", Info->mode);
+ +
+ +      if ((Inode->i_size != Info->size) ||
+ +          (Inode->i_mtime.tv_sec != Info->mtime.tv_sec) || (Inode->i_mtime.tv_nsec != Info->mtime.tv_nsec)) {
+ +              /* Arguments are cast so each one matches its conversion specifier. */
+ +              DbgPrint("calling invalidate_remote_inode sz  %lld %lld", (long long)Inode->i_size, (long long)Info->size);
+ +              DbgPrint("calling invalidate_remote_inode sec %ld %ld", (long)Inode->i_mtime.tv_sec, (long)Info->mtime.tv_sec);
+ +              DbgPrint("calling invalidate_remote_inode ns  %ld %ld", (long)Inode->i_mtime.tv_nsec, (long)Info->mtime.tv_nsec);
+ +
+ +              if (Inode->i_mapping) {
+ +                      invalidate_remote_inode(Inode);
+ +              }
+ +      }
+ +
+ +      Inode->i_mode = Info->mode;
+ +      Inode->i_size = Info->size;
+ +      Inode->i_atime = Info->atime;
+ +      Inode->i_ctime = Info->ctime;
+ +      Inode->i_mtime = Info->mtime;
+ +
+ +      if (Inode->i_size && Inode->i_sb->s_blocksize) {
+ +
+ +              /*
+ +               * Filling number of blocks as in NSS filesystem.
+ +               * The s_blocksize is initialized to PAGE_CACHE_SIZE in
+ +               * the super block initialization.
+ +               *
+ +               * Round the size up to whole filesystem blocks, then convert
+ +               * that count into 512-byte units for i_blocks.
+ +               */
+ +              Inode->i_blocks = (((loff_t) Info->size) + Inode->i_sb->s_blocksize - 1)
+ +                  >> (loff_t) Inode->i_blkbits;
+ +              Inode->i_blocks = Inode->i_blocks << (PAGE_CACHE_SHIFT - 9);
+ +              Inode->i_bytes = Info->size & (Inode->i_sb->s_blocksize - 1);
+ +
+ +              DbgPrint("i_sb->s_blocksize=%lu", (unsigned long)Inode->i_sb->s_blocksize);
+ +              DbgPrint("i_blkbits=%u", (unsigned)Inode->i_blkbits);
+ +              DbgPrint("i_blocks=%llu", (unsigned long long)Inode->i_blocks);
+ +              DbgPrint("i_bytes=%d", (int)Inode->i_bytes);
+ +      }
+ +}
+ +
+ +/*
+ + * novfs_i_lookup - look up a name in a directory.
+ + *
+ + * The two magic names " !xover" and "z!xover" are bound directly to the
+ + * directory's own inode.  For any other name the directory's inode cache
+ + * is searched; a hit refreshes the existing inode (or builds a new one
+ + * when none is in memory) and attaches it to the dentry, a miss adds a
+ + * negative dentry.
+ + *
+ + * Returns NULL when the dentry was handled, ERR_PTR(-ENOENT) when the
+ + * directory has no private data, ERR_PTR(-ENOMEM) when the entry-info
+ + * buffer cannot be allocated, or the error from verify_dentry().
+ + */
+ +struct dentry *novfs_i_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+ +{
+ +      struct dentry *retVal = ERR_PTR(-ENOENT);
+ +      struct dentry *parent;
+ +      struct novfs_entry_info *info = NULL;
+ +      struct inode_data *id;
+ +      struct inode *inode = NULL;
+ +      uid_t uid = current_euid();
+ +      ino_t ino = 0;
+ +      struct qstr name;
+ +      char *buf;
+ +
+ +      /* The path is built for the debug message only. */
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (buf) {
+ +              char *path;
+ +              path = novfs_dget_path(dentry, buf, PATH_LENGTH_BUFFER);
+ +              if (path) {
+ +                      DbgPrint("dir 0x%p %d hash %d inode 0x%0p %s", dir, dir->i_ino, dentry->d_name.hash, dentry->d_inode, path);
+ +              }
+ +              kfree(buf);
+ +      } else {
+ +              DbgPrint("dir 0x%p %d name %.*s hash %d inode 0x%0p",
+ +                       dir, dir->i_ino, dentry->d_name.len, dentry->d_name.name, dentry->d_name.hash, dentry->d_inode);
+ +      }
+ +
+ +      /*
+ +       * Both magic names get identical treatment: the dentry is attached
+ +       * to the directory's own inode, with an extra reference taken.
+ +       */
+ +      if ((dentry->d_name.len == 7)
+ +          && ((0 == strncmp(dentry->d_name.name, " !xover", 7))
+ +              || (0 == strncmp(dentry->d_name.name, "z!xover", 7)))) {
+ +              dentry->d_op = &novfs_dentry_operations;
+ +              igrab(dir);
+ +              d_add(dentry, dir);
+ +              return NULL;
+ +      }
+ +
+ +      if (dir && (id = dir->i_private)) {
+ +              retVal = 0;
+ +              if (IS_ROOT(dentry)) {
+ +                      DbgPrint("Root entry=0x%p", novfs_root);
+ +                      return (0);
+ +              } else {
+ +                      info = kmalloc(sizeof(struct novfs_entry_info) + PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +                      if (info) {
+ +                              if (NULL == (retVal = ERR_PTR(verify_dentry(dentry, 1)))) {
+ +                                      name.name = dentry->d_name.name;
+ +                                      name.len = dentry->d_name.len;
+ +                                      name.hash = novfs_internal_hash(&name);
+ +
+ +                                      /* First try: entry is cached and its inode is in memory. */
+ +                                      if (novfs_lock_inode_cache(dir)) {
+ +                                              if (!novfs_get_entry(dir, &name, &ino, info)) {
+ +                                                      inode = ilookup(dentry->d_sb, ino);
+ +                                                      if (inode) {
+ +                                                              update_inode(inode, info);
+ +                                                      }
+ +                                              }
+ +                                              novfs_unlock_inode_cache(dir);
+ +                                      }
+ +
+ +                                      /* Entry known but no inode in memory: build one. */
+ +                                      if (!inode && ino) {
+ +                                              if (id && id->Scope) {
+ +                                                      uid = novfs_scope_get_uid(id->Scope);
+ +                                              } else {
+ +                                                      uid = novfs_scope_get_uid(novfs_get_scope(dentry));
+ +                                              }
+ +                                              if (novfs_lock_inode_cache(dir)) {
+ +                                                      inode = novfs_get_inode(dentry->d_sb, info->mode, 0, uid, ino, &name);
+ +                                                      if (inode) {
+ +                                                              if (!novfs_get_entry(dir, &dentry->d_name, &ino, info)) {
+ +                                                                      update_inode(inode, info);
+ +                                                              }
+ +                                                      }
+ +                                                      novfs_unlock_inode_cache(dir);
+ +                                              }
+ +                                      }
+ +                              }
+ +                      } else {
+ +                              /*
+ +                               * Out of memory: fail the lookup.  Falling
+ +                               * through with a NULL result would add a
+ +                               * negative dentry for a name that may exist.
+ +                               */
+ +                              retVal = ERR_PTR(-ENOMEM);
+ +                      }
+ +              }
+ +      }
+ +
+ +      if (!retVal) {
+ +              dentry->d_op = &novfs_dentry_operations;
+ +              if (inode) {
+ +                      parent = dget_parent(dentry);
+ +                      novfs_d_add(dentry->d_parent, dentry, inode, 1);
+ +                      dput(parent);
+ +              } else {
+ +                      /* inode is NULL here: this adds a negative dentry. */
+ +                      d_add(dentry, inode);
+ +              }
+ +      }
+ +
+ +      kfree(info);
+ +
+ +      DbgPrint("inode=0x%p dentry->d_inode=0x%p return=0x%p", dir, dentry->d_inode, retVal);
+ +
+ +      return (retVal);
+ +}
+ +
+ +/*
+ + * novfs_i_unlink - remove a directory entry.
+ + *
+ + * Root and user level entries, and volume level entries, are refused
+ + * with -EACCES.  An entry directly below a user (a server) is removed by
+ + * logging out of it; anything else is deleted on the server, with one
+ + * retry after setting the mode to S_IRWXU if the first delete fails.
+ + * On success, or when the inode is already a dead directory, the entry
+ + * is dropped from the directory's inode cache.
+ + *
+ + * Returns 0 on success, -EACCES for protected levels, -ENOENT when the
+ + * dentry has no usable inode, private data, path buffer or path,
+ + * otherwise the error from the logout or delete request.
+ + */
+ +int novfs_i_unlink(struct inode *dir, struct dentry *dentry)
+ +{
+ +      struct novfs_schandle session;
+ +      struct inode_data *idata;
+ +      struct inode *inode;
+ +      char *path, *buf;
+ +      int retCode = -ENOENT;
+ +
+ +      DbgPrint("dir=0x%p dir->i_ino=%d %.*s", dir, dir->i_ino, dentry->d_name.len, dentry->d_name.name);
+ +      DbgPrint("IS_ROOT(dentry)=%d", IS_ROOT(dentry));
+ +      DbgPrint("IS_ROOT(dentry->d_parent)=%d", IS_ROOT(dentry->d_parent));
+ +      DbgPrint("IS_ROOT(dentry->d_parent->d_parent)=%d", IS_ROOT(dentry->d_parent->d_parent));
+ +      DbgPrint("IS_ROOT(dentry->d_parent->d_parent->d_parent)=%d", IS_ROOT(dentry->d_parent->d_parent->d_parent));
+ +
+ +      if (IS_ROOT(dentry))    /* Root */
+ +              return -EACCES;
+ +      if (IS_ROOT(dentry->d_parent))  /* User */
+ +              return -EACCES;
+ +      if (!IS_ROOT(dentry->d_parent->d_parent) &&     /* Server */
+ +          IS_ROOT(dentry->d_parent->d_parent->d_parent))      /* Volume */
+ +              return -EACCES;
+ +
+ +      inode = dentry->d_inode;
+ +      if (!inode)
+ +              goto out;
+ +
+ +      DbgPrint("dir=0x%p dir->i_ino=%d inode=0x%p ino=%d", dir, dir->i_ino, inode, inode->i_ino);
+ +
+ +      idata = inode->i_private;
+ +      if (!idata)
+ +              goto out;
+ +
+ +      session = novfs_scope_get_sessionId(idata->Scope);
+ +      if (0 == SC_PRESENT(session)) {
+ +              /* No session yet: look the scope up again and retry. */
+ +              idata->Scope = novfs_get_scope(dentry);
+ +              session = novfs_scope_get_sessionId(idata->Scope);
+ +      }
+ +
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!buf)
+ +              goto out;
+ +
+ +      path = novfs_dget_path(dentry, buf, PATH_LENGTH_BUFFER);
+ +      if (!path)
+ +              goto out_free;
+ +
+ +      DbgPrint("path %s mode 0%o", path, inode->i_mode);
+ +      if (IS_ROOT(dentry->d_parent->d_parent)) {
+ +              /* Entry directly below a user: drop the connection. */
+ +              retCode = novfs_daemon_logout(&dentry->d_name, &session);
+ +      } else {
+ +              retCode = novfs_delete(path, S_ISDIR(inode->i_mode), session);
+ +              if (retCode) {
+ +                      /* Delete failed: set the mode to S_IRWXU and try once more. */
+ +                      struct iattr ia;
+ +
+ +                      memset(&ia, 0, sizeof(ia));
+ +                      ia.ia_valid = ATTR_MODE;
+ +                      ia.ia_mode = S_IRWXU;
+ +                      novfs_set_attr(path, &ia, session);
+ +                      retCode = novfs_delete(path, S_ISDIR(inode->i_mode), session);
+ +              }
+ +      }
+ +
+ +      if (!retCode || IS_DEADDIR(inode)) {
+ +              uint64_t t64 = 0;
+ +
+ +              novfs_remove_inode_entry(dir, &dentry->d_name, 0);
+ +              dentry->d_time = 0;
+ +              novfs_scope_set_userspace(&t64, &t64, &t64, &t64);
+ +              retCode = 0;
+ +      }
+ +
+ +out_free:
+ +      kfree(buf);
+ +out:
+ +      DbgPrint("retCode 0x%x", retCode);
+ +      return (retCode);
+ +}
+ +
+ +/*
+ + * Create a directory below the volume level.
+ + *
+ + * The directory is created on the server with novfs_create(), its attributes
+ + * are fetched with novfs_get_file_info(), and a local inode is attached to
+ + * @dentry through novfs_i_mknod().  When an inode ends up on @dentry it is
+ + * refreshed from the fetched attributes and recorded in the inode cache of
+ + * @dir.
+ + *
+ + * Returns -EACCES for the root, user, server and volume levels; 0 when @dir
+ + * has no private data, the path buffer cannot be allocated or no path could
+ + * be built; otherwise the result of the last helper that ran.
+ + */
+ +int novfs_i_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+ +{
+ +      struct novfs_entry_info info;
+ +      struct novfs_schandle session;
+ +      struct inode_data *dir_data;
+ +      struct inode *inode;
+ +      char *buf, *path;
+ +      uid_t uid;
+ +      int retCode = 0;
+ +
+ +      DbgPrint("dir=0x%p ino=%d dentry=0x%p %.*s mode=0%lo",
+ +               dir, dir->i_ino, dentry, dentry->d_name.len, dentry->d_name.name, mode);
+ +
+ +      if (IS_ROOT(dentry) ||  /* Root */
+ +          IS_ROOT(dentry->d_parent) ||        /* User */
+ +          IS_ROOT(dentry->d_parent->d_parent) ||      /* Server */
+ +          IS_ROOT(dentry->d_parent->d_parent->d_parent)) {    /* Volume */
+ +              return -EACCES;
+ +      }
+ +
+ +      /* Force the directory type and keep only the owner permission bits. */
+ +      mode = (mode | S_IFDIR) & (S_IFMT | S_IRWXU);
+ +
+ +      dir_data = dir->i_private;
+ +      if (!dir_data)
+ +              return retCode;
+ +
+ +      /* Re-resolve the scope when the cached one carries no session. */
+ +      session = novfs_scope_get_sessionId(dir_data->Scope);
+ +      if (0 == SC_PRESENT(session)) {
+ +              dir_data->Scope = novfs_get_scope(dentry);
+ +              session = novfs_scope_get_sessionId(dir_data->Scope);
+ +      }
+ +
+ +      /* The uid is looked up but not consumed anywhere in this function. */
+ +      uid = novfs_scope_get_uid(dir_data->Scope);
+ +
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!buf)
+ +              return retCode;
+ +
+ +      path = novfs_dget_path(dentry, buf, PATH_LENGTH_BUFFER);
+ +      if (!path)
+ +              goto out_free;
+ +
+ +      DbgPrint("path %s", path);
+ +
+ +      retCode = novfs_create(path, S_ISDIR(mode), session);
+ +      if (retCode)
+ +              goto out_free;
+ +
+ +      retCode = novfs_get_file_info(path, &info, session);
+ +      if (retCode)
+ +              goto out_free;
+ +
+ +      retCode = novfs_i_mknod(dir, dentry, mode, 0);
+ +      inode = dentry->d_inode;
+ +      if (inode) {
+ +              struct inode_data *new_data = inode->i_private;
+ +
+ +              update_inode(inode, &info);
+ +              new_data->Flags &= ~UPDATE_INODE;
+ +
+ +              dentry->d_time = jiffies + (novfs_update_timeout * HZ);
+ +
+ +              /* Update the cached entry, adding it when the update fails. */
+ +              novfs_lock_inode_cache(dir);
+ +              if (novfs_update_entry(dir, &dentry->d_name, 0, &info) != 0)
+ +                      novfs_add_inode_entry(dir, &dentry->d_name, inode->i_ino, &info);
+ +              novfs_unlock_inode_cache(dir);
+ +      }
+ +
+ +out_free:
+ +      kfree(buf);
+ +      return retCode;
+ +}
+ +
+ +/*
+ + * Remove a directory.  Directories and files share one removal path, so this
+ + * simply forwards to novfs_i_unlink() and hands back its result.
+ + */
+ +int novfs_i_rmdir(struct inode *inode, struct dentry *dentry)
+ +{
+ +      int retCode = novfs_i_unlink(inode, dentry);
+ +
+ +      return retCode;
+ +}
+ +
+ +/*
+ + * Attach a freshly allocated local inode to @dentry.
+ + *
+ + * An inode is only allocated when @dir has private data and @mode has
+ + * S_IFREG or S_IFDIR bits set.  The new inode is bound to @dentry with
+ + * novfs_d_add() and an entry carrying just its mode is added to the inode
+ + * cache of @dir.
+ + *
+ + * Returns 0 when an inode was attached and -EACCES otherwise, including for
+ + * the root, user, server and volume levels.
+ + */
+ +int novfs_i_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+ +{
+ +      struct novfs_entry_info info;
+ +      struct inode_data *dir_data;
+ +      struct dentry *parent;
+ +      struct inode *inode = NULL;
+ +      uid_t uid;
+ +      int retCode = -EACCES;
+ +
+ +      if (IS_ROOT(dentry) ||  /* Root */
+ +          IS_ROOT(dentry->d_parent) ||        /* User */
+ +          IS_ROOT(dentry->d_parent->d_parent) ||      /* Server */
+ +          IS_ROOT(dentry->d_parent->d_parent->d_parent)) {    /* Volume */
+ +              return -EACCES;
+ +      }
+ +
+ +      dir_data = dir->i_private;
+ +      if (!dir_data)
+ +              goto out;
+ +
+ +      uid = novfs_scope_get_uid(dir_data->Scope);
+ +      if (!(mode & (S_IFREG | S_IFDIR)))
+ +              goto out;
+ +
+ +      inode = novfs_get_inode(dir->i_sb, mode, dev, uid, 0, &dentry->d_name);
+ +      if (!inode)
+ +              goto out;
+ +
+ +      dentry->d_op = &novfs_dentry_operations;
+ +
+ +      /* Hold the parent while the dentry is being bound to the inode. */
+ +      parent = dget_parent(dentry);
+ +      novfs_d_add(parent, dentry, inode, 0);
+ +
+ +      /* The cache entry starts out with nothing but the mode filled in. */
+ +      memset(&info, 0, sizeof(info));
+ +      info.mode = inode->i_mode;
+ +
+ +      novfs_lock_inode_cache(dir);
+ +      novfs_add_inode_entry(dir, &dentry->d_name, inode->i_ino, &info);
+ +      novfs_unlock_inode_cache(dir);
+ +
+ +      dput(parent);
+ +      retCode = 0;
+ +
+ +out:
+ +      DbgPrint("return 0x%x", retCode);
+ +      return retCode;
+ +}
+ +
+ +/*
+ + * Rename the object behind @od (in directory @odir) to @nd (in @ndir).
+ + *
+ + * Both full paths are built with novfs_dget_path() and their leading
+ + * components, up to the second '\\' found after the first character, must
+ + * have the same length and compare equal via novfs_d_strcmp(); the comment
+ + * at the comparison describes this as the "different servers or different
+ + * volumes" check.  An existing target is deleted first, then
+ + * novfs_rename_file() performs the rename and the inode caches of both
+ + * directories are adjusted.
+ + *
+ + * Returns:
+ + *   -EACCES     @od is at the root, user, server or volume level
+ + *   -ENOTEMPTY  the path buffer could not be allocated or a path could not
+ + *               be built (the initial value of retCode is never replaced)
+ + *   -EXDEV      the leading components differ, or @od has no inode with
+ + *               private data
+ + *   otherwise   the result of novfs_rename_file()
+ + */
+ +int novfs_i_rename(struct inode *odir, struct dentry *od, struct inode *ndir, struct dentry *nd)
+ +{
+ +      int retCode = -ENOTEMPTY;
+ +      char *newpath, *newbuf, *newcon;
+ +      char *oldpath, *oldbuf, *oldcon;
+ +      struct qstr oldname;
+ +      struct novfs_entry_info *info = NULL;
+ +      int oldlen, newlen;
+ +      struct novfs_schandle session;
+ +      ino_t ino;
+ +
+ +      if (IS_ROOT(od) ||      /* Root */
+ +          IS_ROOT(od->d_parent) ||    /* User */
+ +          IS_ROOT(od->d_parent->d_parent) ||  /* Server */
+ +          IS_ROOT(od->d_parent->d_parent->d_parent)) {        /* Volume */
+ +              return (-EACCES);
+ +      }
+ +
+ +      DbgPrint("odir=0x%p ino=%d ndir=0x%p ino=%d", odir, odir->i_ino, ndir, ndir->i_ino);
+ +
+ +      /*
+ +       * One allocation split in two halves: the old path is built in the
+ +       * first half, the new path in the second.
+ +       * NOTE(review): newbuf is computed before oldbuf is checked, so the
+ +       * "&& newbuf" test below adds nothing beyond the oldbuf check.
+ +       */
+ +      oldbuf = kmalloc(PATH_LENGTH_BUFFER * 2, GFP_KERNEL);
+ +      newbuf = oldbuf + PATH_LENGTH_BUFFER;
+ +      if (oldbuf && newbuf) {
+ +              oldpath = novfs_dget_path(od, oldbuf, PATH_LENGTH_BUFFER);
+ +              newpath = novfs_dget_path(nd, newbuf, PATH_LENGTH_BUFFER);
+ +              if (oldpath && newpath) {
+ +                      /* Bytes from the start of each path to the end of its half. */
+ +                      oldlen = PATH_LENGTH_BUFFER - (int)(oldpath - oldbuf);
+ +                      newlen = PATH_LENGTH_BUFFER - (int)(newpath - newbuf);
+ +
+ +                      /*
+ +                       * NOTE(review): od->d_inode is dereferenced here, before
+ +                       * the NULL check on od->d_inode further down.
+ +                       */
+ +                      DbgPrint("od=0x%p od->inode=0x%p od->inode->i_ino=%d %s", od, od->d_inode, od->d_inode->i_ino, oldpath);
+ +                      if (nd->d_inode) {
+ +                              DbgPrint("nd=0x%p nd->inode=0x%p nd->inode->i_ino=%d %s",
+ +                                       nd, nd->d_inode, nd->d_inode->i_ino, newpath);
+ +                      } else {
+ +                              DbgPrint("nd=0x%p nd->inode=0x%p %s", nd, nd->d_inode, newpath);
+ +                      }
+ +
+ +                      /*
+ +                       * Check to see if two different servers or different volumes
+ +                       */
+ +                      newcon = strchr(newpath + 1, '\\');
+ +                      oldcon = strchr(oldpath + 1, '\\');
+ +                      DbgPrint("newcon=0x%p newpath=0x%p", newcon, newpath);
+ +                      DbgPrint("oldcon=0x%p oldpath=0x%p", oldcon, oldpath);
+ +                      /* -EXDEV stays in place unless the rename itself is attempted. */
+ +                      retCode = -EXDEV;
+ +                      if (newcon && oldcon && ((int)(newcon - newpath) == (int)(oldcon - oldpath))) {
+ +                              newcon = strchr(newcon + 1, '\\');
+ +                              oldcon = strchr(oldcon + 1, '\\');
+ +                              DbgPrint("2; newcon=0x%p newpath=0x%p", newcon, newpath);
+ +                              DbgPrint("2; oldcon=0x%p oldpath=0x%p", oldcon, oldpath);
+ +                              if (newcon && oldcon && ((int)(newcon - newpath) == (int)(oldcon - oldpath))) {
+ +                                      /* Compare the equally long leading parts of both paths. */
+ +                                      oldname.name = oldpath;
+ +                                      oldname.len = (int)(oldcon - oldpath);
+ +                                      oldname.hash = 0;
+ +                                      if (!novfs_d_strcmp(newpath,
+ +                                                  newcon - newpath,
+ +                                                  &oldname)) {
+ +
+ +                                              if (od->d_inode && od->d_inode->i_private) {
+ +
+ +                                                      /*
+ +                                                       * An existing target is deleted first.  If that
+ +                                                       * fails its mode is set to S_IRWXU and the delete
+ +                                                       * is retried.  The outcome is not acted upon:
+ +                                                       * retCode is overwritten by the rename below.
+ +                                                       * The session comes from ndir's private data,
+ +                                                       * although the guard tests nd's inode.
+ +                                                       */
+ +                                                      if (nd->d_inode && nd->d_inode->i_private) {
+ +                                                              session =
+ +                                                                  novfs_scope_get_sessionId
+ +                                                                  (((struct inode_data *)ndir->i_private)->Scope);
+ +                                                              if (0 == SC_PRESENT(session)) {
+ +                                                                      ((struct inode_data *)ndir->i_private)->Scope =
+ +                                                                          novfs_get_scope(nd);
+ +                                                                      session =
+ +                                                                          novfs_scope_get_sessionId(((struct inode_data *)ndir->
+ +                                                                                                     i_private)->Scope);
+ +                                                              }
+ +
+ +                                                              retCode =
+ +                                                                  novfs_delete(newpath, S_ISDIR(nd->d_inode->i_mode), session);
+ +                                                              if (retCode) {
+ +                                                                      struct iattr ia;
+ +                                                                      memset(&ia, 0, sizeof(ia));
+ +                                                                      ia.ia_valid = ATTR_MODE;
+ +                                                                      ia.ia_mode = S_IRWXU;
+ +                                                                      novfs_set_attr(newpath, &ia, session);
+ +                                                                      retCode =
+ +                                                                          novfs_delete(newpath, S_ISDIR(nd->d_inode->i_mode),
+ +                                                                                       session);
+ +                                                              }
+ +
+ +                                                      }
+ +
+ +                                                      /*
+ +                                                       * NOTE(review): ndir->i_private is dereferenced
+ +                                                       * without a NULL check on this path.
+ +                                                       */
+ +                                                      session =
+ +                                                          novfs_scope_get_sessionId(((struct inode_data *)ndir->i_private)->
+ +                                                                                    Scope);
+ +                                                      if (0 == SC_PRESENT(session)) {
+ +                                                              ((struct inode_data *)ndir->i_private)->Scope = novfs_get_scope(nd);
+ +                                                              session =
+ +                                                                  novfs_scope_get_sessionId(((struct inode_data *)ndir->
+ +                                                                                             i_private)->Scope);
+ +                                                      }
+ +                                                      retCode =
+ +                                                          novfs_rename_file(S_ISDIR(od->d_inode->i_mode), oldpath, oldlen - 1,
+ +                                                                            newpath, newlen - 1, session);
+ +
+ +                                                      if (!retCode) {
+ +                                                              /*
+ +                                                               * The old-path half of the buffer is reused as
+ +                                                               * storage for the entry info; newpath lives in
+ +                                                               * the other half.
+ +                                                               * NOTE(review): assumes the entry info fits in
+ +                                                               * PATH_LENGTH_BUFFER bytes -- confirm.  The
+ +                                                               * result of novfs_get_file_info() is ignored.
+ +                                                               */
+ +                                                              info = (struct novfs_entry_info *)oldbuf;
+ +                                                              od->d_time = 0;
+ +                                                              novfs_remove_inode_entry(odir, &od->d_name, 0);
+ +                                                              novfs_remove_inode_entry(ndir, &nd->d_name, 0);
+ +                                                              novfs_get_file_info(newpath, info, session);
+ +                                                              nd->d_time = jiffies + (novfs_update_timeout * HZ);
+ +
+ +                                                              /* Keep the old inode number when there is one. */
+ +                                                              if (od->d_inode && od->d_inode->i_ino) {
+ +                                                                      ino = od->d_inode->i_ino;
+ +                                                              } else {
+ +                                                                      ino = (ino_t) atomic_inc_return(&novfs_Inode_Number);
+ +                                                              }
+ +                                                              novfs_add_inode_entry(ndir, &nd->d_name, ino, info);
+ +                                                      }
+ +                                              }
+ +                                      }
+ +                              }
+ +                      }
+ +              }
+ +      }
+ +
+ +      if (oldbuf)
+ +              kfree(oldbuf);
+ +
+ +      DbgPrint("return %d", retCode);
+ +      return (retCode);
+ +}
+ +
+ +/*
+ + * Apply the attribute changes described by @attr to the object behind
+ + * @dentry.
+ + *
+ + * The request is traced, forwarded with novfs_set_attr() when any ia_valid
+ + * bit is set, and on success the local inode is flagged UPDATE_INODE and
+ + * its times and mode are updated from @attr.
+ + *
+ + * Returns -EACCES for the root, user, server and volume levels, the result
+ + * of novfs_set_attr() when it was called, and 0 in every other case.
+ + */
+ +int novfs_i_setattr(struct dentry *dentry, struct iattr *attr)
+ +{
+ +      struct inode *inode = dentry->d_inode;
+ +      unsigned int ia_valid = attr->ia_valid;
+ +      struct novfs_schandle session;
+ +      struct inode_data *id;
+ +      char atime_buf[32];
+ +      char mtime_buf[32];
+ +      char ctime_buf[32];
+ +      char *buf, *path;
+ +      int retVal = 0;
+ +
+ +      if (IS_ROOT(dentry) ||  /* Root */
+ +          IS_ROOT(dentry->d_parent) ||        /* User */
+ +          IS_ROOT(dentry->d_parent->d_parent) ||      /* Server */
+ +          IS_ROOT(dentry->d_parent->d_parent->d_parent)) {    /* Volume */
+ +              return -EACCES;
+ +      }
+ +
+ +      id = inode ? inode->i_private : NULL;
+ +      if (!id)
+ +              goto out;
+ +
+ +      /* Re-resolve the scope when the cached one carries no session. */
+ +      session = novfs_scope_get_sessionId(id->Scope);
+ +      if (0 == SC_PRESENT(session)) {
+ +              id->Scope = novfs_get_scope(dentry);
+ +              session = novfs_scope_get_sessionId(id->Scope);
+ +      }
+ +
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!buf)
+ +              goto out;
+ +
+ +      path = novfs_dget_path(dentry, buf, PATH_LENGTH_BUFFER);
+ +      if (!path)
+ +              goto out_free;
+ +
+ +      /* Timestamps that are not part of the request are traced as such. */
+ +      strcpy(atime_buf, "Unspecified");
+ +      strcpy(mtime_buf, "Unspecified");
+ +      strcpy(ctime_buf, "Unspecified");
+ +      if (attr->ia_valid & ATTR_ATIME)
+ +              ctime_r(&attr->ia_atime.tv_sec, atime_buf);
+ +      if (attr->ia_valid & ATTR_MTIME)
+ +              ctime_r(&attr->ia_mtime.tv_sec, mtime_buf);
+ +      if (attr->ia_valid & ATTR_CTIME)
+ +              ctime_r(&attr->ia_ctime.tv_sec, ctime_buf);
+ +
+ +      /* Removed for Bug 132374. jlt */
+ +      __DbgPrint("%s: %s\n"
+ +                 "   ia_valid:      0x%x\n"
+ +                 "   ia_mode:       0%o\n"
+ +                 "   ia_uid:        %d\n"
+ +                 "   ia_gid:        %d\n"
+ +                 "   ia_size:       %lld\n"
+ +                 "   ia_atime:      %s\n"
+ +                 "   ia_mtime:      %s\n"
+ +                 "   ia_ctime:      %s\n", __func__,
+ +                 path,
+ +                 attr->ia_valid,
+ +                 attr->ia_mode,
+ +                 attr->ia_uid, attr->ia_gid, attr->ia_size, atime_buf, mtime_buf, ctime_buf);
+ +
+ +      /* Nothing is sent when the caller asked for no change at all. */
+ +      if (!ia_valid)
+ +              goto out_free;
+ +
+ +      retVal = novfs_set_attr(path, attr, session);
+ +      if (retVal)
+ +              goto out_free;
+ +
+ +      id->Flags |= UPDATE_INODE;
+ +
+ +      if (ia_valid & ATTR_ATIME)
+ +              inode->i_atime = attr->ia_atime;
+ +      if (ia_valid & ATTR_MTIME)
+ +              inode->i_mtime = attr->ia_mtime;
+ +      if (ia_valid & ATTR_CTIME)
+ +              inode->i_ctime = attr->ia_ctime;
+ +      if (ia_valid & ATTR_MODE)
+ +              inode->i_mode = attr->ia_mode & (S_IFMT | S_IRWXU);
+ +
+ +out_free:
+ +      kfree(buf);
+ +out:
+ +      DbgPrint("return 0x%x", retVal);
+ +
+ +      return retVal;
+ +}
+ +
+ +/*
+ + * Fill @kstat from the inode behind @dentry.
+ + *
+ + * Below the root and user levels, an inode flagged UPDATE_INODE is first
+ + * refreshed with novfs_get_file_info(); the flag is cleared only when that
+ + * call succeeds.  @kstat is filled from the inode in every case and the
+ + * result is traced.
+ + *
+ + * Returns the result of novfs_get_file_info() when a refresh was attempted,
+ + * 0 otherwise.
+ + */
+ +int novfs_i_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *kstat)
+ +{
+ +      struct inode *inode = dentry->d_inode;
+ +      struct novfs_entry_info info;
+ +      struct novfs_schandle session;
+ +      struct inode_data *id;
+ +      char atime_buf[32];
+ +      char mtime_buf[32];
+ +      char ctime_buf[32];
+ +      char *buf, *path;
+ +      int retCode = 0;
+ +
+ +      if (IS_ROOT(dentry) || IS_ROOT(dentry->d_parent))
+ +              goto fill;
+ +
+ +      SC_INITIALIZE(session);
+ +      id = inode->i_private;
+ +      if (!id || !(id->Flags & UPDATE_INODE))
+ +              goto fill;
+ +
+ +      /* Re-resolve the scope when the cached one carries no session. */
+ +      session = novfs_scope_get_sessionId(id->Scope);
+ +      if (0 == SC_PRESENT(session)) {
+ +              id->Scope = novfs_get_scope(dentry);
+ +              session = novfs_scope_get_sessionId(id->Scope);
+ +      }
+ +
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!buf)
+ +              goto fill;
+ +
+ +      path = novfs_dget_path(dentry, buf, PATH_LENGTH_BUFFER);
+ +      if (path) {
+ +              retCode = novfs_get_file_info(path, &info, session);
+ +              if (!retCode) {
+ +                      update_inode(inode, &info);
+ +                      id->Flags &= ~UPDATE_INODE;
+ +              }
+ +      }
+ +      kfree(buf);
+ +
+ +fill:
+ +      kstat->ino = inode->i_ino;
+ +      kstat->dev = inode->i_sb->s_dev;
+ +      kstat->rdev = inode->i_rdev;
+ +      kstat->mode = inode->i_mode;
+ +      kstat->nlink = inode->i_nlink;
+ +      kstat->uid = inode->i_uid;
+ +      kstat->gid = inode->i_gid;
+ +      kstat->size = i_size_read(inode);
+ +      kstat->atime = inode->i_atime;
+ +      kstat->mtime = inode->i_mtime;
+ +      kstat->ctime = inode->i_ctime;
+ +      kstat->blksize = inode->i_sb->s_blocksize;
+ +
+ +      /* A non-zero i_bytes is counted as one more block. */
+ +      kstat->blocks = inode->i_blocks;
+ +      if (inode->i_bytes)
+ +              kstat->blocks++;
+ +
+ +      ctime_r(&kstat->atime.tv_sec, atime_buf);
+ +      ctime_r(&kstat->mtime.tv_sec, mtime_buf);
+ +      ctime_r(&kstat->ctime.tv_sec, ctime_buf);
+ +
+ +      __DbgPrint("%s: 0x%x 0x%p <%.*s>\n"
+ +                 "   ino: %d\n"
+ +                 "   dev: 0x%x\n"
+ +                 "   mode: 0%o\n"
+ +                 "   nlink: 0x%x\n"
+ +                 "   uid: 0x%x\n"
+ +                 "   gid: 0x%x\n"
+ +                 "   rdev: 0x%x\n"
+ +                 "   size: 0x%llx\n"
+ +                 "   atime: %s\n"
+ +                 "   mtime: %s\n"
+ +                 "   ctime: %s\n"
+ +                 "   blksize: 0x%x\n"
+ +                 "   blocks: 0x%x\n", __func__,
+ +                 retCode, dentry, dentry->d_name.len, dentry->d_name.name,
+ +                 kstat->ino,
+ +                 kstat->dev,
+ +                 kstat->mode,
+ +                 kstat->nlink,
+ +                 kstat->uid,
+ +                 kstat->gid, kstat->rdev, kstat->size, atime_buf, mtime_buf, ctime_buf, kstat->blksize, kstat->blocks);
+ +      return retCode;
+ +}
+ +
+ +/*
+ + * Read the extended attribute @name of the object behind @dentry.
+ + *
+ + * The value is fetched into a temporary XA_BUFFER-sized buffer with
+ + * novfs_getx_file_info().  With @buffer_size == 0 only the length is
+ + * reported; otherwise the value is copied to @buffer when it fits.
+ + *
+ + * Returns the length reported by novfs_getx_file_info() on success,
+ + * -ERANGE when a non-zero @buffer_size is smaller than that length,
+ + * -ENOMEM when a temporary buffer cannot be allocated (this used to be
+ + * reported as a successful zero-length read), or the non-zero result of
+ + * novfs_getx_file_info().
+ + *
+ + * NOTE(review): when no path can be built, 0 is still returned, as before.
+ + * NOTE(review): the reported length is not checked against XA_BUFFER before
+ + * the copy; confirm that novfs_getx_file_info() bounds it.
+ + */
+ +ssize_t novfs_i_getxattr(struct dentry * dentry, const char *name, void *buffer, size_t buffer_size)
+ +{
+ +      struct inode *inode = dentry->d_inode;
+ +      struct novfs_schandle sessionId;
+ +      char *path, *buf, *bufRead;
+ +      ssize_t dataLen = 0;
+ +      int retxcode = 0;
+ +
+ +      SC_INITIALIZE(sessionId);
+ +
+ +      DbgPrint("Ian");
+ +      DbgPrint("dentry->d_name.len %u, dentry->d_name.name %s", dentry->d_name.len, dentry->d_name.name);
+ +      DbgPrint("name %s", name);
+ +      DbgPrint("size %u", buffer_size);
+ +
+ +      if (inode && inode->i_private) {
+ +              struct inode_data *id = inode->i_private;
+ +
+ +              /* Re-resolve the scope when the cached one carries no session. */
+ +              sessionId = novfs_scope_get_sessionId(id->Scope);
+ +              DbgPrint("SessionId = %u", sessionId);
+ +              if (0 == SC_PRESENT(sessionId)) {
+ +                      id->Scope = novfs_get_scope(dentry);
+ +                      sessionId = novfs_scope_get_sessionId(id->Scope);
+ +                      DbgPrint("SessionId = %u", sessionId);
+ +              }
+ +      }
+ +
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!buf)
+ +              return -ENOMEM;
+ +
+ +      path = novfs_dget_path(dentry, buf, PATH_LENGTH_BUFFER);
+ +      if (!path)
+ +              goto out_free_buf;
+ +
+ +      bufRead = kmalloc(XA_BUFFER, GFP_KERNEL);
+ +      if (!bufRead) {
+ +              retxcode = -ENOMEM;
+ +              goto out_free_buf;
+ +      }
+ +
+ +      retxcode = novfs_getx_file_info(path, name, bufRead, XA_BUFFER, &dataLen, sessionId);
+ +      DbgPrint("after novfs_GetX_File_Info retxcode = %d", retxcode);
+ +      if (!retxcode) {
+ +              novfs_dump(64, bufRead);
+ +              /* A zero-sized request only asks for the length. */
+ +              if (buffer_size != 0) {
+ +                      if (buffer_size >= dataLen) {
+ +                              memcpy(buffer, bufRead, dataLen);
+ +                      } else {
+ +                              DbgPrint("(!!!) not enough buffer_size. buffer_size = %d, dataLen = %d",
+ +                                       buffer_size, dataLen);
+ +                              retxcode = -ERANGE;
+ +                      }
+ +              }
+ +      }
+ +      kfree(bufRead);
+ +
+ +out_free_buf:
+ +      kfree(buf);
+ +
+ +      /*
+ +       * The too-small-buffer case has already been turned into -ERANGE
+ +       * above, so no second size check is needed here.
+ +       */
+ +      if (retxcode)
+ +              return retxcode;
+ +
+ +      return dataLen;
+ +}
+ +
+ +/*
+ + * Set the extended attribute @name of the object behind @dentry to the
+ + * @value_size bytes at @value, passing @flags through to
+ + * novfs_setx_file_info().
+ + *
+ + * Returns the result of novfs_setx_file_info(), or -ENOMEM when the path
+ + * buffer cannot be allocated (this used to be reported as success).
+ + *
+ + * NOTE(review): when no path can be built, 0 is still returned, as before.
+ + * NOTE(review): the previous code compared the written byte count with
+ + * @value_size, but that branch could never change the result; a short write
+ + * with a zero return code is therefore still reported as success.  Decide
+ + * whether it should become an error.
+ + */
+ +int novfs_i_setxattr(struct dentry *dentry, const char *name, const void *value, size_t value_size, int flags)
+ +{
+ +      struct inode *inode = dentry->d_inode;
+ +      struct novfs_schandle sessionId;
+ +      char *path, *buf;
+ +      unsigned long bytesWritten = 0;
+ +      int retxcode = 0;
+ +
+ +      SC_INITIALIZE(sessionId);
+ +
+ +      DbgPrint("Ian");
+ +      DbgPrint("dentry->d_name.len %u, dentry->d_name.name %s", dentry->d_name.len, dentry->d_name.name);
+ +      DbgPrint("name %s", name);
+ +      DbgPrint("value_size %u", value_size);
+ +      DbgPrint("flags %d", flags);
+ +
+ +      if (inode && inode->i_private) {
+ +              struct inode_data *id = inode->i_private;
+ +
+ +              /* Re-resolve the scope when the cached one carries no session. */
+ +              sessionId = novfs_scope_get_sessionId(id->Scope);
+ +              DbgPrint("SessionId = %u", sessionId);
+ +              if (0 == SC_PRESENT(sessionId)) {
+ +                      id->Scope = novfs_get_scope(dentry);
+ +                      sessionId = novfs_scope_get_sessionId(id->Scope);
+ +                      DbgPrint("SessionId = %u", sessionId);
+ +              }
+ +      }
+ +
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!buf)
+ +              return -ENOMEM;
+ +
+ +      path = novfs_dget_path(dentry, buf, PATH_LENGTH_BUFFER);
+ +      if (path) {
+ +              retxcode = novfs_setx_file_info(path, name, value, value_size, &bytesWritten, flags, sessionId);
+ +              if (!retxcode) {
+ +                      DbgPrint("bytesWritten = %u", bytesWritten);
+ +              }
+ +      }
+ +      kfree(buf);
+ +
+ +      return retxcode;
+ +}
+ +
+ +/*
+ + * List the extended attribute names of the object behind @dentry.
+ + *
+ + * The list is fetched into a temporary XA_BUFFER-sized buffer with
+ + * novfs_listx_file_info().  With @buffer_size == 0 only the length is
+ + * reported; otherwise the list is copied to @buffer when it fits.
+ + *
+ + * Returns the length reported by novfs_listx_file_info() on success and a
+ + * negative errno otherwise:
+ + *   -ERANGE  a non-zero @buffer_size is smaller than the list, matching
+ + *            novfs_i_getxattr() (previously -1)
+ + *   -ENOMEM  a temporary buffer cannot be allocated (previously reported
+ + *            as a successful empty list)
+ + *   other    the failure reported by novfs_listx_file_info() (previously
+ + *            collapsed to -1)
+ + *
+ + * The list buffer is only dumped and copied after a successful fetch.
+ + *
+ + * NOTE(review): when no path can be built, 0 is still returned, as before.
+ + * NOTE(review): the reported length is not checked against XA_BUFFER before
+ + * the copy; confirm that novfs_listx_file_info() bounds it.
+ + */
+ +ssize_t novfs_i_listxattr(struct dentry * dentry, char *buffer, size_t buffer_size)
+ +{
+ +      struct inode *inode = dentry->d_inode;
+ +      struct novfs_schandle sessionId;
+ +      char *path, *buf, *bufList;
+ +      ssize_t dataLen = 0;
+ +      int retxcode = 0;
+ +
+ +      SC_INITIALIZE(sessionId);
+ +
+ +      DbgPrint("Ian");
+ +      DbgPrint("dentry->d_name.len %u, dentry->d_name.name %s", dentry->d_name.len, dentry->d_name.name);
+ +      DbgPrint("size %u", buffer_size);
+ +
+ +      if (inode && inode->i_private) {
+ +              struct inode_data *id = inode->i_private;
+ +
+ +              /* Re-resolve the scope when the cached one carries no session. */
+ +              sessionId = novfs_scope_get_sessionId(id->Scope);
+ +              DbgPrint("SessionId = %u", sessionId);
+ +              if (0 == SC_PRESENT(sessionId)) {
+ +                      id->Scope = novfs_get_scope(dentry);
+ +                      sessionId = novfs_scope_get_sessionId(id->Scope);
+ +                      DbgPrint("SessionId = %u", sessionId);
+ +              }
+ +      }
+ +
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!buf)
+ +              return -ENOMEM;
+ +
+ +      path = novfs_dget_path(dentry, buf, PATH_LENGTH_BUFFER);
+ +      if (!path)
+ +              goto out_free_buf;
+ +
+ +      bufList = kmalloc(XA_BUFFER, GFP_KERNEL);
+ +      if (!bufList) {
+ +              retxcode = -ENOMEM;
+ +              goto out_free_buf;
+ +      }
+ +
+ +      retxcode = novfs_listx_file_info(path, bufList, XA_BUFFER, &dataLen, sessionId);
+ +      if (!retxcode) {
+ +              novfs_dump(64, bufList);
+ +              /* A zero-sized request only asks for the length. */
+ +              if (buffer_size != 0) {
+ +                      if (buffer_size >= dataLen) {
+ +                              memcpy(buffer, bufList, dataLen);
+ +                      } else {
+ +                              DbgPrint("(!!!) not enough buffer_size. buffer_size = %d, dataLen = %d",
+ +                                       buffer_size, dataLen);
+ +                              retxcode = -ERANGE;
+ +                      }
+ +              }
+ +      }
+ +      kfree(bufList);
+ +
+ +out_free_buf:
+ +      kfree(buf);
+ +
+ +      /*
+ +       * A positive value would be read by the caller as a list length, so a
+ +       * failure code that is not already a negative errno becomes -EIO.
+ +       */
+ +      if (retxcode)
+ +              return (retxcode < 0) ? retxcode : -EIO;
+ +
+ +      return dataLen;
+ +}
+ +
+ +/*
+ + * Revalidation hook: traces the dentry name and always returns 0.
+ + */
+ +int novfs_i_revalidate(struct dentry *dentry)
+ +{
+ +      DbgPrint("name %.*s", dentry->d_name.len, dentry->d_name.name);
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Read-inode hook: only emits a debug trace with the inode pointer and the
+ + * inode number; nothing is loaded into @inode here.
+ + */
+ +void novfs_read_inode(struct inode *inode)
+ +{
+ +      DbgPrint("0x%p %d", inode, inode->i_ino);
+ +}
+ +
+ +/*
+ + * Write-inode hook: only emits a debug trace with the inode pointer and the
+ + * inode number; nothing is written back here.
+ + */
+ +void novfs_write_inode(struct inode *inode)
+ +{
+ +      DbgPrint("Inode=0x%p Ino=%d", inode, inode->i_ino);
+ +}
+ +
+ +/*
+ + * Attribute-change notification hook: traces the dentry, its inode and the
+ + * requested ia_valid mask, changes nothing and always returns 0.
+ + */
+ +int novfs_notify_change(struct dentry *dentry, struct iattr *attr)
+ +{
+ +      struct inode *target = dentry->d_inode;
+ +
+ +      DbgPrint("Dentry=0x%p Name=%.*s Inode=0x%p Ino=%d ia_valid=0x%x",
+ +               dentry, dentry->d_name.len, dentry->d_name.name,
+ +               target, target->i_ino, attr->ia_valid);
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Tear down @inode when it is evicted.
+ + *
+ + * The page cache is truncated, writeback is ended and InodeCount is
+ + * decremented.  When the inode carries private data, its inode cache is
+ + * freed, the data is unlinked from the inode list under InodeList_lock and
+ + * released, and the inode is removed from the inode hash.
+ + */
+ +void novfs_evict_inode(struct inode *inode)
+ +{
+ +      struct inode_data *id;
+ +
+ +      truncate_inode_pages(&inode->i_data, 0);
+ +      end_writeback(inode);
+ +
+ +      InodeCount--;
+ +
+ +      id = inode->i_private;
+ +      if (!id) {
+ +              DbgPrint("inode=0x%p ino=%d", inode, inode->i_ino);
+ +              return;
+ +      }
+ +
+ +      DbgPrint("inode=0x%p ino=%d Scope=0x%p Name=%s", inode, inode->i_ino, id->Scope, id->Name);
+ +
+ +      novfs_free_inode_cache(inode);
+ +
+ +      mutex_lock(&InodeList_lock);
+ +      list_del(&id->IList);
+ +      mutex_unlock(&InodeList_lock);
+ +
+ +      /* Clear the pointer so the freed private data cannot be reached again. */
+ +      kfree(id);
+ +      inode->i_private = NULL;
+ +
+ +      remove_inode_hash(inode);
+ +}
+ +
+ +/*
+ + * Called when /proc/mounts is read.
+ + *
+ + * Nothing is written to @s.  The call is used to learn where the file
+ + * system is mounted: the path of the mount root is resolved with d_path()
+ + * and, when it differs from novfs_current_mnt, a copy replaces
+ + * novfs_current_mnt and is passed to novfs_daemon_set_mnt_point().
+ + *
+ + * Always returns 0; allocation and path resolution failures only skip the
+ + * update.
+ + */
+ +int novfs_show_options(struct seq_file *s, struct vfsmount *m)
+ +{
+ +      struct path my_path;
+ +      char *buf, *path, *tmp, *old_mnt;
+ +
+ +      buf = kmalloc(PATH_LENGTH_BUFFER, GFP_KERNEL);
+ +      if (!buf)
+ +              return 0;
+ +
+ +      my_path.mnt = m;
+ +      my_path.dentry = m->mnt_root;
+ +      path = d_path(&my_path, buf, PATH_LENGTH_BUFFER);
+ +
+ +      /*
+ +       * d_path() reports failure through ERR_PTR(), not NULL, so a plain
+ +       * NULL test would let an error pointer reach strcmp()/strcpy().
+ +       */
+ +      if (!path || IS_ERR(path))
+ +              goto out;
+ +
+ +      /* Nothing to do when the recorded mount point is already current. */
+ +      if (novfs_current_mnt && !strcmp(novfs_current_mnt, path))
+ +              goto out;
+ +
+ +      DbgPrint("%.*s %.*s %s",
+ +               m->mnt_root->d_name.len,
+ +               m->mnt_root->d_name.name,
+ +               m->mnt_mountpoint->d_name.len, m->mnt_mountpoint->d_name.name, path);
+ +
+ +      /* The copy is sized from the path's offset to the end of buf. */
+ +      tmp = kmalloc(PATH_LENGTH_BUFFER - (int)(path - buf), GFP_KERNEL);
+ +      if (tmp) {
+ +              strcpy(tmp, path);
+ +              old_mnt = novfs_current_mnt;
+ +              novfs_current_mnt = tmp;
+ +              novfs_daemon_set_mnt_point(novfs_current_mnt);
+ +
+ +              /* kfree() accepts NULL, so no guard is needed. */
+ +              kfree(old_mnt);
+ +      }
+ +
+ +out:
+ +      kfree(buf);
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Called when the statfs(2) system call is made.
+ + *
+ + * Fills *buf from the totals reported by novfs_scope_get_userspace():
+ + *   td - total size, fd - free size (both converted to blocks here),
+ + *   te - total file entries, fe - free file entries.
+ + * File counts are clamped to 0xffffffff.  Always returns 0.
+ + */
+ +int novfs_statfs(struct dentry *de, struct kstatfs *buf)
+ +{
+ +      uint64_t td, fd, te, fe;
+ +      struct super_block *sb = de->d_sb;
+ +
+ +      DbgPrint("");
+ +
+ +      td = fd = te = fe = 0;
+ +
+ +      novfs_scope_get_userspace(&td, &fd, &te, &fe);
+ +
+ +      DbgPrint("td=%llu", td);
+ +      DbgPrint("fd=%llu", fd);
+ +      DbgPrint("te=%llu", te);
+ +      /* Fixed: this line used to print fd under the "fe" label. */
+ +      DbgPrint("fe=%llu", fe);
+ +      /* fix for Nautilus */
+ +      if (sb->s_blocksize == 0)
+ +              sb->s_blocksize = 4096;
+ +
+ +      buf->f_type = sb->s_magic;
+ +      buf->f_bsize = sb->s_blocksize;
+ +      buf->f_namelen = NW_MAX_PATH_LENGTH;
+ +      /* Total is rounded up to whole blocks; free space is rounded down. */
+ +      buf->f_blocks = (sector_t) (td + (uint64_t) (sb->s_blocksize - 1)) >> (uint64_t) sb->s_blocksize_bits;
+ +      buf->f_bfree = (sector_t) fd >> (uint64_t) sb->s_blocksize_bits;
+ +      buf->f_bavail = (sector_t) buf->f_bfree;
+ +      buf->f_files = (sector_t) te;
+ +      buf->f_ffree = (sector_t) fe;
+ +      buf->f_frsize = sb->s_blocksize;
+ +      if (te > 0xffffffff)
+ +              buf->f_files = 0xffffffff;
+ +
+ +      if (fe > 0xffffffff)
+ +              buf->f_ffree = 0xffffffff;
+ +
+ +      DbgPrint("f_type:    0x%x", buf->f_type);
+ +      DbgPrint("f_bsize:   %u", buf->f_bsize);
+ +      DbgPrint("f_namelen: %d", buf->f_namelen);
+ +      DbgPrint("f_blocks:  %llu", buf->f_blocks);
+ +      DbgPrint("f_bfree:   %llu", buf->f_bfree);
+ +      DbgPrint("f_bavail:  %llu", buf->f_bavail);
+ +      DbgPrint("f_files:   %llu", buf->f_files);
+ +      DbgPrint("f_ffree:   %llu", buf->f_ffree);
+ +      DbgPrint("f_frsize:  %u", buf->f_frsize);
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Allocate and initialise a novfs inode on super block sb.
+ + *
+ + *   mode - file type and permission bits
+ + *   dev  - device number, used only for special files
+ + *   Uid  - owner of the new inode
+ + *   ino  - inode number to use; 0 takes the next value of
+ + *          novfs_Inode_Number
+ + *   name - entry name, copied into the private inode_data
+ + *
+ + * Returns the new inode, or NULL when new_inode() fails.  If the private
+ + * inode_data cannot be allocated the inode is still returned, with
+ + * i_private left NULL.
+ + */
+ +struct inode *novfs_get_inode(struct super_block *sb, int mode, int dev, uid_t Uid, ino_t ino, struct qstr *name)
+ +{
+ +      struct inode *inode = new_inode(sb);
+ +      struct inode_data *id;
+ +
+ +      /* Fixed: the trailing debug print used to dereference a NULL inode. */
+ +      if (!inode) {
+ +              DbgPrint("0x%p", inode);
+ +              return NULL;
+ +      }
+ +
+ +      InodeCount++;
+ +      inode->i_mode = mode;
+ +      inode->i_uid = Uid;
+ +      inode->i_gid = 0;
+ +      inode->i_blkbits = sb->s_blocksize_bits;
+ +      inode->i_blocks = 0;
+ +      inode->i_rdev = 0;
+ +      inode->i_ino = (ino) ? ino : (ino_t) atomic_inc_return(&novfs_Inode_Number);
+ +      if (novfs_page_cache) {
+ +              inode->i_mapping->a_ops = &novfs_aops;
+ +      } else {
+ +              inode->i_mapping->a_ops = &novfs_nocache_aops;
+ +      }
+ +      inode->i_mapping->backing_dev_info = &novfs_backing_dev_info;
+ +      inode->i_atime.tv_sec = 0;
+ +      inode->i_atime.tv_nsec = 0;
+ +      inode->i_mtime = inode->i_ctime = inode->i_atime;
+ +
+ +      DbgPrint("Inode=0x%p I_ino=%d len=%d", inode, inode->i_ino, name->len);
+ +
+ +      /*
+ +       * NOTE(review): the size assumes struct inode_data ends in a Name
+ +       * array with room for at least the terminating NUL - confirm
+ +       * against the struct definition.
+ +       */
+ +      id = kmalloc(sizeof(struct inode_data) + name->len, GFP_KERNEL);
+ +      inode->i_private = id;
+ +      if (id) {
+ +              DbgPrint("i_private 0x%p", id);
+ +
+ +              id->Scope = NULL;
+ +              id->Flags = 0;
+ +              id->Inode = inode;
+ +
+ +              id->cntDC = 1;
+ +
+ +              INIT_LIST_HEAD(&id->DirCache);
+ +              mutex_init(&id->DirCacheLock);
+ +
+ +              id->FileHandle = 0;
+ +              id->CacheFlag = 0;
+ +
+ +              mutex_lock(&InodeList_lock);
+ +              list_add_tail(&id->IList, &InodeList);
+ +              mutex_unlock(&InodeList_lock);
+ +
+ +              memcpy(id->Name, name->name, name->len);
+ +              id->Name[name->len] = '\0';
+ +
+ +              DbgPrint("name %s", id->Name);
+ +      }
+ +
+ +      insert_inode_hash(inode);
+ +
+ +      switch (mode & S_IFMT) {
+ +
+ +      case S_IFREG:
+ +              inode->i_op = &novfs_file_inode_operations;
+ +              inode->i_fop = &novfs_file_operations;
+ +              break;
+ +
+ +      case S_IFDIR:
+ +              inode->i_op = &novfs_inode_operations;
+ +              inode->i_fop = &novfs_dir_operations;
+ +              inode->i_blkbits = 0;
+ +              break;
+ +
+ +      default:
+ +              init_special_inode(inode, mode, dev);
+ +              break;
+ +      }
+ +
+ +      DbgPrint("size=%lld", inode->i_size);
+ +      DbgPrint("mode=0%o", inode->i_mode);
+ +      DbgPrint("i_sb->s_blocksize=%d", inode->i_sb->s_blocksize);
+ +      DbgPrint("i_blkbits=%d", inode->i_blkbits);
+ +      DbgPrint("i_blocks=%d", inode->i_blocks);
+ +      DbgPrint("i_bytes=%d", inode->i_bytes);
+ +
+ +      DbgPrint("0x%p %d", inode, inode->i_ino);
+ +      return (inode);
+ +}
+ +
+ +/*
+ + * Create one fixed directory named dirname directly under novfs_root
+ + * and record it in the root inode's directory cache.  Failures are not
+ + * reported: the directory is simply absent, as before.
+ + */
+ +static void novfs_add_root_dir(struct super_block *SB, const char *dirname)
+ +{
+ +      struct novfs_entry_info info;
+ +      struct qstr name;
+ +      struct inode *inode;
+ +      struct dentry *de;
+ +
+ +      name.name = dirname;
+ +      name.len = strlen(dirname);
+ +      name.hash = novfs_internal_hash(&name);
+ +
+ +      inode = novfs_get_inode(SB, S_IFDIR | 01777, 0, 0, 0, &name);
+ +      if (!inode)
+ +              return;
+ +
+ +      info.mode = inode->i_mode;
+ +      info.namelength = 0;
+ +      inode->i_size = info.size = 0;
+ +      inode->i_uid = info.uid = 0;
+ +      inode->i_gid = info.gid = 0;
+ +      inode->i_atime = info.atime = inode->i_ctime = info.ctime = inode->i_mtime = info.mtime = CURRENT_TIME;
+ +
+ +      de = d_alloc(novfs_root, &name);
+ +      if (!de) {
+ +              /* Fixed: the inode used to be leaked on this path. */
+ +              iput(inode);
+ +              return;
+ +      }
+ +
+ +      de->d_op = &novfs_dentry_operations;
+ +      de->d_time = 0xffffffff;
+ +      d_add(de, inode);
+ +      DbgPrint("d_add %s 0x%p", dirname, de);
+ +      novfs_add_inode_entry(novfs_root->d_inode, &name, inode->i_ino, &info);
+ +}
+ +
+ +/*
+ + * Fill in a freshly allocated super block: set block size, operations
+ + * and flags, create the root dentry, then populate the root with the
+ + * SERVER_DIRECTORY_NAME and TREE_DIRECTORY_NAME directories.
+ + *
+ + * Data and Silent are not used.
+ + *
+ + * Returns 0 on success or -ENOMEM when the root inode or root dentry
+ + * cannot be allocated.
+ + */
+ +int novfs_fill_super(struct super_block *SB, void *Data, int Silent)
+ +{
+ +      struct inode *inode;
+ +      struct qstr name;
+ +
+ +      SB->s_blocksize = PAGE_CACHE_SIZE;
+ +      SB->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ +      SB->s_maxbytes = MAX_LFS_FILESIZE;      /* Max file size */
+ +      SB->s_op = &novfs_ops;
+ +      SB->s_flags |= (MS_NODIRATIME | MS_NODEV | MS_POSIXACL);
+ +      SB->s_magic = NOVFS_MAGIC;
+ +
+ +      name.len = 1;
+ +      name.name = "/";
+ +
+ +      inode = novfs_get_inode(SB, S_IFDIR | 01777, 0, 0, 0, &name);
+ +      if (!inode) {
+ +              return (-ENOMEM);
+ +      }
+ +
+ +      novfs_root = d_alloc_root(inode);
+ +
+ +      if (!novfs_root) {
+ +              iput(inode);
+ +              return (-ENOMEM);
+ +      }
+ +      novfs_root->d_time = jiffies + (novfs_update_timeout * HZ);
+ +
+ +      inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ +
+ +      SB->s_root = novfs_root;
+ +
+ +      DbgPrint("root 0x%p", novfs_root);
+ +
+ +      /* novfs_root is known to be non-NULL from here on. */
+ +      novfs_root->d_op = &novfs_dentry_operations;
+ +
+ +      novfs_add_root_dir(SB, SERVER_DIRECTORY_NAME);
+ +      novfs_add_root_dir(SB, TREE_DIRECTORY_NAME);
+ +
+ +      return (0);
+ +}
+ +
+ +/*
+ + * mount callback for novfs_fs_type.  novfs has no backing block device,
+ + * so the super block is obtained through mount_nodev() and filled in by
+ + * novfs_fill_super().  dev_name is only logged.
+ + */
+ +static struct dentry *novfs_mount(struct file_system_type *fs_type, int flags,
+ +                     const char *dev_name, void *data)
+ +{
+ +      /* Fixed: fs_type is a pointer and must be printed with %p, not %x. */
+ +      DbgPrint("Fstype=0x%p Dev_name=%s", fs_type, dev_name);
+ +      return mount_nodev(fs_type, flags, data, novfs_fill_super);
+ +}
+ +
+ +/*
+ + * kill_sb callback for novfs_fs_type: shrinks the dentry cache of the
+ + * super block, then hands the super block to kill_litter_super().
+ + */
+ +static void novfs_kill_sb(struct super_block *super)
+ +{
+ +      shrink_dcache_sb(super);
+ +      kill_litter_super(super);
+ +}
+ +
++/*
++ * Fallback for builds where kernel_locked() is not already defined:
++ * evaluates to true when the current task's lock_depth is >= 0.
++ * It is only used by the debug prints below.  This should be removed.
++ */
++#ifndef kernel_locked
++#define kernel_locked() (current->lock_depth >= 0)
++#endif
++
++/*
++ * Read handler stub: logs the kernel_locked() state and reports that
++ * zero bytes were read.  None of the arguments are used.
++ */
++ssize_t novfs_Control_read(struct file *file, char *buf, size_t nbytes, loff_t * ppos)
++{
++      DbgPrint("kernel_locked 0x%x", kernel_locked());
++
++      return 0;
++}
++
++/*
++ * Write handler stub: logs the kernel_locked() state and reports that
++ * zero bytes were written.  The supplied data is ignored.
++ */
++ssize_t novfs_Control_write(struct file * file, const char *buf, size_t nbytes, loff_t * ppos)
++{
++      DbgPrint("kernel_locked 0x%x", kernel_locked());
++
++      return 0;
++}
++
+ +/*
+ + * File system type descriptor for "novfs"; registered in init_novfs()
+ + * and unregistered in exit_novfs().
+ + */
+ +static struct file_system_type novfs_fs_type = {
+ +      .name = "novfs",
+ +      .mount = novfs_mount,
+ +      .kill_sb = novfs_kill_sb,
+ +      .owner = THIS_MODULE,
+ +};
+ +
+ +/*
+ + * Module initialisation: resets the lastDir/inHAX state, sets up the
+ + * backing-dev-info, the proc and profile interfaces, the daemon queue
+ + * and the scope subsystem, then registers the file system.
+ + *
+ + * Returns 0 on success or the first error code encountered.  On failure
+ + * the backing-dev-info is destroyed again so it is not left registered.
+ + */
+ +int __init init_novfs(void)
+ +{
+ +      int retCode;
+ +
+ +      lastDir[0] = 0;
+ +      lastTime = get_nanosecond_time();
+ +
+ +      inHAX = 0;
+ +      inHAXTime = get_nanosecond_time();
+ +
+ +      retCode = bdi_init(&novfs_backing_dev_info);
+ +
+ +      if (!retCode)
+ +              retCode = bdi_register(&novfs_backing_dev_info, NULL, "novfs-map");
+ +      if (retCode)
+ +              goto bdi_fail;
+ +
+ +      retCode = novfs_proc_init();
+ +
+ +      novfs_profile_init();
+ +
+ +      /*
+ +       * NOTE(review): novfs_profile_init() runs even when proc setup
+ +       * failed, as before; whether it needs undoing on this path cannot
+ +       * be determined from here.
+ +       */
+ +      if (retCode)
+ +              goto bdi_fail;
+ +
+ +      DbgPrint("%s %s %s", __DATE__, __TIME__, NOVFS_VERSION_STRING);
+ +      novfs_daemon_queue_init();
+ +      novfs_scope_init();
+ +      retCode = register_filesystem(&novfs_fs_type);
+ +      if (retCode) {
+ +              /* Unwind in the same order exit_novfs() uses. */
+ +              novfs_scope_exit();
+ +              novfs_daemon_queue_exit();
+ +              novfs_profile_exit();
+ +              novfs_proc_exit();
+ +              goto bdi_fail;
+ +      }
+ +
+ +      return (0);
+ +
+ +bdi_fail:
+ +      /* Fixed: later failures used to leave the bdi registered. */
+ +      bdi_destroy(&novfs_backing_dev_info);
+ +      return (retCode);
+ +}
+ +
+ +/*
+ + * Module teardown: shuts down the scope, daemon queue, profile and proc
+ + * interfaces, unregisters the file system, frees the remembered mount
+ + * point string and destroys the backing-dev-info.
+ + */
+ +void __exit exit_novfs(void)
+ +{
+ +      novfs_scope_exit();
+ +      novfs_daemon_queue_exit();
+ +      novfs_profile_exit();
+ +      novfs_proc_exit();
+ +      unregister_filesystem(&novfs_fs_type);
+ +
+ +      /* kfree() accepts NULL, so no guard is required. */
+ +      kfree(novfs_current_mnt);
+ +      novfs_current_mnt = NULL;
+ +
+ +      bdi_destroy(&novfs_backing_dev_info);
+ +}
+ +
+ +/*
+ + * Take the directory cache lock of inode i.
+ + *
+ + * Returns 1 when the lock was taken, 0 when i is NULL, has no private
+ + * data or its DirCache list head is not initialised.
+ + */
+ +int novfs_lock_inode_cache(struct inode *i)
+ +{
+ +      struct inode_data *priv;
+ +      int locked = 0;
+ +
+ +      DbgPrint("0x%p", i);
+ +
+ +      priv = i ? i->i_private : NULL;
+ +      if (priv && priv->DirCache.next) {
+ +              mutex_lock(&priv->DirCacheLock);
+ +              locked = 1;
+ +      }
+ +
+ +      DbgPrint("return %d", locked);
+ +      return locked;
+ +}
+ +
+ +/*
+ + * Release the directory cache lock of inode i.  Does nothing under the
+ + * same conditions for which novfs_lock_inode_cache() returns 0.
+ + */
+ +void novfs_unlock_inode_cache(struct inode *i)
+ +{
+ +      struct inode_data *priv;
+ +
+ +      if (!i)
+ +              return;
+ +
+ +      priv = i->i_private;
+ +      if (!priv || !priv->DirCache.next)
+ +              return;
+ +
+ +      mutex_unlock(&priv->DirCacheLock);
+ +}
+ +
+ +/*
+ + *  Arguments:   struct inode *i - pointer to directory inode
+ + *               struct list_head **iteration - cursor; NULL or a pointer
+ + *                      to NULL starts at the first entry.  On return it
+ + *                      holds the position for the next call, or NULL
+ + *                      once the end of the list has been reached.
+ + *               ino_t *ino - receives the inode number of the entry
+ + *               struct novfs_entry_info *info - receives the entry data
+ + *
+ + *  Returns:     0 - item found
+ + *              -1 - done
+ + *
+ + *  Abstract:    Returns the directory cache entry at the cursor and
+ + *               advances the cursor.
+ + *
+ + *  Notes:       DirCacheLock should be held before calling this routine.
+ + *========================================================================*/
+ +int novfs_enumerate_inode_cache(struct inode *i, struct list_head **iteration, ino_t * ino, struct novfs_entry_info *info)
+ +{
+ +      struct inode_data *id;
+ +      struct novfs_dir_cache *dc;
+ +      struct list_head *l = NULL;
+ +      int retVal = -1;
+ +
+ +      if (i && (id = i->i_private) && id->DirCache.next) {
+ +              if ((NULL == iteration) || (NULL == *iteration)) {
+ +                      l = id->DirCache.next;
+ +              } else {
+ +                      l = *iteration;
+ +              }
+ +
+ +              if (l == &id->DirCache) {
+ +                      /* Back at the list head: enumeration is complete. */
+ +                      l = NULL;
+ +              } else {
+ +                      dc = list_entry(l, struct novfs_dir_cache, list);
+ +
+ +                      *ino = dc->ino;
+ +                      info->type = 0;
+ +                      info->mode = dc->mode;
+ +                      info->size = dc->size;
+ +                      info->atime = dc->atime;
+ +                      info->mtime = dc->mtime;
+ +                      info->ctime = dc->ctime;
+ +                      info->namelength = dc->nameLen;
+ +                      memcpy(info->name, dc->name, dc->nameLen);
+ +                      info->name[dc->nameLen] = '\0';
+ +                      retVal = 0;
+ +
+ +                      l = l->next;
+ +              }
+ +      }
+ +
+ +      /* Fixed: a NULL cursor is accepted above, so do not write through it. */
+ +      if (iteration)
+ +              *iteration = l;
+ +      return (retVal);
+ +}
+ +
+ +/*
+ + * Look up a directory cache entry of inode i by name (or by *ino when
+ + * name is NULL), mark it valid and copy its data into *ino and *info.
+ + *
+ + * Returns 0 when an entry was found, -1 otherwise.
+ + *
+ + * DirCacheLock should be held before calling this routine.
+ + */
+ +int novfs_get_entry(struct inode *i, struct qstr *name, ino_t * ino, struct novfs_entry_info *info)
+ +{
+ +      struct inode_data *priv;
+ +      struct novfs_dir_cache *entry;
+ +      char *dbg_name = "<NULL>";
+ +      int dbg_len = 6;
+ +      int rc = -1;
+ +
+ +      if (!i || !(priv = i->i_private) || !priv->DirCache.next)
+ +              goto out;
+ +
+ +      /* The name/length pair is only used for the debug print below. */
+ +      if (name && name->len) {
+ +              dbg_name = (char *)name->name;
+ +              dbg_len = name->len;
+ +      }
+ +
+ +      entry = novfs_lookup_inode_cache(i, name, *ino);
+ +      if (entry) {
+ +              entry->flags |= ENTRY_VALID;
+ +              *ino = entry->ino;
+ +              info->type = 0;
+ +              info->mode = entry->mode;
+ +              info->size = entry->size;
+ +              info->atime = entry->atime;
+ +              info->mtime = entry->mtime;
+ +              info->ctime = entry->ctime;
+ +              info->namelength = entry->nameLen;
+ +              memcpy(info->name, entry->name, entry->nameLen);
+ +              info->name[entry->nameLen] = '\0';
+ +              rc = 0;
+ +      }
+ +
+ +      DbgPrint("inode: 0x%p; name: %.*s; ino: %d\n", i, dbg_len, dbg_name, *ino);
+ +
+ +out:
+ +      DbgPrint("return %d", rc);
+ +      return rc;
+ +}
+ +
+ +/*
+ + * Fetch the directory cache entry at position pos, where the first
+ + * cached entry corresponds to pos == 2.
+ + *
+ + * Returns 0 with *ino and *info filled in, or -1 when the list is
+ + * shorter than the requested position.
+ + *
+ + * DirCacheLock should be held before calling this routine.
+ + */
+ +int novfs_get_entry_by_pos(struct inode *i, loff_t pos, ino_t * ino, struct novfs_entry_info *info)
+ +{
+ +      struct list_head *cursor = NULL;
+ +      loff_t wanted = pos - 2;
+ +      loff_t idx;
+ +
+ +      for (idx = 0; !novfs_enumerate_inode_cache(i, &cursor, ino, info); idx++) {
+ +              DbgPrint("info->name = %s", info->name);
+ +              if (idx == wanted)
+ +                      return 0;
+ +      }
+ +
+ +      return -1;
+ +}
+ +
+ +/*
+ + * Like novfs_get_entry() but does not mark the entry valid, and also
+ + * reports the jiffies value stored in the entry through *EntryTime
+ + * when EntryTime is not NULL.
+ + *
+ + * Returns 0 when an entry was found, -1 otherwise.
+ + *
+ + * DirCacheLock should be held before calling this routine.
+ + */
+ +int novfs_get_entry_time(struct inode *i, struct qstr *name, ino_t * ino, struct novfs_entry_info *info, u64 * EntryTime)
+ +{
+ +      struct inode_data *priv;
+ +      struct novfs_dir_cache *entry;
+ +      char *dbg_name = "<NULL>";
+ +      int dbg_len = 6;
+ +      int rc = -1;
+ +
+ +      if (!i || !(priv = i->i_private) || !priv->DirCache.next)
+ +              goto out;
+ +
+ +      /* The name/length pair is only used for the debug print below. */
+ +      if (name && name->len) {
+ +              dbg_name = (char *)name->name;
+ +              dbg_len = name->len;
+ +      }
+ +      DbgPrint("inode: 0x%p; name:  %.*s; ino: %d", i, dbg_len, dbg_name, *ino);
+ +
+ +      entry = novfs_lookup_inode_cache(i, name, *ino);
+ +      if (!entry)
+ +              goto out;
+ +
+ +      *ino = entry->ino;
+ +      info->type = 0;
+ +      info->mode = entry->mode;
+ +      info->size = entry->size;
+ +      info->atime = entry->atime;
+ +      info->mtime = entry->mtime;
+ +      info->ctime = entry->ctime;
+ +      info->namelength = entry->nameLen;
+ +      memcpy(info->name, entry->name, entry->nameLen);
+ +      info->name[entry->nameLen] = '\0';
+ +      if (EntryTime)
+ +              *EntryTime = entry->jiffies;
+ +      rc = 0;
+ +
+ +out:
+ +      DbgPrint("return %d", rc);
+ +      return rc;
+ +}
+ +
+ +/*
+ + *  Abstract:    Copies the first entry of the directory cache into
+ + *               *ino and *info, then unlinks and frees that entry.
+ + *
+ + *  Returns:     0 - an entry was returned and removed
+ + *              -1 - the cache is empty or unavailable
+ + *
+ + *  Notes:       DirCacheLock should be held before calling this routine.
+ + *
+ + */
+ +int novfs_get_remove_entry(struct inode *i, ino_t * ino, struct novfs_entry_info *info)
+ +{
+ +      struct inode_data *priv;
+ +      struct novfs_dir_cache *first;
+ +
+ +      if (!i || !(priv = i->i_private) || !priv->DirCache.next)
+ +              return -1;
+ +
+ +      /* A head that points back at itself means the list is empty. */
+ +      if (priv->DirCache.next == &priv->DirCache)
+ +              return -1;
+ +
+ +      first = list_entry(priv->DirCache.next, struct novfs_dir_cache, list);
+ +
+ +      *ino = first->ino;
+ +      info->type = 0;
+ +      info->mode = first->mode;
+ +      info->size = first->size;
+ +      info->atime = first->atime;
+ +      info->mtime = first->mtime;
+ +      info->ctime = first->ctime;
+ +      info->namelength = first->nameLen;
+ +      memcpy(info->name, first->name, first->nameLen);
+ +      info->name[first->nameLen] = '\0';
+ +
+ +      list_del(&first->list);
+ +      kfree(first);
+ +      DCCount--;
+ +
+ +      priv->cntDC--;
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + *  Abstract:    Clears ENTRY_VALID on every entry in the directory
+ + *               cache of inode i.
+ + *
+ + *  Notes:       DirCacheLock should be held before calling this routine.
+ + *
+ + *========================================================================*/
+ +void novfs_invalidate_inode_cache(struct inode *i)
+ +{
+ +      struct inode_data *priv;
+ +      struct novfs_dir_cache *entry;
+ +
+ +      if (!i || !(priv = i->i_private) || !priv->DirCache.next)
+ +              return;
+ +
+ +      list_for_each_entry(entry, &priv->DirCache, list)
+ +              entry->flags &= ~ENTRY_VALID;
+ +}
+ +
+ +/*
+ + *  Returns:     struct novfs_dir_cache entry if match
+ + *               NULL - if there is no match.
+ + *
+ + *  Abstract:    Searches the directory cache of inode i for an entry
+ + *               matching name or ino.  If name is specified then ino
+ + *               is not used; ino is used only when name is NULL.  A
+ + *               name matches when hash, length and bytes all agree.
+ + *
+ + *  Notes:       DirCacheLock should be held before calling this routine.
+ + *
+ + *========================================================================*/
+ +struct novfs_dir_cache *novfs_lookup_inode_cache(struct inode *i, struct qstr *name, ino_t ino)
+ +{
+ +      struct inode_data *priv;
+ +      struct novfs_dir_cache *entry, *found = NULL;
+ +      char *dbg_name = "<NULL>";
+ +      int dbg_len = 6;
+ +      int dbg_hash = 0;
+ +
+ +      if (!i || !(priv = i->i_private) || !priv->DirCache.next)
+ +              goto out;
+ +
+ +      /* These three values only feed the debug print. */
+ +      if (name && name->name) {
+ +              dbg_len = name->len;
+ +              dbg_name = (char *)name->name;
+ +              dbg_hash = name->hash;
+ +      }
+ +      DbgPrint("inode: 0x%p; name:  %.*s; hash:  0x%x;\n" "   len:   %d; ino:   %d", i, dbg_len, dbg_name, dbg_hash, dbg_len, ino);
+ +
+ +      list_for_each_entry(entry, &priv->DirCache, list) {
+ +              int match;
+ +
+ +              if (name)
+ +                      match = (name->hash == entry->hash) &&
+ +                              (name->len == entry->nameLen) &&
+ +                              (0 == memcmp(name->name, entry->name, name->len));
+ +              else
+ +                      match = (ino == entry->ino);
+ +
+ +              if (match) {
+ +                      found = entry;
+ +                      break;
+ +              }
+ +      }
+ +
+ +out:
+ +      DbgPrint("return 0x%p", found);
+ +      return found;
+ +}
+ +
+ +/*
+ + * Checks an inode directory to see if there are any entries matching
+ + * name or ino.  If an entry is found its valid bit is set.
+ + *
+ + * Returns 0 when an entry was found, -1 otherwise.
+ + *
+ + * DirCacheLock should be held before calling this routine.
+ + */
+ +int novfs_lookup_validate(struct inode *i, struct qstr *name, ino_t ino)
+ +{
+ +      struct inode_data *priv;
+ +      struct novfs_dir_cache *entry;
+ +      char *dbg_name = "<NULL>";
+ +      int dbg_len = 6;
+ +
+ +      if (!i || !(priv = i->i_private) || !priv->DirCache.next)
+ +              return -1;
+ +
+ +      /* The name/length pair is only used for the debug print below. */
+ +      if (name && name->len) {
+ +              dbg_name = (char *)name->name;
+ +              dbg_len = name->len;
+ +      }
+ +      DbgPrint("inode: 0x%p; name:  %.*s; ino:   %d", i, dbg_len, dbg_name, ino);
+ +
+ +      entry = novfs_lookup_inode_cache(i, name, ino);
+ +      if (!entry)
+ +              return -1;
+ +
+ +      entry->flags |= ENTRY_VALID;
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Adds an entry to the directory cache of inode i.
+ + *
+ + *   name - entry name; its hash, length and bytes are copied
+ + *   ino  - inode number recorded for the entry
+ + *   info - source of size, mode and timestamps
+ + *
+ + * The new entry is marked ENTRY_VALID, stamped with the current
+ + * jiffies and linked at the front of the cache.
+ + *
+ + * Returns 0 on success, -ENOMEM when the entry cannot be allocated or
+ + * the inode has no usable directory cache.
+ + *
+ + * DirCacheLock should be held before calling this routine.
+ + */
+ +int novfs_add_inode_entry(struct inode *i, struct qstr *name, ino_t ino, struct novfs_entry_info *info)
+ +{
+ +      struct inode_data *id;
+ +      struct novfs_dir_cache *new;
+ +
+ +      //SClark
+ +      DbgPrint("i: %p", i);
+ +      /* Fixed: i used to be dereferenced here before its NULL check. */
+ +      if (!i)
+ +              return -ENOMEM;
+ +
+ +      id = i->i_private;
+ +      if (id) {
+ +              DbgPrint("i->i_private: %p", id);
+ +              if (id->DirCache.next)
+ +                      DbgPrint("id->DirCache.next: %p", id->DirCache.next);
+ +      }
+ +      //SClark
+ +
+ +      if (!id || !id->DirCache.next)
+ +              return -ENOMEM;
+ +
+ +      /*
+ +       * NOTE(review): the size assumes struct novfs_dir_cache ends in a
+ +       * name array with room for at least the terminating NUL - confirm
+ +       * against the struct definition.
+ +       */
+ +      new = kmalloc(sizeof(struct novfs_dir_cache) + name->len, GFP_KERNEL);
+ +      if (!new)
+ +              return -ENOMEM;
+ +
+ +      id->cntDC++;
+ +
+ +      DCCount++;
+ +      DbgPrint("inode: 0x%p; id: 0x%p; DC: 0x%p; new: 0x%p; "
+ +               "name:  %.*s; ino: %d; size: %lld; mode: 0x%x",
+ +               i, id, &id->DirCache, new, name->len, name->name, ino, info->size, info->mode);
+ +
+ +      new->flags = ENTRY_VALID;
+ +      new->jiffies = get_jiffies_64();
+ +      new->size = info->size;
+ +      new->mode = info->mode;
+ +      new->atime = info->atime;
+ +      new->mtime = info->mtime;
+ +      new->ctime = info->ctime;
+ +      new->ino = ino;
+ +      new->hash = name->hash;
+ +      new->nameLen = name->len;
+ +      memcpy(new->name, name->name, name->len);
+ +      new->name[new->nameLen] = '\0';
+ +      list_add(&new->list, &id->DirCache);
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Refreshes the cached directory entry of inode i that matches name
+ + * (or ino when name is NULL) with the size, mode and timestamps from
+ + * info.  The entry's flags are set to ENTRY_VALID and its jiffies
+ + * stamp is renewed.
+ + *
+ + * Returns 0 when an entry was updated, -1 otherwise.
+ + *
+ + * DirCacheLock should be held before calling this routine.
+ + */
+ +int novfs_update_entry(struct inode *i, struct qstr *name, ino_t ino, struct novfs_entry_info *info)
+ +{
+ +      struct inode_data *priv;
+ +      struct novfs_dir_cache *entry;
+ +      char *dbg_name = "<NULL>";
+ +      int dbg_len = 6;
+ +      char atime_buf[32];
+ +      char mtime_buf[32];
+ +      char ctime_buf[32];
+ +      int rc = -1;
+ +
+ +      if (!i || !(priv = i->i_private) || !priv->DirCache.next)
+ +              goto out;
+ +
+ +      /* The name/length pair is only used for the debug print below. */
+ +      if (name && name->len) {
+ +              dbg_name = (char *)name->name;
+ +              dbg_len = name->len;
+ +      }
+ +
+ +      /* Log the incoming values before touching the cache. */
+ +      ctime_r(&info->atime.tv_sec, atime_buf);
+ +      ctime_r(&info->mtime.tv_sec, mtime_buf);
+ +      ctime_r(&info->ctime.tv_sec, ctime_buf);
+ +      DbgPrint("inode: 0x%p; name: %.*s; ino: %d; size: %lld; "
+ +               "atime: %s; mtime: %s; ctime: %s", i, dbg_len, dbg_name, ino, info->size, atime_buf, mtime_buf, ctime_buf);
+ +
+ +      entry = novfs_lookup_inode_cache(i, name, ino);
+ +      if (!entry)
+ +              goto out;
+ +
+ +      rc = 0;
+ +      entry->flags = ENTRY_VALID;
+ +      entry->jiffies = get_jiffies_64();
+ +      entry->size = info->size;
+ +      entry->mode = info->mode;
+ +      entry->atime = info->atime;
+ +      entry->mtime = info->mtime;
+ +      entry->ctime = info->ctime;
+ +
+ +      /* Log the entry as stored. */
+ +      ctime_r(&entry->atime.tv_sec, atime_buf);
+ +      ctime_r(&entry->mtime.tv_sec, mtime_buf);
+ +      ctime_r(&entry->ctime.tv_sec, ctime_buf);
+ +      DbgPrint("entry: 0x%p; flags: 0x%x; jiffies: %lld; "
+ +               "ino: %d; size: %lld; mode: 0%o; atime: %s; "
+ +               "mtime: %s %d; ctime: %s; hash: 0x%x; "
+ +               " nameLen: %d; name: %s",
+ +               entry, entry->flags, entry->jiffies, entry->ino, entry->size,
+ +               entry->mode, atime_buf, mtime_buf, entry->mtime.tv_nsec, ctime_buf, entry->hash, entry->nameLen, entry->name);
+ +
+ +out:
+ +      DbgPrint("return %d", rc);
+ +      return rc;
+ +}
+ +
+ +/*
+ + *  Removes one entry from the directory cache of inode i.  The entry
+ + *  is selected by name, or by inode number when name is NULL.  Nothing
+ + *  happens when no entry matches.
+ + *
+ + *  DirCacheLock should be held before calling this routine.
+ + */
+ +void novfs_remove_inode_entry(struct inode *i, struct qstr *name, ino_t ino)
+ +{
+ +      struct inode_data *priv;
+ +      struct novfs_dir_cache *entry;
+ +      char *dbg_name = "<NULL>";
+ +      int dbg_len = 6;
+ +
+ +      if (!i || !(priv = i->i_private) || !priv->DirCache.next)
+ +              return;
+ +
+ +      entry = novfs_lookup_inode_cache(i, name, ino);
+ +      if (!entry)
+ +              return;
+ +
+ +      /* The name/length pair is only used for the debug print below. */
+ +      if (name && name->name) {
+ +              dbg_len = name->len;
+ +              dbg_name = (char *)name->name;
+ +      }
+ +      DbgPrint("inode: 0x%p; id: 0x%p; DC: 0x%p; "
+ +               "name: %.*s; ino: %d entry: 0x%p "
+ +               "[name: %.*s; ino: %d; next: 0x%p; "
+ +               "prev: 0x%p]",
+ +               i, priv, &priv->DirCache, dbg_len, dbg_name, ino, entry,
+ +               entry->nameLen, entry->name, entry->ino, entry->list.next, entry->list.prev);
+ +
+ +      list_del(&entry->list);
+ +      kfree(entry);
+ +      DCCount--;
+ +
+ +      priv->cntDC--;
+ +}
+ +
+ +/*
+ + * Frees every directory cache entry of inode i whose ENTRY_VALID flag
+ + * is clear.
+ + *
+ + * DirCacheLock should be held before calling this routine.
+ + */
+ +void novfs_free_invalid_entries(struct inode *i)
+ +{
+ +      struct inode_data *priv;
+ +      struct novfs_dir_cache *entry, *next;
+ +
+ +      if (!i || !(priv = i->i_private) || !priv->DirCache.next)
+ +              return;
+ +
+ +      /* The _safe iterator allows unlinking the current entry. */
+ +      list_for_each_entry_safe(entry, next, &priv->DirCache, list) {
+ +              if (entry->flags & ENTRY_VALID)
+ +                      continue;
+ +
+ +              DbgPrint("inode: 0x%p; id: 0x%p; entry: 0x%p; "
+ +                       "name: %.*s; ino: %d", i, priv, entry, entry->nameLen, entry->name, entry->ino);
+ +              list_del(&entry->list);
+ +              kfree(entry);
+ +              DCCount--;
+ +
+ +              priv->cntDC--;
+ +      }
+ +}
+ +
+ +/*
+ + *  Frees all entries in the directory cache of inode i.
+ + *
+ + *  DirCacheLock should be held before calling this routine.
+ + */
+ +void novfs_free_inode_cache(struct inode *i)
+ +{
+ +      struct inode_data *priv;
+ +      struct novfs_dir_cache *entry, *next;
+ +
+ +      if (!i || !(priv = i->i_private) || !priv->DirCache.next)
+ +              return;
+ +
+ +      /* The _safe iterator allows unlinking the current entry. */
+ +      list_for_each_entry_safe(entry, next, &priv->DirCache, list) {
+ +              list_del(&entry->list);
+ +              kfree(entry);
+ +              DCCount--;
+ +
+ +              priv->cntDC--;
+ +      }
+ +}
+ +
+ +/*
+ + * Dumps every inode on InodeList, together with its directory cache
+ + * entries, through the printf-style callback pf, followed by a summary
+ + * line comparing the global counters with the numbers actually walked.
+ + *
+ + * InodeList_lock is held for the duration of the walk.
+ + */
+ +void novfs_dump_inode(void *pf)
+ +{
+ +      struct inode *inode;
+ +      void (*pfunc) (char *Fmt, ...) = pf;
+ +      struct inode_data *id;
+ +      struct novfs_dir_cache *dc;
+ +      struct list_head *il, *l;
+ +      char atime_buf[32];
+ +      char mtime_buf[32];
+ +      char ctime_buf[32];
+ +      unsigned long icnt = 0, dccnt = 0;
+ +
+ +      mutex_lock(&InodeList_lock);
+ +      list_for_each(il, &InodeList) {
+ +              id = list_entry(il, struct inode_data, IList);
+ +              inode = id->Inode;
+ +              if (inode) {
+ +                      icnt++;
+ +
+ +                      pfunc("Inode=0x%p I_ino=%d\n", inode, inode->i_ino);
+ +
+ +                      /* Fixed: the ctime and mtime labels were swapped. */
+ +                      pfunc("   atime=%s\n", ctime_r(&inode->i_atime.tv_sec, atime_buf));
+ +                      pfunc("   ctime=%s\n", ctime_r(&inode->i_ctime.tv_sec, atime_buf));
+ +                      pfunc("   mtime=%s\n", ctime_r(&inode->i_mtime.tv_sec, atime_buf));
+ +                      pfunc("   size=%lld\n", inode->i_size);
+ +                      pfunc("   mode=0%o\n", inode->i_mode);
+ +                      pfunc("   count=0%o\n", atomic_read(&inode->i_count));
+ +              }
+ +
+ +              pfunc("   nofs_inode_data: 0x%p Name=%s Scope=0x%p\n", id, id->Name, id->Scope);
+ +
+ +              if (id->DirCache.next) {
+ +                      list_for_each(l, &id->DirCache) {
+ +                              dccnt++;
+ +                              dc = list_entry(l, struct novfs_dir_cache, list);
+ +                              ctime_r(&dc->atime.tv_sec, atime_buf);
+ +                              ctime_r(&dc->mtime.tv_sec, mtime_buf);
+ +                              ctime_r(&dc->ctime.tv_sec, ctime_buf);
+ +
+ +                              pfunc("   Cache Entry: 0x%p\n"
+ +                                    "      flags:   0x%x\n"
+ +                                    "      jiffies: %llu\n"
+ +                                    "      ino:     %u\n"
+ +                                    "      size:    %llu\n"
+ +                                    "      mode:    0%o\n"
+ +                                    "      atime:   %s\n"
+ +                                    "      mtime:   %s\n"
+ +                                    "      ctime:   %s\n"
+ +                                    "      hash:    0x%x\n"
+ +                                    "      len:     %d\n"
+ +                                    "      name:    %s\n",
+ +                                    dc, dc->flags, dc->jiffies,
+ +                                    dc->ino, dc->size, dc->mode,
+ +                                    atime_buf, mtime_buf, ctime_buf, dc->hash, dc->nameLen, dc->name);
+ +                      }
+ +              }
+ +      }
+ +      mutex_unlock(&InodeList_lock);
+ +
+ +      /* Fixed: icnt and dccnt are unsigned long and need %lu. */
+ +      pfunc("Inodes: %d(%lu) DirCache: %d(%lu)\n", InodeCount, icnt, DCCount, dccnt);
+ +
+ +}
+ +
+ +/* Module entry and exit points. */
+ +module_init(init_novfs);
+ +module_exit(exit_novfs);
+ +
+ +/* Module metadata. */
+ +MODULE_LICENSE("GPL");
+ +MODULE_AUTHOR("Novell Inc.");
+ +MODULE_DESCRIPTION("Novell NetWare Client for Linux");
+ +MODULE_VERSION(NOVFS_VERSION_STRING);
diff --cc fs/ocfs2/super.c

index cdbaf5e,5a521c7..d914e05
--- 1/fs/ocfs2/super.c
--- 2/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@@ -41,7 -41,6 +41,7 @@@
   #include <linux/mount.h>
   #include <linux/seq_file.h>
   #include <linux/quotaops.h>
- #include <linux/cleancache.h>
++#include <linux/precache.h>
   
   #define CREATE_TRACE_POINTS
   #include "ocfs2_trace.h"
@@@ -2353,7 -2352,6 +2353,7 @@@ static int ocfs2_initialize_super(struc
                 mlog_errno(status);
                 goto bail;
         }
-       cleancache_init_shared_fs((char *)&uuid_net_key, sb);
++      shared_precache_init(sb, &di->id2.i_super.s_uuid[0]);
   
   bail:
         return status;
diff --cc fs/partitions/check.c
Simple merge
diff --cc fs/partitions/efi.c

index e4998a3,19d6750..6296b40
--- 1/fs/partitions/efi.c
--- 2/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@@ -310,24 -310,6 +310,15 @@@ static int is_gpt_valid(struct parsed_p
                 goto fail;
         }
   
+ +      /* Check the GUID Partition Table header size */
+ +      if (le32_to_cpu((*gpt)->header_size) >
+ +                      bdev_logical_block_size(state->bdev)) {
+ +              pr_debug("GUID Partition Table Header size is wrong: %u > %u\n",
+ +                      le32_to_cpu((*gpt)->header_size),
+ +                      bdev_logical_block_size(state->bdev));
+ +              goto fail;
+ +      }
+ +
-       /* Check the GUID Partition Table header size */
-       if (le32_to_cpu((*gpt)->header_size) >
-                       bdev_logical_block_size(state->bdev)) {
-               pr_debug("GUID Partition Table Header size is wrong: %u > %u\n",
-                       le32_to_cpu((*gpt)->header_size),
-                       bdev_logical_block_size(state->bdev));
-               goto fail;
-       }
- 
         /* Check the GUID Partition Table CRC */
         origcrc = le32_to_cpu((*gpt)->header_crc32);
         (*gpt)->header_crc32 = 0;
diff --cc fs/proc/kcore.c

index d245cb2,d245cb2..263099b
--- 1/fs/proc/kcore.c
--- 2/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@@ -130,7 -130,7 +130,7 @@@ static void __kcore_update_ram(struct l
   }
   
   
--#ifdef CONFIG_HIGHMEM
++#if defined(CONFIG_HIGHMEM) || defined(CONFIG_XEN)
   /*
    * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
    * because memory hole is not as big as !HIGHMEM case.
@@@ -146,7 -146,7 +146,11 @@@ static int kcore_update_ram(void
         if (!ent)
                 return -ENOMEM;
         ent->addr = (unsigned long)__va(0);
++#ifdef CONFIG_HIGHMEM
         ent->size = max_low_pfn << PAGE_SHIFT;
++#else
++      ent->size = max_pfn << PAGE_SHIFT;
++#endif
         ent->type = KCORE_RAM;
         list_add(&ent->list, &head);
         __kcore_update_ram(&head);
diff --cc fs/reiserfs/super.c
Simple merge
diff --cc fs/super.c

index 7532b24,8a06881..cb47124
--- 1/fs/super.c
--- 2/fs/super.c
+++ b/fs/super.c
@@@ -31,7 -31,6 +31,7 @@@
   #include <linux/mutex.h>
   #include <linux/backing-dev.h>
   #include <linux/rculist_bl.h>
- #include <linux/cleancache.h>
++#include <linux/precache.h>
   #include "internal.h"
   
   
@@@ -113,7 -112,6 +113,9 @@@ static struct super_block *alloc_super(
                 s->s_maxbytes = MAX_NON_LFS;
                 s->s_op = &default_op;
                 s->s_time_gran = 1000000000;
-               s->cleancache_poolid = -1;
++#ifdef CONFIG_PRECACHE
++              s->precache_poolid = -1;
++#endif
         }
   out:
         return s;
@@@ -186,6 -183,6 +187,7 @@@ void deactivate_locked_super(struct sup
                  * inodes are flushed before we release the fs module.
                  */
                 rcu_barrier();
++              precache_flush_filesystem(s);
                 put_filesystem(fs);
                 put_super(s);
         } else {
diff --cc include/acpi/processor.h

index ba4928c,55192ac..21e45a7
--- 1/include/acpi/processor.h
--- 2/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@@ -17,6 -17,6 +17,12 @@@
   #define ACPI_PROCESSOR_MAX_THROTTLE   250     /* 25% */
   #define ACPI_PROCESSOR_MAX_DUTY_WIDTH 4
   
++#ifdef CONFIG_XEN
++#define NR_ACPI_CPUS                  (NR_CPUS < 256 ? 256 : NR_CPUS)
++#else
++#define NR_ACPI_CPUS                  NR_CPUS
++#endif /* CONFIG_XEN */
++
   #define ACPI_PDC_REVISION_ID          0x1
   
   #define ACPI_PSD_REV0_REVISION                0       /* Support for _PSD as in ACPI 3.0 */
@@@ -42,6 -42,6 +48,17 @@@
   
   struct acpi_processor_cx;
   
++#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
++struct acpi_csd_package {
++      acpi_integer num_entries;
++      acpi_integer revision;
++      acpi_integer domain;
++      acpi_integer coord_type;
++      acpi_integer num_processors;
++      acpi_integer index;
++} __attribute__ ((packed));
++#endif
++
   struct acpi_power_register {
         u8 descriptor;
         u16 length;
@@@ -63,18 -63,18 +80,36 @@@ struct acpi_processor_cx 
         u32 power;
         u32 usage;
         u64 time;
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         u8 bm_sts_skip;
++#else
++      /* Require raw information for external control logic */
++      struct acpi_power_register reg;
++      u32 csd_count;
++      struct acpi_csd_package *domain_info;
++#endif
         char desc[ACPI_CX_DESC_LEN];
   };
   
   struct acpi_processor_power {
++#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
++      union { /* 'dev' is actually only used for taking its address. */
++#endif
         struct cpuidle_device dev;
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         struct acpi_processor_cx *state;
         unsigned long bm_check_timestamp;
         u32 default_state;
++#else
++      struct {
++#endif
         int count;
         struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
++#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         int timer_broadcast_on_state;
++#else
++      }; };
++#endif
   };
   
   /* Performance Management */
@@@ -288,6 -288,6 +323,9 @@@ static inline void acpi_processor_ppc_e
   {
         return;
   }
++#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
++int acpi_processor_ppc_has_changed(struct acpi_processor *, int event_flag);
++#else
   static inline int acpi_processor_ppc_has_changed(struct acpi_processor *pr,
                                                                 int event_flag)
   {
@@@ -305,6 -305,6 +343,7 @@@ static inline int acpi_processor_get_bi
   {
         return -ENODEV;
   }
++#endif                                /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */
   
   #endif                                /* CONFIG_CPU_FREQ */
   
@@@ -352,4 -359,4 +398,119 @@@ static inline void acpi_thermal_cpufreq
   }
   #endif
   
++/*
++ * Following are interfaces geared to external processor PM control
++ * logic like a VMM
++ */
++/* Events notified to external control logic */
++#define PROCESSOR_PM_INIT     1
++#define PROCESSOR_PM_CHANGE   2
++#define PROCESSOR_HOTPLUG     3
++
++/* Objects for the PM events */
++#define PM_TYPE_IDLE          0
++#define PM_TYPE_PERF          1
++#define PM_TYPE_THR           2
++#define PM_TYPE_MAX           3
++
++/* Processor hotplug events */
++#define HOTPLUG_TYPE_ADD      0
++#define HOTPLUG_TYPE_REMOVE   1
++
++#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
++struct processor_extcntl_ops {
++      /* Transfer processor PM events to external control logic */
++      int (*pm_ops[PM_TYPE_MAX])(struct acpi_processor *pr, int event);
++      /* Notify physical processor status to external control logic */
++      int (*hotplug)(struct acpi_processor *pr, int type);
++};
++extern const struct processor_extcntl_ops *processor_extcntl_ops;
++
++static inline int processor_cntl_external(void)
++{
++      return (processor_extcntl_ops != NULL);
++}
++
++static inline int processor_pm_external(void)
++{
++      return processor_cntl_external() &&
++              (processor_extcntl_ops->pm_ops[PM_TYPE_IDLE] != NULL);
++}
++
++static inline int processor_pmperf_external(void)
++{
++      return processor_cntl_external() &&
++              (processor_extcntl_ops->pm_ops[PM_TYPE_PERF] != NULL);
++}
++
++static inline int processor_pmthr_external(void)
++{
++      return processor_cntl_external() &&
++              (processor_extcntl_ops->pm_ops[PM_TYPE_THR] != NULL);
++}
++
++extern int processor_notify_external(struct acpi_processor *pr,
++                      int event, int type);
++extern int processor_extcntl_prepare(struct acpi_processor *pr);
++extern int acpi_processor_get_performance_info(struct acpi_processor *pr);
++extern int acpi_processor_get_psd(struct acpi_processor *pr);
++#else
++static inline int processor_cntl_external(void) {return 0;}
++static inline int processor_pm_external(void) {return 0;}
++static inline int processor_pmperf_external(void) {return 0;}
++static inline int processor_pmthr_external(void) {return 0;}
++static inline int processor_notify_external(struct acpi_processor *pr,
++                      int event, int type)
++{
++      return 0;
++}
++static inline int processor_extcntl_prepare(struct acpi_processor *pr)
++{
++      return 0;
++}
++#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */
++
++#ifdef CONFIG_XEN
++static inline void xen_convert_pct_reg(struct xen_pct_register *xpct,
++      struct acpi_pct_register *apct)
++{
++      xpct->descriptor = apct->descriptor;
++      xpct->length     = apct->length;
++      xpct->space_id   = apct->space_id;
++      xpct->bit_width  = apct->bit_width;
++      xpct->bit_offset = apct->bit_offset;
++      xpct->reserved   = apct->reserved;
++      xpct->address    = apct->address;
++}
++
++static inline void xen_convert_pss_states(struct xen_processor_px *xpss,
++      struct acpi_processor_px *apss, int state_count)
++{
++      int idx;
++
++      /* Copy each of the state_count entries field by field. */
++      for (idx = 0; idx < state_count; idx++) {
++              xpss[idx].core_frequency     = apss[idx].core_frequency;
++              xpss[idx].power              = apss[idx].power;
++              xpss[idx].transition_latency = apss[idx].transition_latency;
++              xpss[idx].bus_master_latency = apss[idx].bus_master_latency;
++              xpss[idx].control            = apss[idx].control;
++              xpss[idx].status             = apss[idx].status;
++      }
++}
++
++static inline void xen_convert_psd_pack(struct xen_psd_package *xpsd,
++      struct acpi_psd_package *apsd)
++{
++      xpsd->num_entries    = apsd->num_entries;
++      xpsd->revision       = apsd->revision;
++      xpsd->domain         = apsd->domain;
++      xpsd->coord_type     = apsd->coord_type;
++      xpsd->num_processors = apsd->num_processors;
++}
++
++extern int xen_pcpu_hotplug(int type);
++extern int xen_pcpu_index(uint32_t id, bool is_acpiid);
++#endif /* CONFIG_XEN */
++
   #endif
diff --cc include/asm-generic/vmlinux.lds.h
Simple merge
diff --cc include/linux/acpi.h

index 1deb2a7,a2e910e..09fc553
--- 1/include/linux/acpi.h
--- 2/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@@ -247,6 -248,6 +248,8 @@@ int acpi_check_region(resource_size_t s
   
   int acpi_resources_are_enforced(void);
   
++int acpi_pci_get_root_seg_bbn(char *hid, char *uid, int *seg, int *bbn);
++
   #ifdef CONFIG_PM_SLEEP
   void __init acpi_no_s4_hw_signature(void);
   void __init acpi_old_suspend_ordering(void);
diff --cc include/linux/aio.h

index 7a8db41,7a8db41..5f9ae64
--- 1/include/linux/aio.h
--- 2/include/linux/aio.h
+++ b/include/linux/aio.h
@@@ -199,6 -199,6 +199,12 @@@ struct kioctx 
   
         struct delayed_work     wq;
   
++#ifdef CONFIG_EPOLL
++      /* poll integration */
++      wait_queue_head_t       poll_wait;
++      struct file             *file;
++#endif
++
         struct rcu_head         rcu_head;
   };
   
diff --cc include/linux/blkdev.h
Simple merge
diff --cc include/linux/console.h

index 7453cfd,7453cfd..b4783e4
--- 1/include/linux/console.h
--- 2/include/linux/console.h
+++ b/include/linux/console.h
@@@ -73,6 -73,6 +73,7 @@@ extern const struct consw dummy_con;  /
   extern const struct consw vga_con;    /* VGA text console */
   extern const struct consw newport_con;        /* SGI Newport console  */
   extern const struct consw prom_con;   /* SPARC PROM console */
++extern int console_use_vt;
   
   int con_is_bound(const struct consw *csw);
   int register_con_driver(const struct consw *csw, int first, int last);
diff --cc include/linux/cpufreq.h

index 11be48e,9343dd3..eccc2ee
--- 1/include/linux/cpufreq.h
--- 2/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@@ -322,7 -319,7 +319,7 @@@ static inline unsigned int cpufreq_get(
   #endif
   
   /* query the last known CPU freq (in kHz). If zero, cpufreq couldn't detect it */
--#ifdef CONFIG_CPU_FREQ
++#if defined(CONFIG_CPU_FREQ) || defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
   unsigned int cpufreq_quick_get(unsigned int cpu);
   #else
   static inline unsigned int cpufreq_quick_get(unsigned int cpu)
diff --cc include/linux/device.h
Simple merge
diff --cc include/linux/elfnote.h

index 278e3ef,278e3ef..9f9816a
--- 1/include/linux/elfnote.h
--- 2/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@@ -52,7 -52,7 +52,7 @@@
   4484:.balign 4                                ;       \
   .popsection                           ;
   
--#define ELFNOTE(name, type, desc)             \
++#define ELFNOTE(name, type, desc...)          \
         ELFNOTE_START(name, type, "")           \
                 desc                    ;       \
         ELFNOTE_END
diff --cc include/linux/fs.h

index 0278c38,cdf9495..f4ee548
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -276,14 -272,7 +274,13 @@@ struct inodes_stat_t 
   #define IS_PRIVATE(inode)     ((inode)->i_flags & S_PRIVATE)
   #define IS_IMA(inode)         ((inode)->i_flags & S_IMA)
   #define IS_AUTOMOUNT(inode)   ((inode)->i_flags & S_AUTOMOUNT)
- #define IS_NOSEC(inode)               ((inode)->i_flags & S_NOSEC)
   
+ +/*
+ + * IS_ACL() tells the VFS to not apply the umask
+ + * and use iop->check_acl for acl permission checks when defined.
+ + */
+ +#define IS_ACL(inode)         __IS_FLG(inode, MS_POSIXACL | MS_RICHACL)
+ +
   /* the read-only stuff doesn't really belong here, but any other place is
      probably as bad and I don't want to create yet another include file. */
   
@@@ -1419,6 -1410,6 +1418,9 @@@ struct super_block 
         /* Granularity of c/m/atime in ns.
            Cannot be worse than a second */
         u32                s_time_gran;
++#ifdef CONFIG_PRECACHE
++      u32                precache_poolid;
++#endif
   
         /*
          * The next field is for VFS *only*. No filesystems have any business
diff --cc include/linux/genhd.h

index 2dde681,d764a42..bb53870
--- 1/include/linux/genhd.h
--- 2/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@@ -126,8 -127,6 +127,7 @@@ struct hd_struct 
   #define GENHD_FL_SUPPRESS_PARTITION_INFO      32
   #define GENHD_FL_EXT_DEVT                     64 /* allow extended devt */
   #define GENHD_FL_NATIVE_CAPACITY              128
- #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE   256
- #define GENHD_FL_NO_PARTITION_SCAN            512
++#define GENHD_FL_NO_PARTITION_SCAN            256
   
   enum {
         DISK_EVENT_MEDIA_CHANGE                 = 1 << 0, /* media changed */
diff --cc include/linux/highmem.h

index 3a93f73,3a93f73..aac1d61
--- 1/include/linux/highmem.h
--- 2/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@@ -178,12 -178,12 +178,14 @@@ alloc_zeroed_user_highpage_movable(stru
         return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr);
   }
   
++#ifndef __HAVE_ARCH_CLEAR_HIGHPAGE
   static inline void clear_highpage(struct page *page)
   {
         void *kaddr = kmap_atomic(page, KM_USER0);
         clear_page(kaddr);
         kunmap_atomic(kaddr, KM_USER0);
   }
++#endif
   
   static inline void zero_user_segments(struct page *page,
         unsigned start1, unsigned end1,
@@@ -237,6 -237,6 +239,8 @@@ static inline void copy_user_highpage(s
   
   #endif
   
++#ifndef __HAVE_ARCH_COPY_HIGHPAGE
++
   static inline void copy_highpage(struct page *to, struct page *from)
   {
         char *vfrom, *vto;
@@@ -248,4 -248,4 +252,6 @@@
         kunmap_atomic(vfrom, KM_USER0);
   }
   
++#endif
++
   #endif /* _LINUX_HIGHMEM_H */
diff --cc include/linux/interrupt.h

index 6c12989,bea0ac7..16b9575
--- 1/include/linux/interrupt.h
--- 2/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@@ -375,6 -375,6 +375,11 @@@ static inline int disable_irq_wake(unsi
   }
   #endif /* CONFIG_GENERIC_HARDIRQS */
   
++#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED
++int irq_ignore_unhandled(unsigned int irq);
++#else
++#define irq_ignore_unhandled(irq) 0
++#endif
   
   #ifdef CONFIG_IRQ_FORCED_THREADING
   extern bool force_irqthreads;
diff --cc include/linux/kernel.h
Simple merge
diff --cc include/linux/kexec.h

index c2478a3,c2478a3..4faa9c6
--- 1/include/linux/kexec.h
--- 2/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@@ -46,6 -46,6 +46,13 @@@
                             KEXEC_CORE_NOTE_NAME_BYTES +                \
                             KEXEC_CORE_NOTE_DESC_BYTES )
   
++#ifndef KEXEC_ARCH_HAS_PAGE_MACROS
++#define kexec_page_to_pfn(page)  page_to_pfn(page)
++#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
++#define kexec_virt_to_phys(addr) virt_to_phys(addr)
++#define kexec_phys_to_virt(addr) phys_to_virt(addr)
++#endif
++
   /*
    * This structure is used to hold the arguments that are used when loading
    * kernel binaries.
@@@ -112,6 -112,6 +119,12 @@@ struct kimage 
   extern void machine_kexec(struct kimage *image);
   extern int machine_kexec_prepare(struct kimage *image);
   extern void machine_kexec_cleanup(struct kimage *image);
++#ifdef CONFIG_XEN
++extern int xen_machine_kexec_load(struct kimage *image);
++extern void xen_machine_kexec_unload(struct kimage *image);
++extern void xen_machine_kexec_setup_resources(void);
++extern void xen_machine_kexec_register_resources(struct resource *res);
++#endif
   extern asmlinkage long sys_kexec_load(unsigned long entry,
                                         unsigned long nr_segments,
                                         struct kexec_segment __user *segments,
@@@ -192,8 -192,8 +205,15 @@@ extern struct kimage *kexec_crash_image
   #define VMCOREINFO_BYTES           (4096)
   #define VMCOREINFO_NOTE_NAME       "VMCOREINFO"
   #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
++#if !defined(CONFIG_XEN) || !defined(CONFIG_X86)
   #define VMCOREINFO_NOTE_SIZE       (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES \
                                     + VMCOREINFO_NOTE_NAME_BYTES)
++#else
++#define VMCOREINFO_NOTE_SIZE       ALIGN(KEXEC_NOTE_HEAD_BYTES*2 \
++                                       + VMCOREINFO_BYTES \
++                                       + VMCOREINFO_NOTE_NAME_BYTES, \
++                                       PAGE_SIZE)
++#endif
   
   /* Location of a reserved region to hold the crash kernel.
    */
diff --cc include/linux/kmod.h

index d4a5c84,6efd7a7..7f3dbcb
--- 1/include/linux/kmod.h
--- 2/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@@ -110,12 -109,9 +109,14 @@@ call_usermodehelper(char *path, char **
                                        NULL, NULL, NULL);
   }
   
- extern struct ctl_table usermodehelper_table[];
- 
   extern void usermodehelper_init(void);
   
++#ifdef CONFIG_PM_SLEEP
   extern int usermodehelper_disable(void);
   extern void usermodehelper_enable(void);
+ +extern bool usermodehelper_is_disabled(void);
++#else
++static inline bool usermodehelper_is_disabled(void) { return false; }
++#endif
   
   #endif /* __LINUX_KMOD_H__ */
diff --cc include/linux/kvm_host.h
Simple merge
diff --cc include/linux/lsm_audit.h
Simple merge
diff --cc include/linux/mm.h

index c7ed0a4,6507dde..d6fbb93
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -113,7 -113,7 +113,12 @@@ extern unsigned int kobjsize(const voi
   
   #define VM_CAN_NONLINEAR 0x08000000   /* Has ->fault & does nonlinear pages */
   #define VM_MIXEDMAP   0x10000000      /* Can contain "struct page" and pure PFN pages */
++#ifndef CONFIG_XEN
   #define VM_SAO                0x20000000      /* Strong Access Ordering (powerpc) */
++#else
++#define VM_SAO                0
++#define VM_FOREIGN    0x20000000      /* Has pages belonging to another VM */
++#endif
   #define VM_PFN_AT_MMAP        0x40000000      /* PFNMAP vma that is fully mapped at mmap time */
   #define VM_MERGEABLE  0x80000000      /* KSM may merge identical pages */
   
@@@ -142,6 -142,6 +147,12 @@@
    */
   #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
   
++#ifdef CONFIG_XEN
++struct vm_foreign_map {
++      struct page **map;
++};
++#endif
++
   /*
    * mapping from the currently active vm_flags protection bits (the
    * low four bits) to a page protection mask..
@@@ -213,6 -212,6 +223,17 @@@ struct vm_operations_struct 
          */
         int (*access)(struct vm_area_struct *vma, unsigned long addr,
                       void *buf, int len, int write);
++
++#ifdef CONFIG_XEN
++      /* Area-specific function for clearing the PTE at @ptep. Returns the
++       * original value of @ptep. */
++      pte_t (*zap_pte)(struct vm_area_struct *vma,
++                       unsigned long addr, pte_t *ptep, int is_fullmm);
++
++      /* called before close() to indicate no more pages should be mapped */
++      void (*unmap)(struct vm_area_struct *area);
++#endif
++
   #ifdef CONFIG_NUMA
         /*
          * set_policy() op must add a reference to any non-NULL @new mempolicy
diff --cc include/linux/module.h
Simple merge
diff --cc include/linux/nmi.h

index 2d304ef,c536f85..16b477f
--- 1/include/linux/nmi.h
--- 2/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@@ -18,6 -18,6 +18,9 @@@
   #include <asm/nmi.h>
   extern void touch_nmi_watchdog(void);
   #else
++#ifdef CONFIG_XEN
++#include <asm/nmi.h>
++#endif
   static inline void touch_nmi_watchdog(void)
   {
         touch_softlockup_watchdog();
diff --cc include/linux/oprofile.h

index 7f5cfd3,7f5cfd3..107d8eb
--- 1/include/linux/oprofile.h
--- 2/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@@ -19,6 -19,6 +19,9 @@@
   #include <linux/errno.h>
   #include <linux/printk.h>
   #include <asm/atomic.h>
++#ifdef CONFIG_XEN
++#include <xen/interface/xenoprof.h>
++#endif
    
   /* Each escaped entry is prefixed by ESCAPE_CODE
    * then one of the following codes, then the
@@@ -31,14 -31,14 +34,18 @@@
   #define CPU_SWITCH_CODE                       2
   #define COOKIE_SWITCH_CODE            3
   #define KERNEL_ENTER_SWITCH_CODE      4
--#define KERNEL_EXIT_SWITCH_CODE               5
++#define USER_ENTER_SWITCH_CODE                5
   #define MODULE_LOADED_CODE            6
   #define CTX_TGID_CODE                 7
   #define TRACE_BEGIN_CODE              8
   #define TRACE_END_CODE                        9
   #define XEN_ENTER_SWITCH_CODE         10
++#ifndef CONFIG_XEN
   #define SPU_PROFILING_CODE            11
   #define SPU_CTX_SWITCH_CODE           12
++#else
++#define DOMAIN_SWITCH_CODE            11
++#endif
   #define IBS_FETCH_CODE                        13
   #define IBS_OP_CODE                   14
   
@@@ -52,6 -52,6 +59,12 @@@ struct oprofile_operations 
         /* create any necessary configuration files in the oprofile fs.
          * Optional. */
         int (*create_files)(struct super_block * sb, struct dentry * root);
++#ifdef CONFIG_XEN
++      /* setup active domains with Xen */
++      int (*set_active)(int *active_domains, unsigned int adomains);
++      /* setup passive domains with Xen */
++      int (*set_passive)(int *passive_domains, unsigned int pdomains);
++#endif
         /* Do any necessary interrupt setup. Optional. */
         int (*setup)(void);
         /* Do any necessary interrupt shutdown. Optional. */
@@@ -117,9 -117,9 +130,14 @@@ void oprofile_add_ext_hw_sample(unsigne
    * backtrace. */
   void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event);
   
++void oprofile_add_mode(int cpu_mode);
++
   /* add a backtrace entry, to be called from the ->backtrace callback */
   void oprofile_add_trace(unsigned long eip);
   
++/* add a domain switch entry */
++int oprofile_add_domain_switch(int32_t domain_id);
++
   
   /**
    * Create a file of the given name as a child of the given root, with
diff --cc include/linux/page-flags.h

index 6081493,811183d..68c882c
--- 1/include/linux/page-flags.h
--- 2/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@@ -107,6 -107,6 +107,11 @@@ enum pageflags 
   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
         PG_compound_lock,
   #endif
++#ifdef CONFIG_XEN
++      PG_foreign,             /* Page is owned by foreign allocator. */
++      /* PG_netback,             Page is owned by netback */
++      PG_blkback,             /* Page is owned by blkback */
++#endif
         __NR_PAGEFLAGS,
   
         /* Filesystems */
@@@ -119,8 -119,8 +124,15 @@@
         PG_fscache = PG_private_2,      /* page backed by cache */
   
         /* XEN */
++#if defined(CONFIG_XEN)
++      PG_pinned = PG_locked,  /* Cannot alias with PG_owner_priv_1 since
++                               * bad_page() checks should include this bit.
++                               * Should not use PG_arch_1 as that may have
++                               * a different purpose elsewhere. */
++#elif defined(CONFIG_PARAVIRT_XEN)
         PG_pinned = PG_owner_priv_1,
         PG_savepinned = PG_dirty,
++#endif
   
         /* SLOB */
         PG_slob_free = PG_private,
@@@ -205,8 -205,8 +217,12 @@@ PAGEFLAG(Active, active) __CLEARPAGEFLA
         TESTCLEARFLAG(Active, active)
   __PAGEFLAG(Slab, slab)
   PAGEFLAG(Checked, checked)            /* Used by some filesystems */
++#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
   PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)   /* Xen */
++#endif
++#ifdef CONFIG_PARAVIRT_XEN
   PAGEFLAG(SavePinned, savepinned);                     /* Xen */
++#endif
   PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
   PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
   
@@@ -325,6 -325,6 +341,28 @@@ static inline void SetPageUptodate(stru
   
   CLEARPAGEFLAG(Uptodate, uptodate)
   
++#ifdef CONFIG_XEN
++TESTPAGEFLAG(Foreign, foreign)
++static inline void SetPageForeign(struct page *page,
++                                void (*dtor)(struct page *, unsigned int))
++{
++      BUG_ON(!dtor);
++      set_bit(PG_foreign, &page->flags);
++      page->index = (long)dtor;
++}
++static inline void ClearPageForeign(struct page *page)
++{
++      clear_bit(PG_foreign, &page->flags);
++      page->index = 0;
++}
++static inline void PageForeignDestructor(struct page *page, unsigned int order)
++{
++      ((void (*)(struct page *, unsigned int))page->index)(page, order);
++}
++/*PAGEFLAG(Netback, netback)*/
++PAGEFLAG(Blkback, blkback)
++#endif
++
   extern void cancel_dirty_page(struct page *page, unsigned int account_size);
   
   int test_clear_page_writeback(struct page *page);
@@@ -449,6 -449,6 +487,12 @@@ static inline int PageTransCompound(str
   #define __PG_COMPOUND_LOCK            0
   #endif
   
++#ifndef CONFIG_XEN
++# define __PG_XEN             0
++#else
++# define __PG_XEN             (1 << PG_foreign)
++#endif
++
   /*
    * Flags checked when a page is freed.  Pages being freed should not have
    * these flags set.  It they are, there is a problem.
@@@ -459,7 -459,7 +503,7 @@@
          1 << PG_writeback | 1 << PG_reserved | \
          1 << PG_slab    | 1 << PG_swapcache | 1 << PG_active | \
          1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
--       __PG_COMPOUND_LOCK)
++       __PG_COMPOUND_LOCK | __PG_XEN)
   
   /*
    * Flags checked when a page is prepped for return by the page allocator.
diff --cc include/linux/pci.h

index c446b5c,96f70d7..7ea3d9f
--- 1/include/linux/pci.h
--- 2/include/linux/pci.h
+++ b/include/linux/pci.h
@@@ -330,7 -325,7 +325,7 @@@ struct pci_dev 
         int rom_attr_enabled;           /* has display of the rom attribute been enabled? */
         struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
         struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
--#ifdef CONFIG_PCI_MSI
++#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN)
         struct list_head msi_list;
   #endif
         struct pci_vpd *vpd;
@@@ -801,6 -796,6 +796,9 @@@ int pci_reset_function(struct pci_dev *
   void pci_update_resource(struct pci_dev *dev, int resno);
   int __must_check pci_assign_resource(struct pci_dev *dev, int i);
   int pci_select_bars(struct pci_dev *dev, unsigned long flags);
++#ifdef CONFIG_XEN
++void pci_restore_bars(struct pci_dev *);
++#endif
   
   /* ROM control related routines */
   int pci_enable_rom(struct pci_dev *pdev);
@@@ -1008,6 -979,6 +982,11 @@@ static inline int pci_msi_enabled(void
   {
         return 0;
   }
++
++#ifdef CONFIG_XEN
++#define register_msi_get_owner(func) 0
++#define unregister_msi_get_owner(func) 0
++#endif
   #else
   extern int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec);
   extern void pci_msi_shutdown(struct pci_dev *dev);
@@@ -1020,6 -991,6 +999,10 @@@ extern void pci_disable_msix(struct pci
   extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
   extern void pci_restore_msi_state(struct pci_dev *dev);
   extern int pci_msi_enabled(void);
++#ifdef CONFIG_XEN
++extern int register_msi_get_owner(int (*func)(struct pci_dev *dev));
++extern int unregister_msi_get_owner(int (*func)(struct pci_dev *dev));
++#endif
   #endif
   
   #ifdef CONFIG_PCIEPORTBUS
@@@ -1589,5 -1543,5 +1555,11 @@@ int pci_vpd_find_tag(const u8 *buf, uns
   int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off,
                               unsigned int len, const char *kw);
   
++#ifdef CONFIG_PCI_GUESTDEV
++int pci_is_guestdev(struct pci_dev *dev);
++#else
++#define pci_is_guestdev(dev)  0
++#endif
++
   #endif /* __KERNEL__ */
   #endif /* LINUX_PCI_H */
diff --cc include/linux/precache.h

index 0000000,0000000..003275d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/linux/precache.h
@@@ -1,0 -1,0 +1,55 @@@
++#ifndef _LINUX_PRECACHE_H
++#define _LINUX_PRECACHE_H
++
++#include <linux/fs.h>
++#include <linux/mm.h>
++
++#ifdef CONFIG_PRECACHE
++extern void precache_init(struct super_block *sb);
++extern void shared_precache_init(struct super_block *sb, char *uuid);
++extern int precache_get(struct address_space *mapping, unsigned long index,
++             struct page *empty_page);
++extern int precache_put(struct address_space *mapping, unsigned long index,
++              struct page *page);
++extern int precache_flush(struct address_space *mapping, unsigned long index);
++extern int precache_flush_inode(struct address_space *mapping);
++extern int precache_flush_filesystem(struct super_block *s);
++#else /* !CONFIG_PRECACHE: no-op stubs so callers need no #ifdefs */
++static inline void precache_init(struct super_block *sb)
++{
++}
++
++static inline void shared_precache_init(struct super_block *sb, char *uuid)
++{
++}
++
++static inline int precache_get(struct address_space *mapping,
++              unsigned long index, struct page *empty_page)
++{
++      return 0;
++}
++
++static inline int precache_put(struct address_space *mapping,
++              unsigned long index, struct page *page)
++{
++      return 0;
++}
++
++static inline int precache_flush(struct address_space *mapping,
++              unsigned long index)
++{
++      return 0;
++}
++
++static inline int precache_flush_inode(struct address_space *mapping)
++{
++      return 0;
++}
++
++static inline int precache_flush_filesystem(struct super_block *s)
++{
++      return 0;
++}
++#endif /* CONFIG_PRECACHE */
++
++#endif /* _LINUX_PRECACHE_H */
diff --cc include/linux/printk.h
Simple merge
diff --cc include/linux/swap.h

index 384eb5f,a5c6da5..b43d2e3
--- 1/include/linux/swap.h
--- 2/include/linux/swap.h
+++ b/include/linux/swap.h
@@@ -194,8 -194,8 +194,61 @@@ struct swap_info_struct 
         struct block_device *bdev;      /* swap device or bdev of swap file */
         struct file *swap_file;         /* seldom referenced */
         unsigned int old_block_size;    /* seldom referenced */
++#ifdef CONFIG_PRESWAP
++      unsigned long *preswap_map;
++      unsigned int preswap_pages;
++#endif
   };
   
++#ifdef CONFIG_PRESWAP
++
++#include <linux/sysctl.h>
++extern int preswap_sysctl_handler(struct ctl_table *, int, void __user *,
++      size_t *, loff_t *);
++extern const unsigned long preswap_zero, preswap_infinity;
++
++extern struct swap_info_struct *get_swap_info_struct(unsigned int type);
++
++extern void preswap_shrink(unsigned long);
++extern int preswap_test(struct swap_info_struct *, unsigned long);
++extern void preswap_init(unsigned);
++extern int preswap_put(struct page *);
++extern int preswap_get(struct page *);
++extern void preswap_flush(unsigned, unsigned long);
++extern void preswap_flush_area(unsigned);
++#else
++static inline void preswap_shrink(unsigned long target_pages)
++{
++}
++
++static inline int preswap_test(struct swap_info_struct *sis, unsigned long offset)
++{
++      return 0;
++}
++
++static inline void preswap_init(unsigned type)
++{
++}
++
++static inline int preswap_put(struct page *page)
++{
++      return 0;
++}
++
++static inline int preswap_get(struct page *get)
++{
++      return 0;
++}
++
++static inline void preswap_flush(unsigned type, unsigned long offset)
++{
++}
++
++static inline void preswap_flush_area(unsigned type)
++{
++}
++#endif /* CONFIG_PRESWAP */
++
   struct swap_list_t {
         int head;       /* head of priority-ordered swapfile list */
         int next;       /* swapfile to be used next */
diff --cc include/linux/sysctl.h

index 11684d9,11684d9..7ddd993
--- 1/include/linux/sysctl.h
--- 2/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@@ -59,6 -59,6 +59,7 @@@ enu
         CTL_BUS=8,              /* Busses */
         CTL_ABI=9,              /* Binary emulation */
         CTL_CPU=10,             /* CPU stuff (speed scaling, etc) */
++      CTL_XEN=123,            /* Xen info and control */
         CTL_ARLAN=254,          /* arlan wireless driver */
         CTL_S390DBF=5677,       /* s390 debug */
         CTL_SUNRPC=7249,        /* sunrpc debug */
diff --cc include/linux/vermagic.h

index cf97b5b,cf97b5b..709d165
--- 1/include/linux/vermagic.h
--- 2/include/linux/vermagic.h
+++ b/include/linux/vermagic.h
@@@ -22,6 -22,6 +22,11 @@@
   #else
   #define MODULE_VERMAGIC_MODVERSIONS ""
   #endif
++#ifdef CONFIG_XEN
++#define MODULE_VERMAGIC_XEN "Xen "
++#else
++#define MODULE_VERMAGIC_XEN "" /* empty string, like the other fallbacks */
++#endif
   #ifndef MODULE_ARCH_VERMAGIC
   #define MODULE_ARCH_VERMAGIC ""
   #endif
@@@ -30,5 -30,5 +35,5 @@@
         UTS_RELEASE " "                                                 \
         MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT                     \
         MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS       \
--      MODULE_ARCH_VERMAGIC
++      MODULE_VERMAGIC_XEN MODULE_ARCH_VERMAGIC
   
diff --cc include/xen/Kbuild

index 84ad8f0,84ad8f0..e4a826b
--- 1/include/xen/Kbuild
--- 2/include/xen/Kbuild
+++ b/include/xen/Kbuild
@@@ -1,2 -1,2 +1,1 @@@
--header-y += evtchn.h
--header-y += privcmd.h
++header-y += public/
diff --cc include/xen/balloon.h

index a2b22f0,a2b22f0..6402b39
--- 1/include/xen/balloon.h
--- 2/include/xen/balloon.h
+++ b/include/xen/balloon.h
@@@ -1,7 -1,7 +1,68 @@@
   /******************************************************************************
-- * Xen balloon functionality
++ * balloon.h
++ *
++ * Xen balloon driver - enables returning/claiming memory to/from Xen.
++ *
++ * Copyright (c) 2003, B Dragovic
++ * Copyright (c) 2003-2004, M Williamson, K Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
    */
   
++#ifndef __XEN_BALLOON_H__
++#define __XEN_BALLOON_H__
++
++#include <linux/spinlock.h>
++
++#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++/*
++ * Inform the balloon driver that it should allow some slop for device-driver
++ * memory activities.
++ */
++void balloon_update_driver_allowance(long delta);
++
++/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */
++struct page **alloc_empty_pages_and_pagevec(int nr_pages);
++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages);
++
++/* Free an empty page range (not allocated through
++   alloc_empty_pages_and_pagevec), adding to the balloon. */
++void free_empty_pages(struct page **pagevec, int nr_pages);
++
++void balloon_release_driver_page(struct page *page);
++
++/*
++ * Prevent the balloon driver from changing the memory reservation during
++ * a driver critical region.
++ */
++extern spinlock_t balloon_lock;
++#define balloon_lock(__flags)   spin_lock_irqsave(&balloon_lock, __flags)
++#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
++
++#else /* CONFIG_PARAVIRT_XEN */
++
   #define RETRY_UNLIMITED       0
   
   struct balloon_stats {
@@@ -23,3 -23,3 +84,7 @@@ void balloon_set_new_target(unsigned lo
   
   int alloc_xenballooned_pages(int nr_pages, struct page** pages);
   void free_xenballooned_pages(int nr_pages, struct page** pages);
++
++#endif /* CONFIG_PARAVIRT_XEN */
++
++#endif /* __XEN_BALLOON_H__ */
diff --cc include/xen/blkif.h

index 0000000,0000000..0ea115d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/blkif.h
@@@ -1,0 -1,0 +1,123 @@@
++/* 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_BLKIF_H__
++#define __XEN_BLKIF_H__
++
++#include <xen/interface/io/ring.h>
++#include <xen/interface/io/blkif.h>
++#include <xen/interface/io/protocols.h>
++
++/* Not a real protocol.  Used to generate ring structs which contain
++ * the elements common to all protocols only.  This way we get a
++ * compiler-checkable way to use common struct elements, so we can
++ * avoid using switch(protocol) in a number of places.  */
++struct blkif_common_request {
++      char dummy;
++};
++struct blkif_common_response {
++      char dummy;
++};
++
++/* i386 protocol version */
++#pragma pack(push, 4)
++struct blkif_x86_32_request {
++      uint8_t        operation;    /* BLKIF_OP_???                         */
++      uint8_t        nr_segments;  /* number of segments                   */
++      blkif_vdev_t   handle;       /* only for read/write requests         */
++      uint64_t       id;           /* private guest value, echoed in resp  */
++      blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
++      struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++};
++struct blkif_x86_32_response {
++      uint64_t        id;              /* copied from request */
++      uint8_t         operation;       /* copied from request */
++      int16_t         status;          /* BLKIF_RSP_???       */
++};
++typedef struct blkif_x86_32_request blkif_x86_32_request_t;
++typedef struct blkif_x86_32_response blkif_x86_32_response_t;
++#pragma pack(pop)
++
++/* x86_64 protocol version */
++struct blkif_x86_64_request {
++      uint8_t        operation;    /* BLKIF_OP_???                         */
++      uint8_t        nr_segments;  /* number of segments                   */
++      blkif_vdev_t   handle;       /* only for read/write requests         */
++      uint64_t       __attribute__((__aligned__(8))) id;
++      blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
++      struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++};
++struct blkif_x86_64_response {
++      uint64_t       __attribute__((__aligned__(8))) id;
++      uint8_t         operation;       /* copied from request */
++      int16_t         status;          /* BLKIF_RSP_???       */
++};
++typedef struct blkif_x86_64_request blkif_x86_64_request_t;
++typedef struct blkif_x86_64_response blkif_x86_64_response_t;
++
++DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
++DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
++DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
++
++union blkif_back_rings {
++      blkif_back_ring_t        native;
++      blkif_common_back_ring_t common;
++      blkif_x86_32_back_ring_t x86_32;
++      blkif_x86_64_back_ring_t x86_64;
++};
++typedef union blkif_back_rings blkif_back_rings_t;
++
++enum blkif_protocol {
++      BLKIF_PROTOCOL_NATIVE = 1,
++      BLKIF_PROTOCOL_X86_32 = 2,
++      BLKIF_PROTOCOL_X86_64 = 3,
++};
++
++static inline void blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
++{
++      int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
++      dst->operation = src->operation;
++      dst->nr_segments = src->nr_segments;
++      dst->handle = src->handle;
++      dst->id = src->id;
++      dst->sector_number = src->sector_number;
++      barrier(); /* NOTE(review): presumably pins the copied header fields; confirm */
++      if (n > dst->nr_segments) /* copy at most the declared segment count */
++              n = dst->nr_segments;
++      for (i = 0; i < n; i++)
++              dst->seg[i] = src->seg[i];
++}
++
++static inline void blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
++{
++      int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
++      dst->operation = src->operation;
++      dst->nr_segments = src->nr_segments;
++      dst->handle = src->handle;
++      dst->id = src->id;
++      dst->sector_number = src->sector_number;
++      barrier(); /* NOTE(review): presumably pins the copied header fields; confirm */
++      if (n > dst->nr_segments) /* copy at most the declared segment count */
++              n = dst->nr_segments;
++      for (i = 0; i < n; i++)
++              dst->seg[i] = src->seg[i];
++}
++
++#endif /* __XEN_BLKIF_H__ */
diff --cc include/xen/clock.h

index 0000000,0000000..935b433

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/clock.h
@@@ -1,0 -1,0 +1,19 @@@
++#ifndef __XEN_CPU_CLOCK_H__
++#define __XEN_CPU_CLOCK_H__
++
++struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu);
++void get_runstate_snapshot(struct vcpu_runstate_info *);
++
++unsigned long long xen_local_clock(void);
++void xen_check_wallclock_update(void);
++
++#ifdef CONFIG_GENERIC_CLOCKEVENTS
++void xen_clockevents_init(void);
++void xen_setup_cpu_clockevents(void);
++void xen_clockevents_resume(void);
++#else
++static inline void xen_setup_cpu_clockevents(void) {}
++static inline void xen_clockevents_resume(void) {}
++#endif
++
++#endif /* __XEN_CPU_CLOCK_H__ */
diff --cc include/xen/compat_ioctl.h

index 0000000,0000000..975afb6

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/compat_ioctl.h
@@@ -1,0 -1,0 +1,75 @@@
++/*
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License as
++ * published by the Free Software Foundation; either version 2 of the
++ * License, or (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
++ *
++ * Copyright IBM Corp. 2007
++ *
++ * Authors: Jimi Xenidis <jimix@watson.ibm.com>
++ *          Hollis Blanchard <hollisb@us.ibm.com>
++ */
++
++#ifndef __LINUX_XEN_COMPAT_H__ 
++#define __LINUX_XEN_COMPAT_H__ 
++
++#include <linux/compat.h>
++#include <linux/compiler.h>
++
++#if defined(CONFIG_X86) || defined(CONFIG_IA64)
++#define xen_pfn32_t __u32
++#endif
++
++extern int privcmd_ioctl_32(int fd, unsigned int cmd, void __user *arg);
++struct privcmd_mmap_32 {
++      int num;
++      domid_t dom;
++      compat_uptr_t entry;
++};
++
++struct privcmd_mmapbatch_32 {
++      int num;     /* number of pages to populate */
++      domid_t dom; /* target domain */
++#if defined(CONFIG_X86) || defined(CONFIG_IA64)
++      union {      /* virtual address */
++              __u64 addr __attribute__((packed));
++              __u32 va; /* ensures union is 4-byte aligned */
++      };
++#else
++      __u64 addr;  /* virtual address */
++#endif
++      compat_uptr_t arr; /* array of mfns - top nibble set on err */
++};
++
++struct privcmd_mmapbatch_v2_32 {
++      unsigned int num; /* number of pages to populate */
++      domid_t dom;      /* target domain */
++#if defined(CONFIG_X86) || defined(CONFIG_IA64)
++      union {      /* virtual address */
++              __u64 addr __attribute__((packed));
++              __u32 va; /* ensures union is 4-byte aligned */
++      };
++#else
++      __u64 addr;  /* virtual address */
++#endif
++      compat_uptr_t arr; /* array of mfns */
++      compat_uptr_t err; /* array of error codes */
++};
++
++#define IOCTL_PRIVCMD_MMAP_32                   \
++      _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap_32))
++#define IOCTL_PRIVCMD_MMAPBATCH_32              \
++      _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch_32))
++#define IOCTL_PRIVCMD_MMAPBATCH_V2_32           \
++      _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2_32))
++
++#endif /* __LINUX_XEN_COMPAT_H__ */
diff --cc include/xen/cpu_hotplug.h

index 0000000,0000000..9c0f5b8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/cpu_hotplug.h
@@@ -1,0 -1,0 +1,39 @@@
++#ifndef __XEN_CPU_HOTPLUG_H__
++#define __XEN_CPU_HOTPLUG_H__
++
++#include <linux/kernel.h>
++#include <linux/cpumask.h>
++
++#if defined(CONFIG_X86) && defined(CONFIG_SMP)
++extern cpumask_var_t vcpu_initialized_mask;
++#endif
++
++#if defined(CONFIG_HOTPLUG_CPU)
++
++int cpu_up_check(unsigned int cpu);
++void init_xenbus_allowed_cpumask(void);
++int smp_suspend(void);
++void smp_resume(void);
++
++#else /* !defined(CONFIG_HOTPLUG_CPU) */
++
++#define cpu_up_check(cpu)             (0)
++#define init_xenbus_allowed_cpumask() ((void)0)
++
++static inline int smp_suspend(void)
++{
++      if (num_online_cpus() > 1) {
++              pr_warning("Can't suspend SMP guests without"
++                         " CONFIG_HOTPLUG_CPU\n");
++              return -EOPNOTSUPP;
++      }
++      return 0;
++}
++
++static inline void smp_resume(void)
++{
++}
++
++#endif /* !defined(CONFIG_HOTPLUG_CPU) */
++
++#endif /* __XEN_CPU_HOTPLUG_H__ */
diff --cc include/xen/driver_util.h

index 0000000,0000000..12d10f7

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/driver_util.h
@@@ -1,0 -1,0 +1,14 @@@
++#ifndef __XEN_DRIVER_UTIL_H__
++#define __XEN_DRIVER_UTIL_H__
++
++#include <linux/compiler.h>
++#include <linux/device.h>
++
++extern struct class *get_xen_class(void);
++extern struct device *xen_class_device_create(struct device_type *,
++                                            struct device *parent,
++                                            dev_t devt, void *drvdata,
++                                            const char *fmt, ...)
++                    __printf(5, 6);
++
++#endif /* __XEN_DRIVER_UTIL_H__ */
diff --cc include/xen/evtchn.h

index 14e833e,14e833e..710465e
--- 1/include/xen/evtchn.h
--- 2/include/xen/evtchn.h
+++ b/include/xen/evtchn.h
@@@ -1,7 -1,7 +1,11 @@@
++#if defined(CONFIG_PARAVIRT_XEN) || !defined(__KERNEL__)
++#include "public/evtchn.h"
++#else
   /******************************************************************************
    * evtchn.h
    *
-- * Interface to /dev/xen/evtchn.
++ * Communication via Xen event channels.
++ * Also definitions for the device that demuxes notifications to userspace.
    *
    * Copyright (c) 2003-2005, K A Fraser
    *
@@@ -30,59 -30,59 +34,198 @@@
    * IN THE SOFTWARE.
    */
   
--#ifndef __LINUX_PUBLIC_EVTCHN_H__
--#define __LINUX_PUBLIC_EVTCHN_H__
++#ifndef __ASM_EVTCHN_H__
++#define __ASM_EVTCHN_H__
   
--/*
-- * Bind a fresh port to VIRQ @virq.
-- * Return allocated port.
-- */
--#define IOCTL_EVTCHN_BIND_VIRQ                                \
--      _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
--struct ioctl_evtchn_bind_virq {
--      unsigned int virq;
--};
++#include <linux/interrupt.h>
++#include <asm/hypervisor.h>
++#include <asm/ptrace.h>
++#include <asm/synch_bitops.h>
++#include <xen/interface/event_channel.h>
++#include <linux/smp.h>
   
   /*
-- * Bind a fresh port to remote <@remote_domain, @remote_port>.
-- * Return allocated port.
++ * LOW-LEVEL DEFINITIONS
    */
--#define IOCTL_EVTCHN_BIND_INTERDOMAIN                 \
--      _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
--struct ioctl_evtchn_bind_interdomain {
--      unsigned int remote_domain, remote_port;
++
++#ifdef CONFIG_XEN
++struct irq_cfg {
++      u32 info;
++      union {
++              int bindcount; /* for dynamic IRQs */
++#ifdef CONFIG_X86_IO_APIC
++              u8 vector; /* for physical IRQs */
++#endif
++      };
   };
++struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
++#endif
   
   /*
-- * Allocate a fresh port for binding to @remote_domain.
-- * Return allocated port.
++ * Dynamically bind an event source to an IRQ-like callback handler.
++ * On some platforms this may not be implemented via the Linux IRQ subsystem.
++ * The IRQ argument passed to the callback handler is the same as returned
++ * from the bind call. It may not correspond to a Linux IRQ number.
++ * Returns IRQ or negative errno.
    */
--#define IOCTL_EVTCHN_BIND_UNBOUND_PORT                        \
--      _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
--struct ioctl_evtchn_bind_unbound_port {
--      unsigned int remote_domain;
--};
++int bind_caller_port_to_irqhandler(
++      unsigned int caller_port,
++      irq_handler_t handler,
++      unsigned long irqflags,
++      const char *devname,
++      void *dev_id);
++int bind_listening_port_to_irqhandler(
++      unsigned int remote_domain,
++      irq_handler_t handler,
++      unsigned long irqflags,
++      const char *devname,
++      void *dev_id);
++int bind_interdomain_evtchn_to_irqhandler(
++      unsigned int remote_domain,
++      unsigned int remote_port,
++      irq_handler_t handler,
++      unsigned long irqflags,
++      const char *devname,
++      void *dev_id);
++int bind_virq_to_irqhandler(
++      unsigned int virq,
++      unsigned int cpu,
++      irq_handler_t handler,
++      unsigned long irqflags,
++      const char *devname,
++      void *dev_id);
++#if defined(CONFIG_SMP) && defined(CONFIG_XEN) && defined(CONFIG_X86)
++int bind_virq_to_irqaction(
++      unsigned int virq,
++      unsigned int cpu,
++      struct irqaction *action);
++#else
++#define bind_virq_to_irqaction(virq, cpu, action) \
++      bind_virq_to_irqhandler(virq, cpu, (action)->handler, \
++                              (action)->flags | IRQF_NOBALANCING, \
++                              (action)->name, action)
++#endif
++#if defined(CONFIG_SMP) && !defined(MODULE)
++#ifndef CONFIG_X86
++int bind_ipi_to_irqhandler(
++      unsigned int ipi,
++      unsigned int cpu,
++      irq_handler_t handler,
++      unsigned long irqflags,
++      const char *devname,
++      void *dev_id);
++#else
++int bind_ipi_to_irqaction(
++      unsigned int cpu,
++      struct irqaction *action);
++DECLARE_PER_CPU(DECLARE_BITMAP(, NR_IPIS), ipi_pending);
++#endif
++#endif
   
   /*
-- * Unbind previously allocated @port.
++ * Common unbind function for all event sources. Takes IRQ to unbind from.
++ * Automatically closes the underlying event channel (except for bindings
++ * made with bind_caller_port_to_irqhandler()).
    */
--#define IOCTL_EVTCHN_UNBIND                           \
--      _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
--struct ioctl_evtchn_unbind {
--      unsigned int port;
--};
++void unbind_from_irqhandler(unsigned int irq, void *dev_id);
++
++#if defined(CONFIG_SMP) && defined(CONFIG_XEN) && defined(CONFIG_X86)
++/* Specialized unbind function for per-CPU IRQs. */
++void unbind_from_per_cpu_irq(unsigned int irq, unsigned int cpu,
++                           struct irqaction *);
++#else
++#define unbind_from_per_cpu_irq(irq, cpu, action) \
++      unbind_from_irqhandler(irq, action)
++#endif
++
++#ifndef CONFIG_XEN
++void irq_resume(void);
++#endif
++
++/* Entry point for notifications into Linux subsystems. */
++asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
++
++/* Mark a PIRQ as unavailable for dynamic allocation. */
++void evtchn_register_pirq(int irq);
++/* Map a Xen-supplied PIRQ to a dynamically allocated one. */
++int evtchn_map_pirq(int irq, int xen_pirq);
++/* Look up a Xen-supplied PIRQ for a dynamically allocated one. */
++int evtchn_get_xen_pirq(int irq);
++
++void mask_evtchn(int port);
++void disable_all_local_evtchn(void);
++void unmask_evtchn(int port);
++unsigned int irq_from_evtchn(unsigned int port);
++
++#ifdef CONFIG_SMP
++void rebind_evtchn_to_cpu(int port, unsigned int cpu);
++#else
++#define rebind_evtchn_to_cpu(port, cpu)       ((void)0)
++#endif
++
++static inline int test_and_set_evtchn_mask(int port)
++{
++      shared_info_t *s = HYPERVISOR_shared_info;
++      return synch_test_and_set_bit(port, s->evtchn_mask);
++}
++
++static inline void clear_evtchn(int port)
++{
++      shared_info_t *s = HYPERVISOR_shared_info;
++      synch_clear_bit(port, s->evtchn_pending);
++}
++
++static inline void set_evtchn(int port)
++{
++      shared_info_t *s = HYPERVISOR_shared_info;
++      synch_set_bit(port, s->evtchn_pending);
++}
++
++static inline int test_evtchn(int port)
++{
++      shared_info_t *s = HYPERVISOR_shared_info;
++      return synch_test_bit(port, s->evtchn_pending);
++}
++
++static inline void notify_remote_via_evtchn(int port)
++{
++      struct evtchn_send send = { .port = port };
++      VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send));
++}
++
++static inline void
++multi_notify_remote_via_evtchn(multicall_entry_t *mcl, int port)
++{
++      struct evtchn_send *send = (void *)(mcl->args + 2);
++
++      BUILD_BUG_ON(sizeof(*send) > sizeof(mcl->args) - 2 * sizeof(*mcl->args));
++      send->port = port;
++      mcl->op = __HYPERVISOR_event_channel_op;
++      mcl->args[0] = EVTCHNOP_send;
++      mcl->args[1] = (unsigned long)send;
++}
++
++static inline int close_evtchn(int port)
++{
++      struct evtchn_close close = { .port = port };
++      return HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
++}
++
++/* Test an irq's pending state. */
++int xen_test_irq_pending(int irq);
   
   /*
-- * Unbind previously allocated @port.
++ * Use these to access the event channel underlying the IRQ handle returned
++ * by bind_*_to_irqhandler().
    */
--#define IOCTL_EVTCHN_NOTIFY                           \
--      _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
--struct ioctl_evtchn_notify {
--      unsigned int port;
--};
++void notify_remote_via_irq(int irq);
++int multi_notify_remote_via_irq(multicall_entry_t *, int irq);
++int irq_to_evtchn_port(int irq);
   
--/* Clear and reinitialise the event buffer. Clear error condition. */
--#define IOCTL_EVTCHN_RESET                            \
--      _IOC(_IOC_NONE, 'E', 5, 0)
++#if defined(CONFIG_SMP) && !defined(MODULE) && defined(CONFIG_X86)
++void notify_remote_via_ipi(unsigned int ipi, unsigned int cpu);
++void clear_ipi_evtchn(void);
++#endif
   
--#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
++#endif /* __ASM_EVTCHN_H__ */
++#endif /* CONFIG_PARAVIRT_XEN || !__KERNEL__ */
diff --cc include/xen/features.h

index 27292d4,27292d4..6c89605
--- 1/include/xen/features.h
--- 2/include/xen/features.h
+++ b/include/xen/features.h
@@@ -10,6 -10,6 +10,7 @@@
   #define __XEN_FEATURES_H__
   
   #include <xen/interface/features.h>
++#include <xen/interface/version.h>
   
   void xen_setup_features(void);
   
@@@ -20,4 -20,4 +21,4 @@@ static inline int xen_feature(int flag
         return xen_features[flag];
   }
   
--#endif /* __ASM_XEN_FEATURES_H__ */
++#endif /* __XEN_FEATURES_H__ */
diff --cc include/xen/firmware.h

index 0000000,0000000..3be378c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/firmware.h
@@@ -1,0 -1,0 +1,14 @@@
++#ifndef __XEN_FIRMWARE_H__
++#define __XEN_FIRMWARE_H__
++
++#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
++void copy_edd(void);
++#endif
++
++#ifdef CONFIG_XEN_PRIVILEGED_GUEST
++void copy_edid(void);
++#else
++static inline void copy_edid(void) {}
++#endif
++
++#endif /* __XEN_FIRMWARE_H__ */
diff --cc include/xen/gntdev.h

index 5304bd3,5304bd3..ce4936d
--- 1/include/xen/gntdev.h
--- 2/include/xen/gntdev.h
+++ b/include/xen/gntdev.h
@@@ -1,150 -1,150 +1,3 @@@
--/******************************************************************************
-- * gntdev.h
-- * 
-- * Interface to /dev/xen/gntdev.
-- * 
-- * Copyright (c) 2007, D G Murray
-- * 
-- * This program is free software; you can redistribute it and/or
-- * modify it under the terms of the GNU General Public License version 2
-- * as published by the Free Software Foundation; or, when distributed
-- * separately from the Linux kernel or incorporated into other
-- * software packages, subject to the following license:
-- * 
-- * Permission is hereby granted, free of charge, to any person obtaining a copy
-- * of this source file (the "Software"), to deal in the Software without
-- * restriction, including without limitation the rights to use, copy, modify,
-- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-- * and to permit persons to whom the Software is furnished to do so, subject to
-- * the following conditions:
-- * 
-- * The above copyright notice and this permission notice shall be included in
-- * all copies or substantial portions of the Software.
-- * 
-- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-- * IN THE SOFTWARE.
-- */
--
--#ifndef __LINUX_PUBLIC_GNTDEV_H__
--#define __LINUX_PUBLIC_GNTDEV_H__
--
--struct ioctl_gntdev_grant_ref {
--      /* The domain ID of the grant to be mapped. */
--      uint32_t domid;
--      /* The grant reference of the grant to be mapped. */
--      uint32_t ref;
--};
--
--/*
-- * Inserts the grant references into the mapping table of an instance
-- * of gntdev. N.B. This does not perform the mapping, which is deferred
-- * until mmap() is called with @index as the offset.
-- */
--#define IOCTL_GNTDEV_MAP_GRANT_REF \
--_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
--struct ioctl_gntdev_map_grant_ref {
--      /* IN parameters */
--      /* The number of grants to be mapped. */
--      uint32_t count;
--      uint32_t pad;
--      /* OUT parameters */
--      /* The offset to be used on a subsequent call to mmap(). */
--      uint64_t index;
--      /* Variable IN parameter. */
--      /* Array of grant references, of size @count. */
--      struct ioctl_gntdev_grant_ref refs[1];
--};
--
--/*
-- * Removes the grant references from the mapping table of an instance of
-- * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
-- * before this ioctl is called, or an error will result.
-- */
--#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
--_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
--struct ioctl_gntdev_unmap_grant_ref {
--      /* IN parameters */
--      /* The offset was returned by the corresponding map operation. */
--      uint64_t index;
--      /* The number of pages to be unmapped. */
--      uint32_t count;
--      uint32_t pad;
--};
--
--/*
-- * Returns the offset in the driver's address space that corresponds
-- * to @vaddr. This can be used to perform a munmap(), followed by an
-- * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
-- * the caller. The number of pages that were allocated at the same time as
-- * @vaddr is returned in @count.
-- *
-- * N.B. Where more than one page has been mapped into a contiguous range, the
-- *      supplied @vaddr must correspond to the start of the range; otherwise
-- *      an error will result. It is only possible to munmap() the entire
-- *      contiguously-allocated range at once, and not any subrange thereof.
-- */
--#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
--_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
--struct ioctl_gntdev_get_offset_for_vaddr {
--      /* IN parameters */
--      /* The virtual address of the first mapped page in a range. */
--      uint64_t vaddr;
--      /* OUT parameters */
--      /* The offset that was used in the initial mmap() operation. */
--      uint64_t offset;
--      /* The number of pages mapped in the VM area that begins at @vaddr. */
--      uint32_t count;
--      uint32_t pad;
--};
--
--/*
-- * Sets the maximum number of grants that may mapped at once by this gntdev
-- * instance.
-- *
-- * N.B. This must be called before any other ioctl is performed on the device.
-- */
--#define IOCTL_GNTDEV_SET_MAX_GRANTS \
--_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
--struct ioctl_gntdev_set_max_grants {
--      /* IN parameter */
--      /* The maximum number of grants that may be mapped at once. */
--      uint32_t count;
--};
--
--/*
-- * Sets up an unmap notification within the page, so that the other side can do
-- * cleanup if this side crashes. Required to implement cross-domain robust
-- * mutexes or close notification on communication channels.
-- *
-- * Each mapped page only supports one notification; multiple calls referring to
-- * the same page overwrite the previous notification. You must clear the
-- * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it
-- * to occur.
-- */
--#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \
--_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify))
--struct ioctl_gntdev_unmap_notify {
--      /* IN parameters */
--      /* Offset in the file descriptor for a byte within the page (same as
--       * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to
--       * be cleared. Otherwise, it can be any byte in the page whose
--       * notification we are adjusting.
--       */
--      uint64_t index;
--      /* Action(s) to take on unmap */
--      uint32_t action;
--      /* Event channel to notify */
--      uint32_t event_channel_port;
--};
--
--/* Clear (set to zero) the byte specified by index */
--#define UNMAP_NOTIFY_CLEAR_BYTE 0x1
--/* Send an interrupt on the indicated event channel */
--#define UNMAP_NOTIFY_SEND_EVENT 0x2
--
--#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
++#if defined(CONFIG_PARAVIRT_XEN) || !defined(__KERNEL__)
++#include "public/gntdev.h"
++#endif
diff --cc include/xen/gnttab.h

index 0000000,0000000..be3d2f3

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/gnttab.h
@@@ -1,0 -1,0 +1,207 @@@
++/******************************************************************************
++ * gnttab.h
++ * 
++ * Two sets of functionality:
++ * 1. Granting foreign access to our memory reservation.
++ * 2. Accessing others' memory reservations via grant references.
++ * (i.e., mechanisms for both sender and recipient of grant references)
++ * 
++ * Copyright (c) 2004-2005, K A Fraser
++ * Copyright (c) 2005, Christopher Clark
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __ASM_GNTTAB_H__
++#define __ASM_GNTTAB_H__
++
++#include <asm/hypervisor.h>
++#include <asm/maddr.h> /* maddr_t */
++#include <linux/mm.h>
++#include <linux/delay.h>
++#include <xen/interface/grant_table.h>
++#include <xen/features.h>
++
++struct gnttab_free_callback {
++      struct gnttab_free_callback *next;
++      void (*fn)(void *);
++      void *arg;
++      u16 count;
++      u8 queued;
++};
++
++int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
++                              int flags);
++
++/*
++ * End access through the given grant reference, iff the grant entry is no
++ * longer in use.  Return 1 if the grant entry was freed, 0 if it is still in
++ * use.
++ */
++int gnttab_end_foreign_access_ref(grant_ref_t ref);
++
++/*
++ * Eventually end access through the given grant reference, and once that
++ * access has been ended, free the given page too.  Access will be ended
++ * immediately iff the grant entry is not in use, otherwise it will happen
++ * some time later.  page may be 0, in which case no freeing will occur.
++ */
++void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page);
++
++int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
++
++unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
++unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
++
++int gnttab_query_foreign_access(grant_ref_t ref);
++
++/*
++ * operations on reserved batches of grant references
++ */
++int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
++
++void gnttab_free_grant_reference(grant_ref_t ref);
++
++void gnttab_free_grant_references(grant_ref_t head);
++
++int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
++
++int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
++
++void gnttab_release_grant_reference(grant_ref_t *private_head,
++                                  grant_ref_t release);
++
++void gnttab_request_free_callback(struct gnttab_free_callback *callback,
++                                void (*fn)(void *), void *arg, u16 count);
++void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
++
++void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
++                                   unsigned long frame, int flags);
++
++void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
++                                     unsigned long pfn);
++
++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep);
++#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
++void __gnttab_dma_map_page(struct page *page);
++#else
++#define __gnttab_dma_map_page __gnttab_dma_unmap_page
++#endif
++static inline void __gnttab_dma_unmap_page(struct page *page)
++{
++}
++
++void gnttab_reset_grant_page(struct page *page);
++
++#ifndef CONFIG_XEN
++int gnttab_resume(void);
++#endif
++
++void *arch_gnttab_alloc_shared(unsigned long *frames);
++
++/* Fill in a grant-map request. addr goes through __pa() only when
++ * GNTMAP_contains_pte is clear and XENFEAT_auto_translated_physmap is set. */
++static inline void
++gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr,
++                uint32_t flags, grant_ref_t ref, domid_t domid)
++{
++      map->dom = domid;
++      map->ref = ref;
++      map->flags = flags;
++      if (!(flags & GNTMAP_contains_pte) &&
++          xen_feature(XENFEAT_auto_translated_physmap))
++              map->host_addr = __pa(addr);
++      else
++              map->host_addr = addr;
++}
++
++/* Fill in a grant-unmap request and zero dev_bus_addr. addr goes through
++ * __pa() only when GNTMAP_contains_pte is clear and the guest is auto-translated. */
++static inline void
++gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr,
++                  uint32_t flags, grant_handle_t handle)
++{
++      unmap->dev_bus_addr = 0;
++      unmap->handle = handle;
++      if (!(flags & GNTMAP_contains_pte) &&
++          xen_feature(XENFEAT_auto_translated_physmap))
++              unmap->host_addr = __pa(addr);
++      else
++              unmap->host_addr = addr;
++}
++
++/* Set up an unmap-and-replace op; __pa() both addresses if auto-translated. */
++static inline void
++gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, maddr_t addr,
++                    maddr_t new_addr, grant_handle_t handle)
++{
++      unmap->handle = handle;
++      if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++              unmap->host_addr = addr;
++              unmap->new_addr = new_addr;
++              return;
++      }
++      unmap->host_addr = __pa(addr);
++      unmap->new_addr = __pa(new_addr);
++}
++
++#define gnttab_check_GNTST_eagain_while(__HCop, __HCarg_p)                    \
++{                                                                             \
++      u8 __hc_delay = 1;                                                      \
++      int __ret;                                                              \
++      while (unlikely((__HCarg_p)->status == GNTST_eagain && __hc_delay)) {   \
++              msleep(__hc_delay++);                                           \
++              __ret = HYPERVISOR_grant_table_op(__HCop, (__HCarg_p), 1);      \
++              BUG_ON(__ret);                                                  \
++      }                                                                       \
++      if (__hc_delay == 0) {                                                  \
++              pr_err("%s: %s gnt busy\n", __func__, current->comm);           \
++              (__HCarg_p)->status = GNTST_bad_page;                           \
++      }                                                                       \
++      if ((__HCarg_p)->status != GNTST_okay)                                  \
++              pr_err("%s: %s gnt status %x\n",                                \
++                      __func__, current->comm, (__HCarg_p)->status);          \
++}
++
++#define gnttab_check_GNTST_eagain_do_while(__HCop, __HCarg_p)                 \
++{                                                                             \
++      u8 __hc_delay = 1;                                                      \
++      int __ret;                                                              \
++      do {                                                                    \
++              __ret = HYPERVISOR_grant_table_op(__HCop, (__HCarg_p), 1);      \
++              BUG_ON(__ret);                                                  \
++              if ((__HCarg_p)->status == GNTST_eagain)                        \
++                      msleep(__hc_delay++);                                   \
++      } while ((__HCarg_p)->status == GNTST_eagain && __hc_delay);            \
++      if (__hc_delay == 0) {                                                  \
++              pr_err("%s: %s gnt busy\n", __func__, current->comm);           \
++              (__HCarg_p)->status = GNTST_bad_page;                           \
++      }                                                                       \
++      if ((__HCarg_p)->status != GNTST_okay)                                  \
++              pr_err("%s: %s gnt status %x\n",                                \
++                      __func__, current->comm, (__HCarg_p)->status);          \
++}
++
++#endif /* __ASM_GNTTAB_H__ */
diff --cc include/xen/hvm.h

index b193fa2,b193fa2..b883740
--- 1/include/xen/hvm.h
--- 2/include/xen/hvm.h
+++ b/include/xen/hvm.h
@@@ -3,7 -3,7 +3,9 @@@
   #define XEN_HVM_H__
   
   #include <xen/interface/hvm/params.h>
++#ifndef HAVE_XEN_PLATFORM_COMPAT_H
   #include <asm/xen/hypercall.h>
++#endif
   
   static inline int hvm_get_parameter(int idx, uint64_t *value)
   {
@@@ -14,8 -14,8 +16,7 @@@
         xhv.index = idx;
         r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
         if (r < 0) {
--              printk(KERN_ERR "Cannot get hvm parameter %d: %d!\n",
--                      idx, r);
++              pr_err("Cannot get hvm parameter %d: %d!\n", idx, r);
                 return r;
         }
         *value = xhv.value;
diff --cc include/xen/hypercall.h

index 0000000,0000000..62071ea

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/hypercall.h
@@@ -1,0 -1,0 +1,30 @@@
++#ifndef __XEN_HYPERCALL_H__
++#define __XEN_HYPERCALL_H__
++
++#include <asm/hypercall.h>
++
++static inline int __must_check
++HYPERVISOR_multicall_check(
++      multicall_entry_t *call_list, unsigned int nr_calls,
++      const unsigned long *rc_list)
++{
++      int rc = HYPERVISOR_multicall(call_list, nr_calls);
++
++      if (unlikely(rc < 0))
++              return rc;
++      BUG_ON(rc);
++      BUG_ON((int)nr_calls < 0);
++
++      for ( ; nr_calls > 0; --nr_calls, ++call_list)
++              if (unlikely(call_list->result != (rc_list ? *rc_list++ : 0)))
++                      return nr_calls;
++
++      return 0;
++}
++
++/* A construct to ignore the return value of hypercall wrappers in a few
++ * exceptional cases (simply casting the function result to void doesn't
++ * avoid the compiler warning): */
++#define VOID(expr) ((void)((expr)?:0))
++
++#endif /* __XEN_HYPERCALL_H__ */
diff --cc include/xen/hypervisor_sysfs.h

index 0000000,0000000..fed48a1

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/hypervisor_sysfs.h
@@@ -1,0 -1,0 +1,30 @@@
++/*
++ *  copyright (c) 2006 IBM Corporation
++ *  Authored by: Mike D. Day <ncmike@us.ibm.com>
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License version 2 as
++ *  published by the Free Software Foundation.
++ */
++
++#ifndef _HYP_SYSFS_H_
++#define _HYP_SYSFS_H_
++
++#include <linux/kobject.h>
++#include <linux/sysfs.h>
++
++#define HYPERVISOR_ATTR_RO(_name) \
++static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
++
++#define HYPERVISOR_ATTR_RW(_name) \
++static struct hyp_sysfs_attr _name##_attr = \
++      __ATTR(_name, 0644, _name##_show, _name##_store)
++
++struct hyp_sysfs_attr {
++      struct attribute attr;
++      ssize_t (*show)(struct hyp_sysfs_attr *, char *);
++      ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
++      void *hyp_attr_data;
++};
++
++#endif /* _HYP_SYSFS_H_ */
diff --cc include/xen/interface/COPYING

index 0000000,0000000..ffc6d61

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/COPYING
@@@ -1,0 -1,0 +1,38 @@@
++XEN NOTICE
++==========
++
++This copyright applies to all files within this subdirectory and its
++subdirectories:
++  include/public/*.h
++  include/public/hvm/*.h
++  include/public/io/*.h
++
++The intention is that these files can be freely copied into the source
++tree of an operating system when porting that OS to run on Xen. Doing
++so does *not* cause the OS to become subject to the terms of the GPL.
++
++All other files in the Xen source distribution are covered by version
++2 of the GNU General Public License except where explicitly stated
++otherwise within individual source files.
++
++ -- Keir Fraser (on behalf of the Xen team)
++
++=====================================================================
++
++Permission is hereby granted, free of charge, to any person obtaining a copy
++of this software and associated documentation files (the "Software"), to
++deal in the Software without restriction, including without limitation the
++rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++sell copies of the Software, and to permit persons to whom the Software is
++furnished to do so, subject to the following conditions:
++
++The above copyright notice and this permission notice shall be included in
++all copies or substantial portions of the Software.
++
++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
++FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
++AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
++LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
++FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
++DEALINGS IN THE SOFTWARE.
diff --cc include/xen/interface/arch-x86/cpuid.h

index 0000000,0000000..d9bd627

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/arch-x86/cpuid.h
@@@ -1,0 -1,0 +1,68 @@@
++/******************************************************************************
++ * arch-x86/cpuid.h
++ * 
++ * CPUID interface to Xen.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ * 
++ * Copyright (c) 2007 Citrix Systems, Inc.
++ * 
++ * Authors:
++ *    Keir Fraser <keir@xen.org>
++ */
++
++#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__
++#define __XEN_PUBLIC_ARCH_X86_CPUID_H__
++
++/* Xen identification leaves start at 0x40000000. */
++#define XEN_CPUID_FIRST_LEAF 0x40000000
++#define XEN_CPUID_LEAF(i)    (XEN_CPUID_FIRST_LEAF + (i))
++
++/*
++ * Leaf 1 (0x40000000)
++ * EAX: Largest Xen-information leaf. All leaves up to and including @EAX
++ *      are supported by the Xen host.
++ * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification
++ *      of a Xen host.
++ */
++#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */
++#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */
++#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */
++
++/*
++ * Leaf 2 (0x40000001)
++ * EAX[31:16]: Xen major version.
++ * EAX[15: 0]: Xen minor version.
++ * EBX-EDX: Reserved (currently all zeroes).
++ */
++
++/*
++ * Leaf 3 (0x40000002)
++ * EAX: Number of hypercall transfer pages. This register is always guaranteed
++ *      to specify one hypercall page.
++ * EBX: Base address of Xen-specific MSRs.
++ * ECX: Features 1. Unused bits are set to zero.
++ * EDX: Features 2. Unused bits are set to zero.
++ */
++
++/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */
++#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
++#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD  (1u<<0)
++
++#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
diff --cc include/xen/interface/arch-x86/hvm/save.h

index 0000000,0000000..0600b12

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/arch-x86/hvm/save.h
@@@ -1,0 -1,0 +1,588 @@@
++/* 
++ * Structure definitions for HVM state that is held by Xen and must
++ * be saved along with the domain's memory and device-model state.
++ * 
++ * Copyright (c) 2007 XenSource Ltd.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_PUBLIC_HVM_SAVE_X86_H__
++#define __XEN_PUBLIC_HVM_SAVE_X86_H__
++
++/* 
++ * Save/restore header: general info about the save file. 
++ */
++
++#define HVM_FILE_MAGIC   0x54381286
++#define HVM_FILE_VERSION 0x00000001
++
++struct hvm_save_header {
++    uint32_t magic;             /* Must be HVM_FILE_MAGIC */
++    uint32_t version;           /* File format version */
++    uint64_t changeset;         /* Version of Xen that saved this file */
++    uint32_t cpuid;             /* CPUID[0x01][%eax] on the saving machine */
++    uint32_t gtsc_khz;        /* Guest's TSC frequency in kHz */
++};
++
++DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);
++
++
++/*
++ * Processor
++ *
++ * Compat: Pre-3.4 didn't have msr_tsc_aux
++ */
++
++struct hvm_hw_cpu {
++    uint8_t  fpu_regs[512];
++
++    uint64_t rax;
++    uint64_t rbx;
++    uint64_t rcx;
++    uint64_t rdx;
++    uint64_t rbp;
++    uint64_t rsi;
++    uint64_t rdi;
++    uint64_t rsp;
++    uint64_t r8;
++    uint64_t r9;
++    uint64_t r10;
++    uint64_t r11;
++    uint64_t r12;
++    uint64_t r13;
++    uint64_t r14;
++    uint64_t r15;
++
++    uint64_t rip;
++    uint64_t rflags;
++
++    uint64_t cr0;
++    uint64_t cr2;
++    uint64_t cr3;
++    uint64_t cr4;
++
++    uint64_t dr0;
++    uint64_t dr1;
++    uint64_t dr2;
++    uint64_t dr3;
++    uint64_t dr6;
++    uint64_t dr7;    
++
++    uint32_t cs_sel;
++    uint32_t ds_sel;
++    uint32_t es_sel;
++    uint32_t fs_sel;
++    uint32_t gs_sel;
++    uint32_t ss_sel;
++    uint32_t tr_sel;
++    uint32_t ldtr_sel;
++
++    uint32_t cs_limit;
++    uint32_t ds_limit;
++    uint32_t es_limit;
++    uint32_t fs_limit;
++    uint32_t gs_limit;
++    uint32_t ss_limit;
++    uint32_t tr_limit;
++    uint32_t ldtr_limit;
++    uint32_t idtr_limit;
++    uint32_t gdtr_limit;
++
++    uint64_t cs_base;
++    uint64_t ds_base;
++    uint64_t es_base;
++    uint64_t fs_base;
++    uint64_t gs_base;
++    uint64_t ss_base;
++    uint64_t tr_base;
++    uint64_t ldtr_base;
++    uint64_t idtr_base;
++    uint64_t gdtr_base;
++
++    uint32_t cs_arbytes;
++    uint32_t ds_arbytes;
++    uint32_t es_arbytes;
++    uint32_t fs_arbytes;
++    uint32_t gs_arbytes;
++    uint32_t ss_arbytes;
++    uint32_t tr_arbytes;
++    uint32_t ldtr_arbytes;
++
++    uint64_t sysenter_cs;
++    uint64_t sysenter_esp;
++    uint64_t sysenter_eip;
++
++    /* msr for em64t */
++    uint64_t shadow_gs;
++
++    /* msr content saved/restored. */
++    uint64_t msr_flags;
++    uint64_t msr_lstar;
++    uint64_t msr_star;
++    uint64_t msr_cstar;
++    uint64_t msr_syscall_mask;
++    uint64_t msr_efer;
++    uint64_t msr_tsc_aux;
++
++    /* guest's idea of what rdtsc() would return */
++    uint64_t tsc;
++
++    /* pending event, if any */
++    union {
++        uint32_t pending_event;
++        struct {
++            uint8_t  pending_vector:8;
++            uint8_t  pending_type:3;
++            uint8_t  pending_error_valid:1;
++            uint32_t pending_reserved:19;
++            uint8_t  pending_valid:1;
++        };
++    };
++    /* error code for pending event */
++    uint32_t error_code;
++};
++
++struct hvm_hw_cpu_compat {
++    uint8_t  fpu_regs[512];
++
++    uint64_t rax;
++    uint64_t rbx;
++    uint64_t rcx;
++    uint64_t rdx;
++    uint64_t rbp;
++    uint64_t rsi;
++    uint64_t rdi;
++    uint64_t rsp;
++    uint64_t r8;
++    uint64_t r9;
++    uint64_t r10;
++    uint64_t r11;
++    uint64_t r12;
++    uint64_t r13;
++    uint64_t r14;
++    uint64_t r15;
++
++    uint64_t rip;
++    uint64_t rflags;
++
++    uint64_t cr0;
++    uint64_t cr2;
++    uint64_t cr3;
++    uint64_t cr4;
++
++    uint64_t dr0;
++    uint64_t dr1;
++    uint64_t dr2;
++    uint64_t dr3;
++    uint64_t dr6;
++    uint64_t dr7;    
++
++    uint32_t cs_sel;
++    uint32_t ds_sel;
++    uint32_t es_sel;
++    uint32_t fs_sel;
++    uint32_t gs_sel;
++    uint32_t ss_sel;
++    uint32_t tr_sel;
++    uint32_t ldtr_sel;
++
++    uint32_t cs_limit;
++    uint32_t ds_limit;
++    uint32_t es_limit;
++    uint32_t fs_limit;
++    uint32_t gs_limit;
++    uint32_t ss_limit;
++    uint32_t tr_limit;
++    uint32_t ldtr_limit;
++    uint32_t idtr_limit;
++    uint32_t gdtr_limit;
++
++    uint64_t cs_base;
++    uint64_t ds_base;
++    uint64_t es_base;
++    uint64_t fs_base;
++    uint64_t gs_base;
++    uint64_t ss_base;
++    uint64_t tr_base;
++    uint64_t ldtr_base;
++    uint64_t idtr_base;
++    uint64_t gdtr_base;
++
++    uint32_t cs_arbytes;
++    uint32_t ds_arbytes;
++    uint32_t es_arbytes;
++    uint32_t fs_arbytes;
++    uint32_t gs_arbytes;
++    uint32_t ss_arbytes;
++    uint32_t tr_arbytes;
++    uint32_t ldtr_arbytes;
++
++    uint64_t sysenter_cs;
++    uint64_t sysenter_esp;
++    uint64_t sysenter_eip;
++
++    /* msr for em64t */
++    uint64_t shadow_gs;
++
++    /* msr content saved/restored. */
++    uint64_t msr_flags;
++    uint64_t msr_lstar;
++    uint64_t msr_star;
++    uint64_t msr_cstar;
++    uint64_t msr_syscall_mask;
++    uint64_t msr_efer;
++    /*uint64_t msr_tsc_aux; COMPAT */
++
++    /* guest's idea of what rdtsc() would return */
++    uint64_t tsc;
++
++    /* pending event, if any */
++    union {
++        uint32_t pending_event;
++        struct {
++            uint8_t  pending_vector:8;
++            uint8_t  pending_type:3;
++            uint8_t  pending_error_valid:1;
++            uint32_t pending_reserved:19;
++            uint8_t  pending_valid:1;
++        };
++    };
++    /* error code for pending event */
++    uint32_t error_code;
++};
++
++static inline int _hvm_hw_fix_cpu(void *h) {
++    struct hvm_hw_cpu *new=h;          /* current layout, with msr_tsc_aux */
++    struct hvm_hw_cpu_compat *old=h;   /* same buffer, layout without it */
++    /* NOTE(review): presumably h has room for struct hvm_hw_cpu - confirm. */
++    /* Fields after msr_efer sit one uint64_t later in the new layout; move
++     * them last-to-first so each old value is read before it is overwritten. */
++    new->error_code=old->error_code;
++    new->pending_event=old->pending_event;
++    new->tsc=old->tsc;
++    new->msr_tsc_aux=0;                /* not present in the old record */
++
++    return 0;
++}
++
++DECLARE_HVM_SAVE_TYPE_COMPAT(CPU, 2, struct hvm_hw_cpu, \
++                             struct hvm_hw_cpu_compat, _hvm_hw_fix_cpu);
++
++/*
++ * PIC
++ */
++
++struct hvm_hw_vpic {
++    /* IR line bitmasks. */
++    uint8_t irr;
++    uint8_t imr;
++    uint8_t isr;
++
++    /* Line IRx maps to IRQ irq_base+x */
++    uint8_t irq_base;
++
++    /*
++     * Where are we in ICW2-4 initialisation (0 means no init in progress)?
++     * Bits 0-1 (=x): Next write at A=1 sets ICW(x+1).
++     * Bit 2: ICW1.IC4  (1 == ICW4 included in init sequence)
++     * Bit 3: ICW1.SNGL (0 == ICW3 included in init sequence)
++     */
++    uint8_t init_state:4;
++
++    /* IR line with highest priority. */
++    uint8_t priority_add:4;
++
++    /* Reads from A=0 obtain ISR or IRR? */
++    uint8_t readsel_isr:1;
++
++    /* Reads perform a polling read? */
++    uint8_t poll:1;
++
++    /* Automatically clear IRQs from the ISR during INTA? */
++    uint8_t auto_eoi:1;
++
++    /* Automatically rotate IRQ priorities during AEOI? */
++    uint8_t rotate_on_auto_eoi:1;
++
++    /* Exclude slave inputs when considering in-service IRQs? */
++    uint8_t special_fully_nested_mode:1;
++
++    /* Special mask mode excludes masked IRs from AEOI and priority checks. */
++    uint8_t special_mask_mode:1;
++
++    /* Is this a master PIC or slave PIC? (NB. This is not programmable.) */
++    uint8_t is_master:1;
++
++    /* Edge/level trigger selection. */
++    uint8_t elcr;
++
++    /* Virtual INT output. */
++    uint8_t int_output;
++};
++
++DECLARE_HVM_SAVE_TYPE(PIC, 3, struct hvm_hw_vpic);
++
++
++/*
++ * IO-APIC
++ */
++
++#ifdef __ia64__
++#define VIOAPIC_IS_IOSAPIC 1
++#define VIOAPIC_NUM_PINS  24
++#else
++#define VIOAPIC_NUM_PINS  48 /* 16 ISA IRQs, 32 non-legacy PCI IRQS. */
++#endif
++
++struct hvm_hw_vioapic {
++    uint64_t base_address;
++    uint32_t ioregsel;
++    uint32_t id;
++    union vioapic_redir_entry
++    {
++        uint64_t bits;
++        struct {
++            uint8_t vector;
++            uint8_t delivery_mode:3;
++            uint8_t dest_mode:1;
++            uint8_t delivery_status:1;
++            uint8_t polarity:1;
++            uint8_t remote_irr:1;
++            uint8_t trig_mode:1;
++            uint8_t mask:1;
++            uint8_t reserve:7;
++#if !VIOAPIC_IS_IOSAPIC
++            uint8_t reserved[4];
++            uint8_t dest_id;
++#else
++            uint8_t reserved[3];
++            uint16_t dest_id;
++#endif
++        } fields;
++    } redirtbl[VIOAPIC_NUM_PINS];
++};
++
++DECLARE_HVM_SAVE_TYPE(IOAPIC, 4, struct hvm_hw_vioapic);
++
++
++/*
++ * LAPIC
++ */
++
++struct hvm_hw_lapic {
++    uint64_t             apic_base_msr;
++    uint32_t             disabled; /* VLAPIC_xx_DISABLED */
++    uint32_t             timer_divisor;
++    uint64_t             tdt_msr;
++};
++
++DECLARE_HVM_SAVE_TYPE(LAPIC, 5, struct hvm_hw_lapic);
++
++struct hvm_hw_lapic_regs {
++    uint8_t data[1024];
++};
++
++DECLARE_HVM_SAVE_TYPE(LAPIC_REGS, 6, struct hvm_hw_lapic_regs);
++
++
++/*
++ * IRQs
++ */
++
++struct hvm_hw_pci_irqs {
++    /*
++     * Virtual interrupt wires for a single PCI bus.
++     * Indexed by: device*4 + INTx#.
++     */
++    union {
++        unsigned long i[16 / sizeof (unsigned long)]; /* DECLARE_BITMAP(i, 32*4); */
++        uint64_t pad[2];
++    };
++};
++
++DECLARE_HVM_SAVE_TYPE(PCI_IRQ, 7, struct hvm_hw_pci_irqs);
++
++struct hvm_hw_isa_irqs {
++    /*
++     * Virtual interrupt wires for ISA devices.
++     * Indexed by ISA IRQ (assumes no ISA-device IRQ sharing).
++     */
++    union {
++        unsigned long i[1];  /* DECLARE_BITMAP(i, 16); */
++        uint64_t pad[1];
++    };
++};
++
++DECLARE_HVM_SAVE_TYPE(ISA_IRQ, 8, struct hvm_hw_isa_irqs);
++
++struct hvm_hw_pci_link {
++    /*
++     * PCI-ISA interrupt router.
++     * Each PCI <device:INTx#> is 'wire-ORed' into one of four links using
++     * the traditional 'barber's pole' mapping ((device + INTx#) & 3).
++     * The router provides a programmable mapping from each link to a GSI.
++     */
++    uint8_t route[4];
++    uint8_t pad0[4];
++};
++
++DECLARE_HVM_SAVE_TYPE(PCI_LINK, 9, struct hvm_hw_pci_link);
++
++/* 
++ *  PIT
++ */
++
++struct hvm_hw_pit {
++    struct hvm_hw_pit_channel {
++        uint32_t count; /* can be 65536 */
++        uint16_t latched_count;
++        uint8_t count_latched;
++        uint8_t status_latched;
++        uint8_t status;
++        uint8_t read_state;
++        uint8_t write_state;
++        uint8_t write_latch;
++        uint8_t rw_mode;
++        uint8_t mode;
++        uint8_t bcd; /* not supported */
++        uint8_t gate; /* timer start */
++    } channels[3];  /* 3 x 16 bytes */
++    uint32_t speaker_data_on;
++    uint32_t pad0;
++};
++
++DECLARE_HVM_SAVE_TYPE(PIT, 10, struct hvm_hw_pit);
++
++
++/* 
++ * RTC
++ */ 
++
++#define RTC_CMOS_SIZE 14
++struct hvm_hw_rtc {
++    /* CMOS bytes */
++    uint8_t cmos_data[RTC_CMOS_SIZE];
++    /* Index register for 2-part operations */
++    uint8_t cmos_index;
++    uint8_t pad0;
++};
++
++DECLARE_HVM_SAVE_TYPE(RTC, 11, struct hvm_hw_rtc);
++
++
++/*
++ * HPET
++ */
++
++#define HPET_TIMER_NUM     3    /* 3 timers supported now */
++struct hvm_hw_hpet {
++    /* Memory-mapped, software visible registers */
++    uint64_t capability;        /* capabilities */
++    uint64_t res0;              /* reserved */
++    uint64_t config;            /* configuration */
++    uint64_t res1;              /* reserved */
++    uint64_t isr;               /* interrupt status reg */
++    uint64_t res2[25];          /* reserved */
++    uint64_t mc64;              /* main counter */
++    uint64_t res3;              /* reserved */
++    struct {                    /* timers */
++        uint64_t config;        /* configuration/cap */
++        uint64_t cmp;           /* comparator */
++        uint64_t fsb;           /* FSB route, not supported now */
++        uint64_t res4;          /* reserved */
++    } timers[HPET_TIMER_NUM];
++    uint64_t res5[4*(24-HPET_TIMER_NUM)];  /* reserved, up to 0x3ff */
++
++    /* Hidden register state */
++    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
++};
++
++DECLARE_HVM_SAVE_TYPE(HPET, 12, struct hvm_hw_hpet);
++
++
++/*
++ * PM timer
++ */
++
++struct hvm_hw_pmtimer {
++    uint32_t tmr_val;   /* PM_TMR_BLK.TMR_VAL: 32bit free-running counter */
++    uint16_t pm1a_sts;  /* PM1a_EVT_BLK.PM1a_STS: status register */
++    uint16_t pm1a_en;   /* PM1a_EVT_BLK.PM1a_EN: enable register */
++};
++
++DECLARE_HVM_SAVE_TYPE(PMTIMER, 13, struct hvm_hw_pmtimer);
++
++/*
++ * MTRR MSRs
++ */
++
++struct hvm_hw_mtrr {
++#define MTRR_VCNT 8
++#define NUM_FIXED_MSR 11
++    uint64_t msr_pat_cr;
++    /* mtrr physbase & physmask msr pair*/
++    uint64_t msr_mtrr_var[MTRR_VCNT*2];
++    uint64_t msr_mtrr_fixed[NUM_FIXED_MSR];
++    uint64_t msr_mtrr_cap;
++    uint64_t msr_mtrr_def_type;
++};
++
++DECLARE_HVM_SAVE_TYPE(MTRR, 14, struct hvm_hw_mtrr);
++
++/*
++ * Viridian hypervisor context.
++ */
++
++struct hvm_viridian_context {
++    uint64_t hypercall_gpa;
++    uint64_t guest_os_id;
++};
++
++DECLARE_HVM_SAVE_TYPE(VIRIDIAN, 15, struct hvm_viridian_context);
++
++
++/*
++ * The save area of XSAVE/XRSTOR.
++ */
++
++struct hvm_hw_cpu_xsave {
++    uint64_t xfeature_mask;
++    uint64_t xcr0;                 /* Updated by XSETBV */
++    uint64_t xcr0_accum;           /* Updated by XSETBV */
++    struct {
++        struct { char x[512]; } fpu_sse;
++
++        struct {
++            uint64_t xstate_bv;         /* Updated by XRSTOR */
++            uint64_t reserved[7];
++        } xsave_hdr;                    /* The 64-byte header */
++
++        struct { char x[0]; } ymm;    /* YMM */
++    } save_area;
++} __attribute__((packed));
++
++#define CPU_XSAVE_CODE  16
++
++/* 
++ * Largest type-code in use
++ */
++#define HVM_SAVE_CODE_MAX 16
++
++#endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
diff --cc include/xen/interface/arch-x86/xen-mca.h

index 0000000,0000000..dca6b3e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/arch-x86/xen-mca.h
@@@ -1,0 -1,0 +1,440 @@@
++/******************************************************************************
++ * arch-x86/mca.h
++ * 
++ * Contributed by Advanced Micro Devices, Inc.
++ * Author: Christoph Egger <Christoph.Egger@amd.com>
++ *
++ * Guest OS machine check interface to x86 Xen.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++/* Full MCA functionality has the following Usecases from the guest side:
++ *
++ * Must have's:
++ * 1. Dom0 and DomU register machine check trap callback handlers
++ *    (already done via "set_trap_table" hypercall)
++ * 2. Dom0 registers machine check event callback handler
++ *    (doable via EVTCHNOP_bind_virq)
++ * 3. Dom0 and DomU fetches machine check data
++ * 4. Dom0 wants Xen to notify a DomU
++ * 5. Dom0 gets DomU ID from physical address
++ * 6. Dom0 wants Xen to kill DomU (already done for "xm destroy")
++ *
++ * Nice to have's:
++ * 7. Dom0 wants Xen to deactivate a physical CPU
++ *    This is better done as separate task, physical CPU hotplugging,
++ *    and hypercall(s) should be sysctl's
++ * 8. Page migration proposed from Xen NUMA work, where Dom0 can tell Xen to
++ *    move a DomU (or Dom0 itself) away from a malicious page
++ *    producing correctable errors.
++ * 9. offlining physical page:
++ *    Xen free's and never re-uses a certain physical page.
++ * 10. Testfacility: Allow Dom0 to write values into machine check MSR's
++ *     and tell Xen to trigger a machine check
++ */
++
++#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
++#define __XEN_PUBLIC_ARCH_X86_MCA_H__
++
++/* Hypercall */
++#define __HYPERVISOR_mca __HYPERVISOR_arch_0
++
++/*
++ * The xen-unstable repo has interface version 0x03000001; our interface
++ * is incompatible with that and any future minor revisions, so we
++ * choose a different version number range that is numerically less
++ * than that used in xen-unstable.
++ */
++#define XEN_MCA_INTERFACE_VERSION 0x01ecc003
++
++/* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */
++#define XEN_MC_NONURGENT  0x0001
++/* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */
++#define XEN_MC_URGENT     0x0002
++/* IN: Dom0 acknowledges previously-fetched telemetry */
++#define XEN_MC_ACK        0x0004
++
++/* OUT: All is ok */
++#define XEN_MC_OK           0x0
++/* OUT: Domain could not fetch data. */
++#define XEN_MC_FETCHFAILED  0x1
++/* OUT: There was no machine check data to fetch. */
++#define XEN_MC_NODATA       0x2
++/* OUT: Between notification time and this hypercall another
++ *  (most likely) correctable error happened. The fetched data
++ *  does not match the original machine check data. */
++#define XEN_MC_NOMATCH      0x4
++
++/* OUT: DomU did not register MC NMI handler. Try something else. */
++#define XEN_MC_CANNOTHANDLE 0x8
++/* OUT: Notifying DomU failed. Retry later or try something else. */
++#define XEN_MC_NOTDELIVERED 0x10
++/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */
++
++
++#ifndef __ASSEMBLY__
++
++#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
++
++/*
++ * Machine Check Architecture:
++ * structs are read-only and used to report all kinds of
++ * correctable and uncorrectable errors detected by the HW.
++ * Dom0 and DomU: register a handler to get notified.
++ * Dom0 only: Correctable errors are reported via VIRQ_MCA
++ * Dom0 and DomU: Uncorrectable errors are reported via nmi handlers
++ */
++#define MC_TYPE_GLOBAL          0
++#define MC_TYPE_BANK            1
++#define MC_TYPE_EXTENDED        2
++#define MC_TYPE_RECOVERY        3
++
++struct mcinfo_common {
++    uint16_t type;      /* structure type */
++    uint16_t size;      /* size of this struct in bytes */
++};
++
++
++#define MC_FLAG_CORRECTABLE     (1 << 0)
++#define MC_FLAG_UNCORRECTABLE   (1 << 1)
++#define MC_FLAG_RECOVERABLE   (1 << 2)
++#define MC_FLAG_POLLED                (1 << 3)
++#define MC_FLAG_RESET         (1 << 4)
++#define MC_FLAG_CMCI          (1 << 5)
++#define MC_FLAG_MCE           (1 << 6)
++/* contains global x86 mc information */
++struct mcinfo_global {
++    struct mcinfo_common common;
++
++    /* running domain at the time in error (most likely the impacted one) */
++    uint16_t mc_domid;
++    uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
++    uint32_t mc_socketid; /* physical socket of the physical core */
++    uint16_t mc_coreid; /* physical impacted core */
++    uint16_t mc_core_threadid; /* core thread of physical core */
++    uint32_t mc_apicid;
++    uint32_t mc_flags;
++    uint64_t mc_gstatus; /* global status */
++};
++
++/* contains bank local x86 mc information */
++struct mcinfo_bank {
++    struct mcinfo_common common;
++
++    uint16_t mc_bank; /* bank nr */
++    uint16_t mc_domid; /* Usecase 5: domain referenced by mc_addr on dom0
++                        * and if mc_addr is valid. Never valid on DomU. */
++    uint64_t mc_status; /* bank status */
++    uint64_t mc_addr;   /* bank address, only valid
++                         * if addr bit is set in mc_status */
++    uint64_t mc_misc;
++    uint64_t mc_ctrl2;
++    uint64_t mc_tsc;
++};
++
++
++struct mcinfo_msr {
++    uint64_t reg;   /* MSR */
++    uint64_t value; /* MSR value */
++};
++
++/* contains mc information from other
++ * or additional mc MSRs */ 
++struct mcinfo_extended {
++    struct mcinfo_common common;
++
++    /* You can fill up to sizeof(void *) * 4 registers (16 or 32).
++     * If you need more, then use this structure
++     * multiple times. */
++
++    uint32_t mc_msrs; /* Number of msr with valid values. */
++    /*
++     * Currently Intel extended MSR (32/64) include all gp registers
++     * and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be
++     * useful at present. So expand this array to 16/32 to leave room.
++     */
++    struct mcinfo_msr mc_msr[sizeof(void *) * 4];
++};
++
++/* Recovery Action flags. Giving recovery result information to DOM0 */
++
++/* Xen takes successful recovery action, the error is recovered */
++#define REC_ACTION_RECOVERED (0x1 << 0)
++/* No action is performed by XEN */
++#define REC_ACTION_NONE (0x1 << 1)
++/* It's possible DOM0 might take action ownership in some case */
++#define REC_ACTION_NEED_RESET (0x1 << 2)
++
++/* Different Recovery Action types, if the action is performed successfully,
++ * REC_ACTION_RECOVERED flag will be returned.
++ */
++
++/* Page Offline Action */
++#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
++/* CPU offline Action */
++#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
++/* L3 cache disable Action */
++#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
++
++/* Below interface used between XEN/DOM0 for passing XEN's recovery action 
++ * information to DOM0. 
++ * Usage scenario: After offlining a broken page, XEN might pass its page offline
++ * recovery action result to DOM0. DOM0 will save the information in 
++ * non-volatile memory for further proactive actions, such as offlining the
++ * easy broken page earlier when doing next reboot.
++*/
++struct page_offline_action
++{
++    /* Params for passing the offlined page number to DOM0 */
++    uint64_t mfn;
++    uint64_t status;
++};
++
++struct cpu_offline_action
++{
++    /* Params for passing the identity of the offlined CPU to DOM0 */
++    uint32_t mc_socketid;
++    uint16_t mc_coreid;
++    uint16_t mc_core_threadid;
++};
++
++#define MAX_UNION_SIZE 16
++struct mcinfo_recovery
++{
++    struct mcinfo_common common;
++    uint16_t mc_bank; /* bank nr */
++    uint8_t action_flags;
++    uint8_t action_types;
++    union {
++        struct page_offline_action page_retire;
++        struct cpu_offline_action cpu_offline;
++        uint8_t pad[MAX_UNION_SIZE];
++    } action_info;
++};
++
++
++#define MCINFO_HYPERCALLSIZE  1024
++#define MCINFO_MAXSIZE                768
++
++#define MCINFO_FLAGS_UNCOMPLETE 0x1
++struct mc_info {
++    /* Number of mcinfo_* entries in mi_data */
++    uint32_t mi_nentries;
++    uint32_t flags;
++    uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8];
++};
++typedef struct mc_info mc_info_t;
++DEFINE_XEN_GUEST_HANDLE(mc_info_t);
++
++#define __MC_MSR_ARRAYSIZE 8
++#define __MC_NMSRS 1
++#define MC_NCAPS      7       /* 7 CPU feature flag words */
++#define MC_CAPS_STD_EDX       0       /* cpuid level 0x00000001 (%edx) */
++#define MC_CAPS_AMD_EDX       1       /* cpuid level 0x80000001 (%edx) */
++#define MC_CAPS_TM    2       /* cpuid level 0x80860001 (TransMeta) */
++#define MC_CAPS_LINUX 3       /* Linux-defined */
++#define MC_CAPS_STD_ECX       4       /* cpuid level 0x00000001 (%ecx) */
++#define MC_CAPS_VIA   5       /* cpuid level 0xc0000001 */
++#define MC_CAPS_AMD_ECX       6       /* cpuid level 0x80000001 (%ecx) */
++
++struct mcinfo_logical_cpu {
++    uint32_t mc_cpunr;          
++    uint32_t mc_chipid; 
++    uint16_t mc_coreid;
++    uint16_t mc_threadid;
++    uint32_t mc_apicid;
++    uint32_t mc_clusterid;
++    uint32_t mc_ncores;
++    uint32_t mc_ncores_active;
++    uint32_t mc_nthreads;
++    int32_t mc_cpuid_level;
++    uint32_t mc_family;
++    uint32_t mc_vendor;
++    uint32_t mc_model;
++    uint32_t mc_step;
++    char mc_vendorid[16];
++    char mc_brandid[64];
++    uint32_t mc_cpu_caps[MC_NCAPS];
++    uint32_t mc_cache_size;
++    uint32_t mc_cache_alignment;
++    int32_t mc_nmsrvals;
++    struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE];
++};
++typedef struct mcinfo_logical_cpu xen_mc_logical_cpu_t;
++DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_cpu_t);
++
++
++/* 
++ * OS's should use these instead of writing their own lookup function
++ * each with its own bugs and drawbacks.
++ * We use macros instead of static inline functions to allow guests
++ * to include this header in assembly files (*.S).
++ */
++/* Prototype:
++ *    uint32_t x86_mcinfo_nentries(struct mc_info *mi);
++ */
++#define x86_mcinfo_nentries(_mi)    \
++    (_mi)->mi_nentries
++/* Prototype:
++ *    struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
++ */
++#define x86_mcinfo_first(_mi)       \
++    ((struct mcinfo_common *)(_mi)->mi_data)
++/* Prototype:
++ *    struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
++ */
++#define x86_mcinfo_next(_mic)       \
++    ((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size))
++
++/* Prototype:
++ *    void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
++ */
++#define x86_mcinfo_lookup(_ret, _mi, _type)    \
++    do {                                                        \
++        uint32_t found, i;                                      \
++        struct mcinfo_common *_mic;                             \
++                                                                \
++        found = 0;                                              \
++      (_ret) = NULL;                                          \
++      if (_mi == NULL) break;                                 \
++        _mic = x86_mcinfo_first(_mi);                           \
++        for (i = 0; i < x86_mcinfo_nentries(_mi); i++) {        \
++            if (_mic->type == (_type)) {                        \
++                found = 1;                                      \
++                break;                                          \
++            }                                                   \
++            _mic = x86_mcinfo_next(_mic);                       \
++        }                                                       \
++        (_ret) = found ? _mic : NULL;                           \
++    } while (0)
++
++
++/* Usecase 1
++ * Register machine check trap callback handler
++ *    (already done via "set_trap_table" hypercall)
++ */
++
++/* Usecase 2
++ * Dom0 registers machine check event callback handler
++ * done by EVTCHNOP_bind_virq
++ */
++
++/* Usecase 3
++ * Fetch machine check data from hypervisor.
++ * Note, this hypercall is special, because both Dom0 and DomU must use this.
++ */
++#define XEN_MC_fetch            1
++struct xen_mc_fetch {
++    /* IN/OUT variables. */
++    uint32_t flags;   /* IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
++                           XEN_MC_ACK if ack'ing an earlier fetch */
++                      /* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
++                         XEN_MC_NODATA, XEN_MC_NOMATCH */
++    uint32_t _pad0;
++    uint64_t fetch_id;        /* OUT: id for ack, IN: id we are ack'ing */
++
++    /* OUT variables. */
++    XEN_GUEST_HANDLE(mc_info_t) data;
++};
++typedef struct xen_mc_fetch xen_mc_fetch_t;
++DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
++
++
++/* Usecase 4
++ * This tells the hypervisor to notify a DomU about the machine check error
++ */
++#define XEN_MC_notifydomain     2
++struct xen_mc_notifydomain {
++    /* IN variables. */
++    uint16_t mc_domid;    /* The unprivileged domain to notify. */
++    uint16_t mc_vcpuid;   /* The vcpu in mc_domid to notify.
++                           * Usually echo'd value from the fetch hypercall. */
++
++    /* IN/OUT variables. */
++    uint32_t flags;
++
++/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
++/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */
++};
++typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
++DEFINE_XEN_GUEST_HANDLE(xen_mc_notifydomain_t);
++
++#define XEN_MC_physcpuinfo 3
++struct xen_mc_physcpuinfo {
++      /* IN/OUT */
++      uint32_t ncpus;
++      uint32_t _pad0;
++      /* OUT */
++      XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info;
++};
++
++#define XEN_MC_msrinject    4
++#define MC_MSRINJ_MAXMSRS       8
++struct xen_mc_msrinject {
++       /* IN */
++      uint32_t mcinj_cpunr;           /* target processor id */
++      uint32_t mcinj_flags;           /* see MC_MSRINJ_F_* below */
++      uint32_t mcinj_count;           /* 0 .. count-1 in array are valid */
++      uint32_t _pad0;
++      struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS];
++};
++
++/* Flags for mcinj_flags above; bits 16-31 are reserved */
++#define MC_MSRINJ_F_INTERPOSE   0x1
++
++#define XEN_MC_mceinject    5
++struct xen_mc_mceinject {
++      unsigned int mceinj_cpunr;      /* target processor id */
++};
++
++#if defined(__XEN__) || defined(__XEN_TOOLS__)
++#define XEN_MC_inject_v2        6
++#define XEN_MC_INJECT_TYPE_MASK     0x7
++#define XEN_MC_INJECT_TYPE_MCE      0x0
++#define XEN_MC_INJECT_TYPE_CMCI     0x1
++
++#define XEN_MC_INJECT_CPU_BROADCAST 0x8
++
++struct xen_mc_inject_v2 {
++      uint32_t flags;
++      struct xenctl_cpumap cpumap;
++};
++#endif
++
++struct xen_mc {
++    uint32_t cmd;
++    uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
++    union {
++        struct xen_mc_fetch        mc_fetch;
++        struct xen_mc_notifydomain mc_notifydomain;
++        struct xen_mc_physcpuinfo  mc_physcpuinfo;
++        struct xen_mc_msrinject    mc_msrinject;
++        struct xen_mc_mceinject    mc_mceinject;
++#if defined(__XEN__) || defined(__XEN_TOOLS__)
++        struct xen_mc_inject_v2    mc_inject_v2;
++#endif
++    } u;
++};
++typedef struct xen_mc xen_mc_t;
++DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
++
++#endif /* __ASSEMBLY__ */
++
++#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
diff --cc include/xen/interface/arch-x86/xen-x86_32.h

index 0000000,0000000..de584ea

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/arch-x86/xen-x86_32.h
@@@ -1,0 -1,0 +1,171 @@@
++/******************************************************************************
++ * xen-x86_32.h
++ * 
++ * Guest OS interface to x86 32-bit Xen.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2004-2007, K A Fraser
++ */
++
++#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
++#define __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
++
++/*
++ * Hypercall interface:
++ *  Input:  %ebx, %ecx, %edx, %esi, %edi (arguments 1-5)
++ *  Output: %eax
++ * Access is via hypercall page (set up by guest loader or via a Xen MSR):
++ *  call hypercall_page + hypercall-number * 32
++ * Clobbered: Argument registers (e.g., 2-arg hypercall clobbers %ebx,%ecx)
++ */
++
++/*
++ * These flat segments are in the Xen-private section of every GDT. Since these
++ * are also present in the initial GDT, many OSes will be able to avoid
++ * installing their own GDT.
++ */
++#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
++#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
++#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
++#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
++#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
++#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
++
++#define FLAT_KERNEL_CS FLAT_RING1_CS
++#define FLAT_KERNEL_DS FLAT_RING1_DS
++#define FLAT_KERNEL_SS FLAT_RING1_SS
++#define FLAT_USER_CS    FLAT_RING3_CS
++#define FLAT_USER_DS    FLAT_RING3_DS
++#define FLAT_USER_SS    FLAT_RING3_SS
++
++#define __HYPERVISOR_VIRT_START_PAE    0xF5800000
++#define __MACH2PHYS_VIRT_START_PAE     0xF5800000
++#define __MACH2PHYS_VIRT_END_PAE       0xF6800000
++#define HYPERVISOR_VIRT_START_PAE      \
++    mk_unsigned_long(__HYPERVISOR_VIRT_START_PAE)
++#define MACH2PHYS_VIRT_START_PAE       \
++    mk_unsigned_long(__MACH2PHYS_VIRT_START_PAE)
++#define MACH2PHYS_VIRT_END_PAE         \
++    mk_unsigned_long(__MACH2PHYS_VIRT_END_PAE)
++
++/* Non-PAE bounds are obsolete. */
++#define __HYPERVISOR_VIRT_START_NONPAE 0xFC000000
++#define __MACH2PHYS_VIRT_START_NONPAE  0xFC000000
++#define __MACH2PHYS_VIRT_END_NONPAE    0xFC400000
++#define HYPERVISOR_VIRT_START_NONPAE   \
++    mk_unsigned_long(__HYPERVISOR_VIRT_START_NONPAE)
++#define MACH2PHYS_VIRT_START_NONPAE    \
++    mk_unsigned_long(__MACH2PHYS_VIRT_START_NONPAE)
++#define MACH2PHYS_VIRT_END_NONPAE      \
++    mk_unsigned_long(__MACH2PHYS_VIRT_END_NONPAE)
++
++#define __HYPERVISOR_VIRT_START __HYPERVISOR_VIRT_START_PAE
++#define __MACH2PHYS_VIRT_START  __MACH2PHYS_VIRT_START_PAE
++#define __MACH2PHYS_VIRT_END    __MACH2PHYS_VIRT_END_PAE
++
++#ifndef HYPERVISOR_VIRT_START
++#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
++#endif
++
++#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
++#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
++#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>2)
++#ifndef machine_to_phys_mapping
++#define machine_to_phys_mapping ((unsigned long *)MACH2PHYS_VIRT_START)
++#endif
++
++/* 32-/64-bit invariability for control interfaces (domctl/sysctl). */
++#if defined(__XEN__) || defined(__XEN_TOOLS__)
++#undef ___DEFINE_XEN_GUEST_HANDLE
++#define ___DEFINE_XEN_GUEST_HANDLE(name, type)                  \
++    typedef struct { type *p; }                                 \
++        __guest_handle_ ## name;                                \
++    typedef struct { union { type *p; uint64_aligned_t q; }; }  \
++        __guest_handle_64_ ## name
++#undef set_xen_guest_handle_raw
++#define set_xen_guest_handle_raw(hnd, val)                  \
++    do { if ( sizeof(hnd) == 8 ) *(uint64_t *)&(hnd) = 0;   \
++         (hnd).p = val;                                     \
++    } while ( 0 )
++#define uint64_aligned_t uint64_t __attribute__((aligned(8)))
++#define __XEN_GUEST_HANDLE_64(name) __guest_handle_64_ ## name
++#define XEN_GUEST_HANDLE_64(name) __XEN_GUEST_HANDLE_64(name)
++#endif
++
++#ifndef __ASSEMBLY__
++
++struct cpu_user_regs {
++    uint32_t ebx;
++    uint32_t ecx;
++    uint32_t edx;
++    uint32_t esi;
++    uint32_t edi;
++    uint32_t ebp;
++    uint32_t eax;
++    uint16_t error_code;    /* private */
++    uint16_t entry_vector;  /* private */
++    uint32_t eip;
++    uint16_t cs;
++    uint8_t  saved_upcall_mask;
++    uint8_t  _pad0;
++    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
++    uint32_t esp;
++    uint16_t ss, _pad1;
++    uint16_t es, _pad2;
++    uint16_t ds, _pad3;
++    uint16_t fs, _pad4;
++    uint16_t gs, _pad5;
++};
++typedef struct cpu_user_regs cpu_user_regs_t;
++DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
++
++/*
++ * Page-directory addresses above 4GB do not fit into architectural %cr3.
++ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
++ * must use the following accessor macros to pack/unpack valid MFNs.
++ */
++#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
++#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
++
++struct arch_vcpu_info {
++    unsigned long cr2;
++    unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */
++};
++typedef struct arch_vcpu_info arch_vcpu_info_t;
++
++struct xen_callback {
++    unsigned long cs;
++    unsigned long eip;
++};
++typedef struct xen_callback xen_callback_t;
++
++#endif /* !__ASSEMBLY__ */
++
++#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/arch-x86/xen-x86_64.h

index 0000000,0000000..0bdd868

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/arch-x86/xen-x86_64.h
@@@ -1,0 -1,0 +1,202 @@@
++/******************************************************************************
++ * xen-x86_64.h
++ * 
++ * Guest OS interface to x86 64-bit Xen.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2004-2006, K A Fraser
++ */
++
++#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
++#define __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
++
++/*
++ * Hypercall interface:
++ *  Input:  %rdi, %rsi, %rdx, %r10, %r8 (arguments 1-5)
++ *  Output: %rax
++ * Access is via hypercall page (set up by guest loader or via a Xen MSR):
++ *  call hypercall_page + hypercall-number * 32
++ * Clobbered: argument registers (e.g., 2-arg hypercall clobbers %rdi,%rsi)
++ */
++
++/*
++ * 64-bit segment selectors
++ * These flat segments are in the Xen-private section of every GDT. Since these
++ * are also present in the initial GDT, many OSes will be able to avoid
++ * installing their own GDT.
++ */
++
++#define FLAT_RING3_CS32 0xe023  /* GDT index 260 */
++#define FLAT_RING3_CS64 0xe033  /* GDT index 261 */
++#define FLAT_RING3_DS32 0xe02b  /* GDT index 262 */
++#define FLAT_RING3_DS64 0x0000  /* NULL selector */
++#define FLAT_RING3_SS32 0xe02b  /* GDT index 262 */
++#define FLAT_RING3_SS64 0xe02b  /* GDT index 262 */
++
++#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
++#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
++#define FLAT_KERNEL_DS   FLAT_KERNEL_DS64
++#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
++#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
++#define FLAT_KERNEL_CS   FLAT_KERNEL_CS64
++#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
++#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
++#define FLAT_KERNEL_SS   FLAT_KERNEL_SS64
++
++#define FLAT_USER_DS64 FLAT_RING3_DS64
++#define FLAT_USER_DS32 FLAT_RING3_DS32
++#define FLAT_USER_DS   FLAT_USER_DS64
++#define FLAT_USER_CS64 FLAT_RING3_CS64
++#define FLAT_USER_CS32 FLAT_RING3_CS32
++#define FLAT_USER_CS   FLAT_USER_CS64
++#define FLAT_USER_SS64 FLAT_RING3_SS64
++#define FLAT_USER_SS32 FLAT_RING3_SS32
++#define FLAT_USER_SS   FLAT_USER_SS64
++
++#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
++#define __HYPERVISOR_VIRT_END   0xFFFF880000000000
++#define __MACH2PHYS_VIRT_START  0xFFFF800000000000
++#define __MACH2PHYS_VIRT_END    0xFFFF804000000000
++
++#ifndef HYPERVISOR_VIRT_START
++#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
++#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
++#endif
++
++#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
++#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
++#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
++#ifndef machine_to_phys_mapping
++#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
++#endif
++
++/*
++ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
++ *  @which == SEGBASE_*  ;  @base == 64-bit base address
++ * Returns 0 on success.
++ */
++#define SEGBASE_FS          0
++#define SEGBASE_GS_USER     1
++#define SEGBASE_GS_KERNEL   2
++#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
++
++/*
++ * int HYPERVISOR_iret(void)
++ * All arguments are on the kernel stack, in the following format.
++ * Never returns if successful. Current kernel context is lost.
++ * The saved CS is mapped as follows:
++ *   RING0 -> RING3 kernel mode.
++ *   RING1 -> RING3 kernel mode.
++ *   RING2 -> RING3 kernel mode.
++ *   RING3 -> RING3 user mode.
++ * However RING0 indicates that the guest kernel should return to itself
++ * directly with
++ *      orb   $3,1*8(%rsp)
++ *      iretq
++ * If flags contains VGCF_in_syscall:
++ *   Restore RAX, RIP, RFLAGS, RSP.
++ *   Discard R11, RCX, CS, SS.
++ * Otherwise:
++ *   Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
++ * All other registers are saved on hypercall entry and restored to user.
++ */
++/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
++#define _VGCF_in_syscall 8
++#define VGCF_in_syscall  (1<<_VGCF_in_syscall)
++#define VGCF_IN_SYSCALL  VGCF_in_syscall
++
++#ifndef __ASSEMBLY__
++
++struct iret_context {
++    /* Top of stack (%rsp at point of hypercall). */
++    uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
++    /* Bottom of iret stack frame. */
++};
++
++#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
++/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
++#define __DECL_REG(name) union { \
++    uint64_t r ## name, e ## name; \
++    uint32_t _e ## name; \
++}
++#else
++/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
++#define __DECL_REG(name) uint64_t r ## name
++#endif
++
++struct cpu_user_regs {
++    uint64_t r15;
++    uint64_t r14;
++    uint64_t r13;
++    uint64_t r12;
++    __DECL_REG(bp);
++    __DECL_REG(bx);
++    uint64_t r11;
++    uint64_t r10;
++    uint64_t r9;
++    uint64_t r8;
++    __DECL_REG(ax);
++    __DECL_REG(cx);
++    __DECL_REG(dx);
++    __DECL_REG(si);
++    __DECL_REG(di);
++    uint32_t error_code;    /* private */
++    uint32_t entry_vector;  /* private */
++    __DECL_REG(ip);
++    uint16_t cs, _pad0[1];
++    uint8_t  saved_upcall_mask;
++    uint8_t  _pad1[3];
++    __DECL_REG(flags);      /* rflags.IF == !saved_upcall_mask */
++    __DECL_REG(sp);
++    uint16_t ss, _pad2[3];
++    uint16_t es, _pad3[3];
++    uint16_t ds, _pad4[3];
++    uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.     */
++    uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
++};
++typedef struct cpu_user_regs cpu_user_regs_t;
++DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
++
++#undef __DECL_REG
++
++#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
++#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
++
++struct arch_vcpu_info {
++    unsigned long cr2;
++    unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
++};
++typedef struct arch_vcpu_info arch_vcpu_info_t;
++
++typedef unsigned long xen_callback_t;
++
++#endif /* !__ASSEMBLY__ */
++
++#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/arch-x86/xen.h

index 0000000,0000000..b3141a4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/arch-x86/xen.h
@@@ -1,0 -1,0 +1,204 @@@
++/******************************************************************************
++ * arch-x86/xen.h
++ * 
++ * Guest OS interface to x86 Xen.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2004-2006, K A Fraser
++ */
++
++#include "../xen.h"
++
++#ifndef __XEN_PUBLIC_ARCH_X86_XEN_H__
++#define __XEN_PUBLIC_ARCH_X86_XEN_H__
++
++/* Structural guest handles introduced in 0x00030201. */
++#if __XEN_INTERFACE_VERSION__ >= 0x00030201
++#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
++    typedef struct { type *p; } __guest_handle_ ## name
++#else
++#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
++    typedef type * __guest_handle_ ## name
++#endif
++
++#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
++    ___DEFINE_XEN_GUEST_HANDLE(name, type);   \
++    ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type)
++#define DEFINE_XEN_GUEST_HANDLE(name)   __DEFINE_XEN_GUEST_HANDLE(name, name)
++#define __XEN_GUEST_HANDLE(name)        __guest_handle_ ## name
++#define XEN_GUEST_HANDLE(name)          __XEN_GUEST_HANDLE(name)
++#define set_xen_guest_handle_raw(hnd, val)  do { (hnd).p = val; } while (0)
++#ifdef __XEN_TOOLS__
++#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
++#endif
++#define set_xen_guest_handle(hnd, val) set_xen_guest_handle_raw(hnd, val)
++
++/* Allow co-existing Linux 2.6.23+ Xen interface definitions. */
++#define DEFINE_GUEST_HANDLE_STRUCT(name) struct name
++
++#if defined(__i386__)
++#include "xen-x86_32.h"
++#elif defined(__x86_64__)
++#include "xen-x86_64.h"
++#endif
++
++#ifndef __ASSEMBLY__
++typedef unsigned long xen_pfn_t;
++#define PRI_xen_pfn "lx"
++#endif
++
++/*
++ * SEGMENT DESCRIPTOR TABLES
++ */
++/*
++ * A number of GDT entries are reserved by Xen. These are not situated at the
++ * start of the GDT because some stupid OSes export hard-coded selector values
++ * in their ABI. These hard-coded values are always near the start of the GDT,
++ * so Xen places itself out of the way, at the far end of the GDT.
++ */
++#define FIRST_RESERVED_GDT_PAGE  14
++#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
++#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
++
++/* Maximum number of virtual CPUs in legacy multi-processor guests. */
++#define XEN_LEGACY_MAX_VCPUS 32
++
++#ifndef __ASSEMBLY__
++
++typedef unsigned long xen_ulong_t;
++
++/*
++ * Send an array of these to HYPERVISOR_set_trap_table().
++ * The privilege level specifies which modes may enter a trap via a software
++ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
++ * privilege levels as follows:
++ *  Level == 0: No one may enter
++ *  Level == 1: Kernel may enter
++ *  Level == 2: Kernel may enter
++ *  Level == 3: Everyone may enter
++ */
++#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
++#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
++#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
++#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
++struct trap_info {
++    uint8_t       vector;  /* exception vector                              */
++    uint8_t       flags;   /* bits 0-1: DPL; bit 2: clear event enable?     */
++    uint16_t      cs;      /* code selector                                 */
++    unsigned long address; /* code offset                                   */
++};
++typedef struct trap_info trap_info_t;
++DEFINE_XEN_GUEST_HANDLE(trap_info_t);
++
++typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
++
++/*
++ * The following is all CPU context. Note that the fpu_ctxt block is filled 
++ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
++ */
++struct vcpu_guest_context {
++    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
++    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
++#define VGCF_I387_VALID                (1<<0)
++#define VGCF_IN_KERNEL                 (1<<2)
++#define _VGCF_i387_valid               0
++#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
++#define _VGCF_in_kernel                2
++#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
++#define _VGCF_failsafe_disables_events 3
++#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
++#define _VGCF_syscall_disables_events  4
++#define VGCF_syscall_disables_events   (1<<_VGCF_syscall_disables_events)
++#define _VGCF_online                   5
++#define VGCF_online                    (1<<_VGCF_online)
++    unsigned long flags;                    /* VGCF_* flags                 */
++    struct cpu_user_regs user_regs;         /* User-level CPU registers     */
++    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
++    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
++    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
++    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
++    /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
++    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
++    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
++#ifdef __i386__
++    unsigned long event_callback_cs;        /* CS:EIP of event callback     */
++    unsigned long event_callback_eip;
++    unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
++    unsigned long failsafe_callback_eip;
++#else
++    unsigned long event_callback_eip;
++    unsigned long failsafe_callback_eip;
++#ifdef __XEN__
++    union {
++        unsigned long syscall_callback_eip;
++        struct {
++            unsigned int event_callback_cs;    /* compat CS of event cb     */
++            unsigned int failsafe_callback_cs; /* compat CS of failsafe cb  */
++        };
++    };
++#else
++    unsigned long syscall_callback_eip;
++#endif
++#endif
++    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
++#ifdef __x86_64__
++    /* Segment base addresses. */
++    uint64_t      fs_base;
++    uint64_t      gs_base_kernel;
++    uint64_t      gs_base_user;
++#endif
++};
++typedef struct vcpu_guest_context vcpu_guest_context_t;
++DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
++
++struct arch_shared_info {
++    unsigned long max_pfn;                  /* max pfn that appears in table */
++    /* Frame containing list of mfns containing list of mfns containing p2m. */
++    xen_pfn_t     pfn_to_mfn_frame_list_list;
++    unsigned long nmi_reason;
++    uint64_t pad[32];
++};
++typedef struct arch_shared_info arch_shared_info_t;
++
++#endif /* !__ASSEMBLY__ */
++
++/*
++ * Prefix forces emulation of some non-trapping instructions.
++ * Currently only CPUID.
++ */
++#ifdef __ASSEMBLY__
++#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
++#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
++#else
++#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
++#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
++#endif
++
++#endif /* __XEN_PUBLIC_ARCH_X86_XEN_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/arch-x86_32.h

index 0000000,0000000..45842b2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/arch-x86_32.h
@@@ -1,0 -1,0 +1,27 @@@
++/******************************************************************************
++ * arch-x86_32.h
++ * 
++ * Guest OS interface to x86 32-bit Xen.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2004-2006, K A Fraser
++ */
++
++#include "arch-x86/xen.h"
diff --cc include/xen/interface/arch-x86_64.h

index 0000000,0000000..fbb2639

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/arch-x86_64.h
@@@ -1,0 -1,0 +1,27 @@@
++/******************************************************************************
++ * arch-x86_64.h
++ * 
++ * Guest OS interface to x86 64-bit Xen.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2004-2006, K A Fraser
++ */
++
++#include "arch-x86/xen.h"
diff --cc include/xen/interface/callback.h

index 2ae3cd2,2ae3cd2..0323e51
--- 1/include/xen/interface/callback.h
--- 2/include/xen/interface/callback.h
+++ b/include/xen/interface/callback.h
@@@ -86,6 -86,6 +86,8 @@@ struct callback_register 
         uint16_t flags;
         xen_callback_t address;
   };
++typedef struct callback_register callback_register_t;
++DEFINE_XEN_GUEST_HANDLE(callback_register_t);
   
   /*
    * Unregister a callback.
@@@ -98,5 -98,5 +100,12 @@@ struct callback_unregister 
       uint16_t type;
       uint16_t _unused;
   };
++typedef struct callback_unregister callback_unregister_t;
++DEFINE_XEN_GUEST_HANDLE(callback_unregister_t);
++
++#if __XEN_INTERFACE_VERSION__ < 0x00030207
++#undef CALLBACKTYPE_sysenter
++#define CALLBACKTYPE_sysenter CALLBACKTYPE_sysenter_deprecated
++#endif
   
   #endif /* __XEN_PUBLIC_CALLBACK_H__ */
diff --cc include/xen/interface/dom0_ops.h

index 0000000,0000000..5d2b324

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/dom0_ops.h
@@@ -1,0 -1,0 +1,120 @@@
++/******************************************************************************
++ * dom0_ops.h
++ * 
++ * Process command requests from domain-0 guest OS.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2002-2003, B Dragovic
++ * Copyright (c) 2002-2006, K Fraser
++ */
++
++#ifndef __XEN_PUBLIC_DOM0_OPS_H__
++#define __XEN_PUBLIC_DOM0_OPS_H__
++
++#include "xen.h"
++#include "platform.h"
++
++#if __XEN_INTERFACE_VERSION__ >= 0x00030204
++#error "dom0_ops.h is a compatibility interface only"
++#endif
++
++#define DOM0_INTERFACE_VERSION XENPF_INTERFACE_VERSION
++
++#define DOM0_SETTIME          XENPF_settime
++#define dom0_settime          xenpf_settime
++#define dom0_settime_t        xenpf_settime_t
++
++#define DOM0_ADD_MEMTYPE      XENPF_add_memtype
++#define dom0_add_memtype      xenpf_add_memtype
++#define dom0_add_memtype_t    xenpf_add_memtype_t
++
++#define DOM0_DEL_MEMTYPE      XENPF_del_memtype
++#define dom0_del_memtype      xenpf_del_memtype
++#define dom0_del_memtype_t    xenpf_del_memtype_t
++
++#define DOM0_READ_MEMTYPE     XENPF_read_memtype
++#define dom0_read_memtype     xenpf_read_memtype
++#define dom0_read_memtype_t   xenpf_read_memtype_t
++
++#define DOM0_MICROCODE        XENPF_microcode_update
++#define dom0_microcode        xenpf_microcode_update
++#define dom0_microcode_t      xenpf_microcode_update_t
++
++#define DOM0_PLATFORM_QUIRK   XENPF_platform_quirk
++#define dom0_platform_quirk   xenpf_platform_quirk
++#define dom0_platform_quirk_t xenpf_platform_quirk_t
++
++typedef uint64_t cpumap_t;
++
++/* Unsupported legacy operation -- defined for API compatibility. */
++#define DOM0_MSR                 15
++struct dom0_msr {
++    /* IN variables. */
++    uint32_t write;
++    cpumap_t cpu_mask;
++    uint32_t msr;
++    uint32_t in1;
++    uint32_t in2;
++    /* OUT variables. */
++    uint32_t out1;
++    uint32_t out2;
++};
++typedef struct dom0_msr dom0_msr_t;
++DEFINE_XEN_GUEST_HANDLE(dom0_msr_t);
++
++/* Unsupported legacy operation -- defined for API compatibility. */
++#define DOM0_PHYSICAL_MEMORY_MAP 40
++struct dom0_memory_map_entry {
++    uint64_t start, end;
++    uint32_t flags; /* reserved */
++    uint8_t  is_ram;
++};
++typedef struct dom0_memory_map_entry dom0_memory_map_entry_t;
++DEFINE_XEN_GUEST_HANDLE(dom0_memory_map_entry_t);
++
++struct dom0_op {
++    uint32_t cmd;
++    uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
++    union {
++        struct dom0_msr               msr;
++        struct dom0_settime           settime;
++        struct dom0_add_memtype       add_memtype;
++        struct dom0_del_memtype       del_memtype;
++        struct dom0_read_memtype      read_memtype;
++        struct dom0_microcode         microcode;
++        struct dom0_platform_quirk    platform_quirk;
++        struct dom0_memory_map_entry  physical_memory_map;
++        uint8_t                       pad[128];
++    } u;
++};
++typedef struct dom0_op dom0_op_t;
++DEFINE_XEN_GUEST_HANDLE(dom0_op_t);
++
++#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/domctl.h

index 0000000,0000000..f5c661b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/domctl.h
@@@ -1,0 -1,0 +1,968 @@@
++/******************************************************************************
++ * domctl.h
++ * 
++ * Domain management operations. For use by node control stack.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2002-2003, B Dragovic
++ * Copyright (c) 2002-2006, K Fraser
++ */
++
++#ifndef __XEN_PUBLIC_DOMCTL_H__
++#define __XEN_PUBLIC_DOMCTL_H__
++
++#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
++#error "domctl operations are intended for use by node control tools only"
++#endif
++
++#include "xen.h"
++#include "grant_table.h"
++
++#define XEN_DOMCTL_INTERFACE_VERSION 0x00000007
++
++/*
++ * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
++ * If it is specified as zero, an id is auto-allocated and returned.
++ */
++/* XEN_DOMCTL_createdomain */
++struct xen_domctl_createdomain {
++    /* IN parameters */
++    uint32_t ssidref;
++    xen_domain_handle_t handle;
++ /* Is this an HVM guest (as opposed to a PV guest)? */
++#define _XEN_DOMCTL_CDF_hvm_guest     0
++#define XEN_DOMCTL_CDF_hvm_guest      (1U<<_XEN_DOMCTL_CDF_hvm_guest)
++ /* Use hardware-assisted paging if available? */
++#define _XEN_DOMCTL_CDF_hap           1
++#define XEN_DOMCTL_CDF_hap            (1U<<_XEN_DOMCTL_CDF_hap)
++ /* Should domain memory integrity be verified by tboot during Sx? */
++#define _XEN_DOMCTL_CDF_s3_integrity  2
++#define XEN_DOMCTL_CDF_s3_integrity   (1U<<_XEN_DOMCTL_CDF_s3_integrity)
++ /* Disable out-of-sync shadow page tables? */
++#define _XEN_DOMCTL_CDF_oos_off       3
++#define XEN_DOMCTL_CDF_oos_off        (1U<<_XEN_DOMCTL_CDF_oos_off)
++    uint32_t flags;
++};
++typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
++
++/* XEN_DOMCTL_getdomaininfo */
++struct xen_domctl_getdomaininfo {
++    /* OUT variables. */
++    domid_t  domain;              /* Also echoed in domctl.domain */
++ /* Domain is scheduled to die. */
++#define _XEN_DOMINF_dying     0
++#define XEN_DOMINF_dying      (1U<<_XEN_DOMINF_dying)
++ /* Domain is an HVM guest (as opposed to a PV guest). */
++#define _XEN_DOMINF_hvm_guest 1
++#define XEN_DOMINF_hvm_guest  (1U<<_XEN_DOMINF_hvm_guest)
++ /* The guest OS has shut down. */
++#define _XEN_DOMINF_shutdown  2
++#define XEN_DOMINF_shutdown   (1U<<_XEN_DOMINF_shutdown)
++ /* Currently paused by control software. */
++#define _XEN_DOMINF_paused    3
++#define XEN_DOMINF_paused     (1U<<_XEN_DOMINF_paused)
++ /* Currently blocked pending an event.     */
++#define _XEN_DOMINF_blocked   4
++#define XEN_DOMINF_blocked    (1U<<_XEN_DOMINF_blocked)
++ /* Domain is currently running.            */
++#define _XEN_DOMINF_running   5
++#define XEN_DOMINF_running    (1U<<_XEN_DOMINF_running)
++ /* Being debugged.  */
++#define _XEN_DOMINF_debugged  6
++#define XEN_DOMINF_debugged   (1U<<_XEN_DOMINF_debugged)
++ /* XEN_DOMINF_shutdown guest-supplied code.  */
++#define XEN_DOMINF_shutdownmask 255
++#define XEN_DOMINF_shutdownshift 16
++    uint32_t flags;              /* XEN_DOMINF_* */
++    uint64_aligned_t tot_pages;
++    uint64_aligned_t max_pages;
++    uint64_aligned_t shr_pages;
++    uint64_aligned_t shared_info_frame; /* GMFN of shared_info struct */
++    uint64_aligned_t cpu_time;
++    uint32_t nr_online_vcpus;    /* Number of VCPUs currently online. */
++    uint32_t max_vcpu_id;        /* Maximum VCPUID in use by this domain. */
++    uint32_t ssidref;
++    xen_domain_handle_t handle;
++    uint32_t cpupool;
++};
++typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
++
++
++/* XEN_DOMCTL_getmemlist */
++struct xen_domctl_getmemlist {
++    /* IN variables. */
++    /* Max entries to write to output buffer. */
++    uint64_aligned_t max_pfns;
++    /* Start index in guest's page list. */
++    uint64_aligned_t start_pfn;
++    XEN_GUEST_HANDLE_64(uint64) buffer;
++    /* OUT variables. */
++    uint64_aligned_t num_pfns;
++};
++typedef struct xen_domctl_getmemlist xen_domctl_getmemlist_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_getmemlist_t);
++
++
++/* XEN_DOMCTL_getpageframeinfo */
++
++#define XEN_DOMCTL_PFINFO_LTAB_SHIFT 28
++#define XEN_DOMCTL_PFINFO_NOTAB   (0x0U<<28)
++#define XEN_DOMCTL_PFINFO_L1TAB   (0x1U<<28)
++#define XEN_DOMCTL_PFINFO_L2TAB   (0x2U<<28)
++#define XEN_DOMCTL_PFINFO_L3TAB   (0x3U<<28)
++#define XEN_DOMCTL_PFINFO_L4TAB   (0x4U<<28)
++#define XEN_DOMCTL_PFINFO_LTABTYPE_MASK (0x7U<<28)
++#define XEN_DOMCTL_PFINFO_LPINTAB (0x1U<<31)
++#define XEN_DOMCTL_PFINFO_XTAB    (0xfU<<28) /* invalid page */
++#define XEN_DOMCTL_PFINFO_PAGEDTAB (0x8U<<28)
++#define XEN_DOMCTL_PFINFO_LTAB_MASK (0xfU<<28)
++
++struct xen_domctl_getpageframeinfo {
++    /* IN variables. */
++    uint64_aligned_t gmfn; /* GMFN to query */
++    /* OUT variables. */
++    /* Is the page PINNED to a type? */
++    uint32_t type;         /* see above type defs */
++};
++typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t);
++
++
++/* XEN_DOMCTL_getpageframeinfo2 */
++struct xen_domctl_getpageframeinfo2 {
++    /* IN variables. */
++    uint64_aligned_t num;
++    /* IN/OUT variables. */
++    XEN_GUEST_HANDLE_64(uint32) array;
++};
++typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t);
++
++/* XEN_DOMCTL_getpageframeinfo3 */
++struct xen_domctl_getpageframeinfo3 {
++    /* IN variables. */
++    uint64_aligned_t num;
++    /* IN/OUT variables. */
++    XEN_GUEST_HANDLE_64(xen_pfn_t) array;
++};
++
++
++/*
++ * Control shadow pagetables operation
++ */
++/* XEN_DOMCTL_shadow_op */
++
++/* Disable shadow mode. */
++#define XEN_DOMCTL_SHADOW_OP_OFF         0
++
++/* Enable shadow mode (mode contains ORed XEN_DOMCTL_SHADOW_ENABLE_* flags). */
++#define XEN_DOMCTL_SHADOW_OP_ENABLE      32
++
++/* Log-dirty bitmap operations. */
++ /* Return the bitmap and clean internal copy for next round. */
++#define XEN_DOMCTL_SHADOW_OP_CLEAN       11
++ /* Return the bitmap but do not modify internal copy. */
++#define XEN_DOMCTL_SHADOW_OP_PEEK        12
++
++/* Memory allocation accessors. */
++#define XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION   30
++#define XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION   31
++
++/* Legacy enable operations. */
++ /* Equiv. to ENABLE with no mode flags. */
++#define XEN_DOMCTL_SHADOW_OP_ENABLE_TEST       1
++ /* Equiv. to ENABLE with mode flag ENABLE_LOG_DIRTY. */
++#define XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY   2
++ /* Equiv. to ENABLE with mode flags ENABLE_REFCOUNT and ENABLE_TRANSLATE. */
++#define XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE  3
++
++/* Mode flags for XEN_DOMCTL_SHADOW_OP_ENABLE. */
++ /*
++  * Shadow pagetables are refcounted: guest does not use explicit mmu
++  * operations nor write-protect its pagetables.
++  */
++#define XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT  (1 << 1)
++ /*
++  * Log pages in a bitmap as they are dirtied.
++  * Used for live relocation to determine which pages must be re-sent.
++  */
++#define XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY (1 << 2)
++ /*
++  * Automatically translate GPFNs into MFNs.
++  */
++#define XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE (1 << 3)
++ /*
++  * Xen does not steal virtual address space from the guest.
++  * Requires HVM support.
++  */
++#define XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL  (1 << 4)
++
++struct xen_domctl_shadow_op_stats {
++    uint32_t fault_count;
++    uint32_t dirty_count;
++};
++typedef struct xen_domctl_shadow_op_stats xen_domctl_shadow_op_stats_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_stats_t);
++
++struct xen_domctl_shadow_op {
++    /* IN variables. */
++    uint32_t       op;       /* XEN_DOMCTL_SHADOW_OP_* */
++
++    /* OP_ENABLE */
++    uint32_t       mode;     /* XEN_DOMCTL_SHADOW_ENABLE_* */
++
++    /* OP_GET_ALLOCATION / OP_SET_ALLOCATION */
++    uint32_t       mb;       /* Shadow memory allocation in MB */
++
++    /* OP_PEEK / OP_CLEAN */
++    XEN_GUEST_HANDLE_64(uint8) dirty_bitmap;
++    uint64_aligned_t pages; /* Size of buffer. Updated with actual size. */
++    struct xen_domctl_shadow_op_stats stats;
++};
++typedef struct xen_domctl_shadow_op xen_domctl_shadow_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_t);
++
++
++/* XEN_DOMCTL_max_mem */
++struct xen_domctl_max_mem {
++    /* IN variables. */
++    uint64_aligned_t max_memkb;
++};
++typedef struct xen_domctl_max_mem xen_domctl_max_mem_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_mem_t);
++
++
++/* XEN_DOMCTL_setvcpucontext */
++/* XEN_DOMCTL_getvcpucontext */
++struct xen_domctl_vcpucontext {
++    uint32_t              vcpu;                  /* IN */
++    XEN_GUEST_HANDLE_64(vcpu_guest_context_t) ctxt; /* IN/OUT */
++};
++typedef struct xen_domctl_vcpucontext xen_domctl_vcpucontext_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpucontext_t);
++
++
++/* XEN_DOMCTL_getvcpuinfo */
++struct xen_domctl_getvcpuinfo {
++    /* IN variables. */
++    uint32_t vcpu;
++    /* OUT variables. */
++    uint8_t  online;                  /* currently online (not hotplugged)? */
++    uint8_t  blocked;                 /* blocked waiting for an event? */
++    uint8_t  running;                 /* currently scheduled on its CPU? */
++    uint64_aligned_t cpu_time;        /* total cpu time consumed (ns) */
++    uint32_t cpu;                     /* current mapping   */
++};
++typedef struct xen_domctl_getvcpuinfo xen_domctl_getvcpuinfo_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t);
++
++
++/* Get/set which physical cpus a vcpu can execute on. */
++/* XEN_DOMCTL_setvcpuaffinity */
++/* XEN_DOMCTL_getvcpuaffinity */
++struct xen_domctl_vcpuaffinity {
++    uint32_t  vcpu;              /* IN */
++    struct xenctl_cpumap cpumap; /* IN/OUT */
++};
++typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t);
++
++
++/* XEN_DOMCTL_max_vcpus */
++struct xen_domctl_max_vcpus {
++    uint32_t max;           /* maximum number of vcpus */
++};
++typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
++
++
++/* XEN_DOMCTL_scheduler_op */
++/* Scheduler types. */
++#define XEN_SCHEDULER_SEDF     4
++#define XEN_SCHEDULER_CREDIT   5
++#define XEN_SCHEDULER_CREDIT2  6
++#define XEN_SCHEDULER_ARINC653 7
++/* Set or get info? */
++#define XEN_DOMCTL_SCHEDOP_putinfo 0
++#define XEN_DOMCTL_SCHEDOP_getinfo 1
++struct xen_domctl_scheduler_op {
++    uint32_t sched_id;  /* XEN_SCHEDULER_* */
++    uint32_t cmd;       /* XEN_DOMCTL_SCHEDOP_* */
++    union {
++        struct xen_domctl_sched_sedf {
++            uint64_aligned_t period;
++            uint64_aligned_t slice;
++            uint64_aligned_t latency;
++            uint32_t extratime;
++            uint32_t weight;
++        } sedf;
++        struct xen_domctl_sched_credit {
++            uint16_t weight;
++            uint16_t cap;
++        } credit;
++        struct xen_domctl_sched_credit2 {
++            uint16_t weight;
++        } credit2;
++    } u;
++};
++typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_scheduler_op_t);
++
++
++/* XEN_DOMCTL_setdomainhandle */
++struct xen_domctl_setdomainhandle {
++    xen_domain_handle_t handle;
++};
++typedef struct xen_domctl_setdomainhandle xen_domctl_setdomainhandle_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdomainhandle_t);
++
++
++/* XEN_DOMCTL_setdebugging */
++struct xen_domctl_setdebugging {
++    uint8_t enable;
++};
++typedef struct xen_domctl_setdebugging xen_domctl_setdebugging_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdebugging_t);
++
++
++/* XEN_DOMCTL_irq_permission */
++struct xen_domctl_irq_permission {
++    uint8_t pirq;
++    uint8_t allow_access;    /* flag to specify enable/disable of IRQ access */
++};
++typedef struct xen_domctl_irq_permission xen_domctl_irq_permission_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_irq_permission_t);
++
++
++/* XEN_DOMCTL_iomem_permission */
++struct xen_domctl_iomem_permission {
++    uint64_aligned_t first_mfn;/* first page (physical page number) in range */
++    uint64_aligned_t nr_mfns;  /* number of pages in range (>0) */
++    uint8_t  allow_access;     /* allow (!0) or deny (0) access to range? */
++};
++typedef struct xen_domctl_iomem_permission xen_domctl_iomem_permission_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_iomem_permission_t);
++
++
++/* XEN_DOMCTL_ioport_permission */
++struct xen_domctl_ioport_permission {
++    uint32_t first_port;              /* first port in range */
++    uint32_t nr_ports;                /* size of port range */
++    uint8_t  allow_access;            /* allow or deny access to range? */
++};
++typedef struct xen_domctl_ioport_permission xen_domctl_ioport_permission_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_permission_t);
++
++
++/* XEN_DOMCTL_hypercall_init */
++struct xen_domctl_hypercall_init {
++    uint64_aligned_t  gmfn;           /* GMFN to be initialised */
++};
++typedef struct xen_domctl_hypercall_init xen_domctl_hypercall_init_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t);
++
++
++/* XEN_DOMCTL_arch_setup */
++#define _XEN_DOMAINSETUP_hvm_guest 0
++#define XEN_DOMAINSETUP_hvm_guest  (1UL<<_XEN_DOMAINSETUP_hvm_guest)
++#define _XEN_DOMAINSETUP_query 1 /* Get parameters (for save)  */
++#define XEN_DOMAINSETUP_query  (1UL<<_XEN_DOMAINSETUP_query)
++#define _XEN_DOMAINSETUP_sioemu_guest 2
++#define XEN_DOMAINSETUP_sioemu_guest  (1UL<<_XEN_DOMAINSETUP_sioemu_guest)
++typedef struct xen_domctl_arch_setup {
++    uint64_aligned_t flags;  /* XEN_DOMAINSETUP_* */
++#ifdef __ia64__
++    uint64_aligned_t bp;     /* mpaddr of boot param area */
++    uint64_aligned_t maxmem; /* Highest memory address for MDT.  */
++    uint64_aligned_t xsi_va; /* Xen shared_info area virtual address.  */
++    uint32_t hypercall_imm;  /* Break imm for Xen hypercalls.  */
++    int8_t vhpt_size_log2;   /* Log2 of VHPT size. */
++#endif
++} xen_domctl_arch_setup_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_arch_setup_t);
++
++
++/* XEN_DOMCTL_settimeoffset */
++struct xen_domctl_settimeoffset {
++    int32_t  time_offset_seconds; /* applied to domain wallclock time */
++};
++typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
++
++/* XEN_DOMCTL_gethvmcontext */
++/* XEN_DOMCTL_sethvmcontext */
++typedef struct xen_domctl_hvmcontext {
++    uint32_t size; /* IN/OUT: size of buffer / bytes filled */
++    XEN_GUEST_HANDLE_64(uint8) buffer; /* IN/OUT: data, or call
++                                        * gethvmcontext with NULL
++                                        * buffer to get size req'd */
++} xen_domctl_hvmcontext_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_t);
++
++
++/* XEN_DOMCTL_set_address_size */
++/* XEN_DOMCTL_get_address_size */
++typedef struct xen_domctl_address_size {
++    uint32_t size;
++} xen_domctl_address_size_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_address_size_t);
++
++
++/* XEN_DOMCTL_real_mode_area */
++struct xen_domctl_real_mode_area {
++    uint32_t log; /* log2 of Real Mode Area size */
++};
++typedef struct xen_domctl_real_mode_area xen_domctl_real_mode_area_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_real_mode_area_t);
++
++
++/* XEN_DOMCTL_sendtrigger */
++#define XEN_DOMCTL_SENDTRIGGER_NMI    0
++#define XEN_DOMCTL_SENDTRIGGER_RESET  1
++#define XEN_DOMCTL_SENDTRIGGER_INIT   2
++#define XEN_DOMCTL_SENDTRIGGER_POWER  3
++#define XEN_DOMCTL_SENDTRIGGER_SLEEP  4
++struct xen_domctl_sendtrigger {
++    uint32_t  trigger;  /* IN */
++    uint32_t  vcpu;     /* IN */
++};
++typedef struct xen_domctl_sendtrigger xen_domctl_sendtrigger_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_sendtrigger_t);
++
++
++/* Assign PCI device to HVM guest. Sets up IOMMU structures. */
++/* XEN_DOMCTL_assign_device */
++/* XEN_DOMCTL_test_assign_device */
++/* XEN_DOMCTL_deassign_device */
++struct xen_domctl_assign_device {
++    uint32_t  machine_bdf;   /* machine PCI ID of assigned device */
++};
++typedef struct xen_domctl_assign_device xen_domctl_assign_device_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_assign_device_t);
++
++/* Retrieve sibling devices information of machine_bdf */
++/* XEN_DOMCTL_get_device_group */
++struct xen_domctl_get_device_group {
++    uint32_t  machine_bdf;      /* IN */
++    uint32_t  max_sdevs;        /* IN */
++    uint32_t  num_sdevs;        /* OUT */
++    XEN_GUEST_HANDLE_64(uint32)  sdev_array;   /* OUT */
++};
++typedef struct xen_domctl_get_device_group xen_domctl_get_device_group_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_get_device_group_t);
++
++/* Pass-through interrupts: bind real irq -> hvm devfn. */
++/* XEN_DOMCTL_bind_pt_irq */
++/* XEN_DOMCTL_unbind_pt_irq */
++typedef enum pt_irq_type_e {
++    PT_IRQ_TYPE_PCI,
++    PT_IRQ_TYPE_ISA,
++    PT_IRQ_TYPE_MSI,
++    PT_IRQ_TYPE_MSI_TRANSLATE,
++} pt_irq_type_t;
++struct xen_domctl_bind_pt_irq {
++    uint32_t machine_irq;
++    pt_irq_type_t irq_type;
++    uint32_t hvm_domid;
++
++    union {
++        struct {
++            uint8_t isa_irq;
++        } isa;
++        struct {
++            uint8_t bus;
++            uint8_t device;
++            uint8_t intx;
++        } pci;
++        struct {
++            uint8_t gvec;
++            uint32_t gflags;
++            uint64_aligned_t gtable;
++        } msi;
++    } u;
++};
++typedef struct xen_domctl_bind_pt_irq xen_domctl_bind_pt_irq_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_pt_irq_t);
++
++
++/* Bind machine I/O address range -> HVM address range. */
++/* XEN_DOMCTL_memory_mapping */
++#define DPCI_ADD_MAPPING         1
++#define DPCI_REMOVE_MAPPING      0
++struct xen_domctl_memory_mapping {
++    uint64_aligned_t first_gfn; /* first page (hvm guest phys page) in range */
++    uint64_aligned_t first_mfn; /* first page (machine page) in range */
++    uint64_aligned_t nr_mfns;   /* number of pages in range (>0) */
++    uint32_t add_mapping;       /* add or remove mapping */
++    uint32_t padding;           /* padding for 64-bit aligned structure */
++};
++typedef struct xen_domctl_memory_mapping xen_domctl_memory_mapping_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_memory_mapping_t);
++
++
++/* Bind machine I/O port range -> HVM I/O port range. */
++/* XEN_DOMCTL_ioport_mapping */
++struct xen_domctl_ioport_mapping {
++    uint32_t first_gport;     /* first guest IO port */
++    uint32_t first_mport;     /* first machine IO port */
++    uint32_t nr_ports;        /* size of port range */
++    uint32_t add_mapping;     /* add or remove mapping */
++};
++typedef struct xen_domctl_ioport_mapping xen_domctl_ioport_mapping_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_mapping_t);
++
++
++/*
++ * Pin caching type of RAM space for x86 HVM domU.
++ */
++/* XEN_DOMCTL_pin_mem_cacheattr */
++/* Caching types: these happen to be the same as x86 MTRR/PAT type codes. */
++#define XEN_DOMCTL_MEM_CACHEATTR_UC  0
++#define XEN_DOMCTL_MEM_CACHEATTR_WC  1
++#define XEN_DOMCTL_MEM_CACHEATTR_WT  4
++#define XEN_DOMCTL_MEM_CACHEATTR_WP  5
++#define XEN_DOMCTL_MEM_CACHEATTR_WB  6
++#define XEN_DOMCTL_MEM_CACHEATTR_UCM 7
++struct xen_domctl_pin_mem_cacheattr {
++    uint64_aligned_t start, end;
++    uint32_t type; /* XEN_DOMCTL_MEM_CACHEATTR_* */
++};
++typedef struct xen_domctl_pin_mem_cacheattr xen_domctl_pin_mem_cacheattr_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_pin_mem_cacheattr_t);
++
++
++/* XEN_DOMCTL_set_ext_vcpucontext */
++/* XEN_DOMCTL_get_ext_vcpucontext */
++struct xen_domctl_ext_vcpucontext {
++    /* IN: VCPU that this call applies to. */
++    uint32_t         vcpu;
++    /*
++     * SET: Size of struct (IN)
++     * GET: Size of struct (OUT)
++     */
++    uint32_t         size;
++#if defined(__i386__) || defined(__x86_64__)
++    /* SYSCALL from 32-bit mode and SYSENTER callback information. */
++    /* NB. SYSCALL from 64-bit mode is contained in vcpu_guest_context_t */
++    uint64_aligned_t syscall32_callback_eip;
++    uint64_aligned_t sysenter_callback_eip;
++    uint16_t         syscall32_callback_cs;
++    uint16_t         sysenter_callback_cs;
++    uint8_t          syscall32_disables_events;
++    uint8_t          sysenter_disables_events;
++#endif
++};
++typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpucontext_t);
++
++/*
++ * Set optimization features for a domain
++ */
++/* XEN_DOMCTL_set_opt_feature */
++struct xen_domctl_set_opt_feature {
++#if defined(__ia64__)
++    struct xen_ia64_opt_feature optf;
++#else
++    /* Make struct non-empty: do not depend on this field name! */
++    uint64_t dummy;
++#endif
++};
++typedef struct xen_domctl_set_opt_feature xen_domctl_set_opt_feature_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_opt_feature_t);
++
++/*
++ * Set the target domain for a domain
++ */
++/* XEN_DOMCTL_set_target */
++struct xen_domctl_set_target {
++    domid_t target;
++};
++typedef struct xen_domctl_set_target xen_domctl_set_target_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_target_t);
++
++#if defined(__i386__) || defined(__x86_64__)
++# define XEN_CPUID_INPUT_UNUSED  0xFFFFFFFF
++/* XEN_DOMCTL_set_cpuid */
++struct xen_domctl_cpuid {
++  uint32_t input[2];
++  uint32_t eax;
++  uint32_t ebx;
++  uint32_t ecx;
++  uint32_t edx;
++};
++typedef struct xen_domctl_cpuid xen_domctl_cpuid_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpuid_t);
++#endif
++
++/* XEN_DOMCTL_subscribe */
++struct xen_domctl_subscribe {
++    uint32_t port; /* IN */
++};
++typedef struct xen_domctl_subscribe xen_domctl_subscribe_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_subscribe_t);
++
++/*
++ * Define the maximum machine address size which should be allocated
++ * to a guest.
++ */
++/* XEN_DOMCTL_set_machine_address_size */
++/* XEN_DOMCTL_get_machine_address_size */
++
++/*
++ * Do not inject spurious page faults into this domain.
++ */
++/* XEN_DOMCTL_suppress_spurious_page_faults */
++
++/* XEN_DOMCTL_debug_op */
++#define XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF         0
++#define XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON          1
++struct xen_domctl_debug_op {
++    uint32_t op;   /* IN */
++    uint32_t vcpu; /* IN */
++};
++typedef struct xen_domctl_debug_op xen_domctl_debug_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_debug_op_t);
++
++/*
++ * Request a particular record from the HVM context
++ */
++/* XEN_DOMCTL_gethvmcontext_partial */
++typedef struct xen_domctl_hvmcontext_partial {
++    uint32_t type;                      /* IN: Type of record required */
++    uint32_t instance;                  /* IN: Instance of that type */
++    XEN_GUEST_HANDLE_64(uint8) buffer;  /* OUT: buffer to write record into */
++} xen_domctl_hvmcontext_partial_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t);
++
++/* XEN_DOMCTL_disable_migrate */
++typedef struct xen_domctl_disable_migrate {
++    uint32_t disable; /* IN: 1: disable migration and restore */
++} xen_domctl_disable_migrate_t;
++
++
++/* XEN_DOMCTL_gettscinfo */
++/* XEN_DOMCTL_settscinfo */
++struct xen_guest_tsc_info {
++    uint32_t tsc_mode;
++    uint32_t gtsc_khz;
++    uint32_t incarnation;
++    uint32_t pad;
++    uint64_aligned_t elapsed_nsec;
++};
++typedef struct xen_guest_tsc_info xen_guest_tsc_info_t;
++DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t);
++typedef struct xen_domctl_tsc_info {
++    XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */
++    xen_guest_tsc_info_t info; /* IN */
++} xen_domctl_tsc_info_t;
++
++/* XEN_DOMCTL_gdbsx_guestmemio      guest mem io */
++struct xen_domctl_gdbsx_memio {
++    /* IN */
++    uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */
++    uint64_aligned_t gva;    /* guest virtual address */
++    uint64_aligned_t uva;    /* user buffer virtual address */
++    uint32_t         len;    /* number of bytes to read/write */
++    uint8_t          gwr;    /* 0 = read from guest. 1 = write to guest */
++    /* OUT */
++    uint32_t         remain; /* bytes remaining to be copied */
++};
++
++/* XEN_DOMCTL_gdbsx_pausevcpu */
++/* XEN_DOMCTL_gdbsx_unpausevcpu */
++struct xen_domctl_gdbsx_pauseunp_vcpu { /* pause/unpause a vcpu */
++    uint32_t         vcpu;         /* which vcpu */
++};
++
++/* XEN_DOMCTL_gdbsx_domstatus */
++struct xen_domctl_gdbsx_domstatus {
++    /* OUT */
++    uint8_t          paused;     /* is the domain paused */
++    uint32_t         vcpu_id;    /* any vcpu in an event? */
++    uint32_t         vcpu_ev;    /* if yes, what event? */
++};
++
++/*
++ * Memory event operations
++ */
++
++/* XEN_DOMCTL_mem_event_op */
++
++/* Add and remove memory handlers */
++#define XEN_DOMCTL_MEM_EVENT_OP_ENABLE     0
++#define XEN_DOMCTL_MEM_EVENT_OP_DISABLE    1
++
++/*
++ * Page memory in and out. 
++ */
++#define XEN_DOMCTL_MEM_EVENT_OP_PAGING            1
++
++/* Domain memory paging */
++#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_NOMINATE   0
++#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_EVICT      1
++#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_PREP       2
++#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_RESUME     3
++
++/*
++ * Access permissions.
++ *
++ * There are HVM hypercalls to set the per-page access permissions of every
++ * page in a domain.  When one of these permissions--independent, read, 
++ * write, and execute--is violated, the VCPU is paused and a memory event 
++ * is sent with what happened.  (See public/mem_event.h)  The memory event 
++ * handler can then resume the VCPU and redo the access with an 
++ * ACCESS_RESUME mode for the following domctl.
++ */
++#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS            2
++#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_RESUME     0 
++
++struct xen_domctl_mem_event_op {
++    uint32_t       op;           /* XEN_DOMCTL_MEM_EVENT_OP_* */
++    uint32_t       mode;         /* XEN_DOMCTL_MEM_EVENT_ENABLE_* -- NOTE(review): no such macros are defined in this header; presumably ..._OP_PAGING / ..._OP_ACCESS, confirm */
++
++    /* OP_ENABLE */
++    uint64_aligned_t shared_addr;  /* IN:  Virtual address of shared page */
++    uint64_aligned_t ring_addr;    /* IN:  Virtual address of ring page */
++
++    /* Other OPs */
++    uint64_aligned_t gfn;          /* IN:  gfn of page being operated on */
++};
++typedef struct xen_domctl_mem_event_op xen_domctl_mem_event_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_event_op_t);
++
++/*
++ * Memory sharing operations
++ */
++/* XEN_DOMCTL_mem_sharing_op */
++
++#define XEN_DOMCTL_MEM_SHARING_OP_CONTROL        0
++#define XEN_DOMCTL_MEM_SHARING_OP_NOMINATE_GFN   1
++#define XEN_DOMCTL_MEM_SHARING_OP_NOMINATE_GREF  2
++#define XEN_DOMCTL_MEM_SHARING_OP_SHARE          3
++#define XEN_DOMCTL_MEM_SHARING_OP_RESUME         4
++#define XEN_DOMCTL_MEM_SHARING_OP_DEBUG_GFN      5
++#define XEN_DOMCTL_MEM_SHARING_OP_DEBUG_MFN      6
++#define XEN_DOMCTL_MEM_SHARING_OP_DEBUG_GREF     7
++
++#define XEN_DOMCTL_MEM_SHARING_S_HANDLE_INVALID  (-10)
++#define XEN_DOMCTL_MEM_SHARING_C_HANDLE_INVALID  (-9)
++
++struct xen_domctl_mem_sharing_op {
++    uint8_t op; /* XEN_DOMCTL_MEM_SHARING_OP_* */
++
++    union {
++        uint8_t enable;                   /* OP_CONTROL                */
++
++        struct mem_sharing_op_nominate {  /* OP_NOMINATE_xxx           */
++            union {
++                uint64_aligned_t gfn;     /* IN: gfn to nominate       */
++                uint32_t      grant_ref;  /* IN: grant ref to nominate */
++            } u;
++            uint64_aligned_t  handle;     /* OUT: the handle           */
++        } nominate;
++        struct mem_sharing_op_share {     /* OP_SHARE */
++            uint64_aligned_t source_handle; /* IN: handle to the source page */
++            uint64_aligned_t client_handle; /* IN: handle to the client page */
++        } share; 
++        struct mem_sharing_op_debug {     /* OP_DEBUG_xxx */
++            union {
++                uint64_aligned_t gfn;      /* IN: gfn to debug          */
++                uint64_aligned_t mfn;      /* IN: mfn to debug          */
++                grant_ref_t    gref;       /* IN: gref to debug         */
++            } u;
++        } debug;
++    } u;
++};
++typedef struct xen_domctl_mem_sharing_op xen_domctl_mem_sharing_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_sharing_op_t);
++
++#if defined(__i386__) || defined(__x86_64__)
++/* XEN_DOMCTL_setvcpuextstate */
++/* XEN_DOMCTL_getvcpuextstate */
++struct xen_domctl_vcpuextstate {
++    /* IN: VCPU that this call applies to. */
++    uint32_t         vcpu;
++    /*
++     * SET: xfeature support mask of struct (IN)
++     * GET: xfeature support mask of struct (IN/OUT)
++     * xfeature mask is served as identifications of the saving format
++     * so that compatible CPUs can have a check on format to decide
++     * whether it can restore.
++     */
++    uint64_aligned_t         xfeature_mask;
++    /*
++     * SET: Size of struct (IN)
++     * GET: Size of struct (IN/OUT)
++     */
++    uint64_aligned_t         size;
++    XEN_GUEST_HANDLE_64(uint64) buffer;
++};
++typedef struct xen_domctl_vcpuextstate xen_domctl_vcpuextstate_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuextstate_t);
++#endif
++
++/* XEN_DOMCTL_set_access_required: sets whether a memory event listener
++ * must be present to handle page access events. If false, the page
++ * access will revert to full permissions if no one is listening.
++ */
++struct xen_domctl_set_access_required {
++    uint8_t access_required;
++};
++typedef struct xen_domctl_set_access_required xen_domctl_set_access_required_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_access_required_t);
++
++struct xen_domctl {
++    uint32_t cmd;
++#define XEN_DOMCTL_createdomain                   1
++#define XEN_DOMCTL_destroydomain                  2
++#define XEN_DOMCTL_pausedomain                    3
++#define XEN_DOMCTL_unpausedomain                  4
++#define XEN_DOMCTL_getdomaininfo                  5
++#define XEN_DOMCTL_getmemlist                     6
++#define XEN_DOMCTL_getpageframeinfo               7
++#define XEN_DOMCTL_getpageframeinfo2              8
++#define XEN_DOMCTL_setvcpuaffinity                9
++#define XEN_DOMCTL_shadow_op                     10
++#define XEN_DOMCTL_max_mem                       11
++#define XEN_DOMCTL_setvcpucontext                12
++#define XEN_DOMCTL_getvcpucontext                13
++#define XEN_DOMCTL_getvcpuinfo                   14
++#define XEN_DOMCTL_max_vcpus                     15
++#define XEN_DOMCTL_scheduler_op                  16
++#define XEN_DOMCTL_setdomainhandle               17
++#define XEN_DOMCTL_setdebugging                  18
++#define XEN_DOMCTL_irq_permission                19
++#define XEN_DOMCTL_iomem_permission              20
++#define XEN_DOMCTL_ioport_permission             21
++#define XEN_DOMCTL_hypercall_init                22
++#define XEN_DOMCTL_arch_setup                    23
++#define XEN_DOMCTL_settimeoffset                 24
++#define XEN_DOMCTL_getvcpuaffinity               25
++#define XEN_DOMCTL_real_mode_area                26
++#define XEN_DOMCTL_resumedomain                  27
++#define XEN_DOMCTL_sendtrigger                   28
++#define XEN_DOMCTL_subscribe                     29
++#define XEN_DOMCTL_gethvmcontext                 33
++#define XEN_DOMCTL_sethvmcontext                 34
++#define XEN_DOMCTL_set_address_size              35
++#define XEN_DOMCTL_get_address_size              36
++#define XEN_DOMCTL_assign_device                 37
++#define XEN_DOMCTL_bind_pt_irq                   38
++#define XEN_DOMCTL_memory_mapping                39
++#define XEN_DOMCTL_ioport_mapping                40
++#define XEN_DOMCTL_pin_mem_cacheattr             41
++#define XEN_DOMCTL_set_ext_vcpucontext           42
++#define XEN_DOMCTL_get_ext_vcpucontext           43
++#define XEN_DOMCTL_set_opt_feature               44
++#define XEN_DOMCTL_test_assign_device            45
++#define XEN_DOMCTL_set_target                    46
++#define XEN_DOMCTL_deassign_device               47
++#define XEN_DOMCTL_unbind_pt_irq                 48
++#define XEN_DOMCTL_set_cpuid                     49
++#define XEN_DOMCTL_get_device_group              50
++#define XEN_DOMCTL_set_machine_address_size      51
++#define XEN_DOMCTL_get_machine_address_size      52
++#define XEN_DOMCTL_suppress_spurious_page_faults 53
++#define XEN_DOMCTL_debug_op                      54
++#define XEN_DOMCTL_gethvmcontext_partial         55
++#define XEN_DOMCTL_mem_event_op                  56
++#define XEN_DOMCTL_mem_sharing_op                57
++#define XEN_DOMCTL_disable_migrate               58
++#define XEN_DOMCTL_gettscinfo                    59
++#define XEN_DOMCTL_settscinfo                    60
++#define XEN_DOMCTL_getpageframeinfo3             61
++#define XEN_DOMCTL_setvcpuextstate               62
++#define XEN_DOMCTL_getvcpuextstate               63
++#define XEN_DOMCTL_set_access_required           64
++#define XEN_DOMCTL_gdbsx_guestmemio            1000
++#define XEN_DOMCTL_gdbsx_pausevcpu             1001
++#define XEN_DOMCTL_gdbsx_unpausevcpu           1002
++#define XEN_DOMCTL_gdbsx_domstatus             1003
++    uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
++    domid_t  domain;
++    union {
++        struct xen_domctl_createdomain      createdomain;
++        struct xen_domctl_getdomaininfo     getdomaininfo;
++        struct xen_domctl_getmemlist        getmemlist;
++        struct xen_domctl_getpageframeinfo  getpageframeinfo;
++        struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
++        struct xen_domctl_getpageframeinfo3 getpageframeinfo3;
++        struct xen_domctl_vcpuaffinity      vcpuaffinity;
++        struct xen_domctl_shadow_op         shadow_op;
++        struct xen_domctl_max_mem           max_mem;
++        struct xen_domctl_vcpucontext       vcpucontext;
++        struct xen_domctl_getvcpuinfo       getvcpuinfo;
++        struct xen_domctl_max_vcpus         max_vcpus;
++        struct xen_domctl_scheduler_op      scheduler_op;
++        struct xen_domctl_setdomainhandle   setdomainhandle;
++        struct xen_domctl_setdebugging      setdebugging;
++        struct xen_domctl_irq_permission    irq_permission;
++        struct xen_domctl_iomem_permission  iomem_permission;
++        struct xen_domctl_ioport_permission ioport_permission;
++        struct xen_domctl_hypercall_init    hypercall_init;
++        struct xen_domctl_arch_setup        arch_setup;
++        struct xen_domctl_settimeoffset     settimeoffset;
++        struct xen_domctl_disable_migrate   disable_migrate;
++        struct xen_domctl_tsc_info          tsc_info;
++        struct xen_domctl_real_mode_area    real_mode_area;
++        struct xen_domctl_hvmcontext        hvmcontext;
++        struct xen_domctl_hvmcontext_partial hvmcontext_partial;
++        struct xen_domctl_address_size      address_size;
++        struct xen_domctl_sendtrigger       sendtrigger;
++        struct xen_domctl_get_device_group  get_device_group;
++        struct xen_domctl_assign_device     assign_device;
++        struct xen_domctl_bind_pt_irq       bind_pt_irq;
++        struct xen_domctl_memory_mapping    memory_mapping;
++        struct xen_domctl_ioport_mapping    ioport_mapping;
++        struct xen_domctl_pin_mem_cacheattr pin_mem_cacheattr;
++        struct xen_domctl_ext_vcpucontext   ext_vcpucontext;
++        struct xen_domctl_set_opt_feature   set_opt_feature;
++        struct xen_domctl_set_target        set_target;
++        struct xen_domctl_subscribe         subscribe;
++        struct xen_domctl_debug_op          debug_op;
++        struct xen_domctl_mem_event_op      mem_event_op;
++        struct xen_domctl_mem_sharing_op    mem_sharing_op;
++#if defined(__i386__) || defined(__x86_64__)
++        struct xen_domctl_cpuid             cpuid;
++        struct xen_domctl_vcpuextstate      vcpuextstate;
++#endif
++        struct xen_domctl_set_access_required access_required;
++        struct xen_domctl_gdbsx_memio       gdbsx_guest_memio;
++        struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
++        struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
++        uint8_t                             pad[128];
++    } u;
++};
++typedef struct xen_domctl xen_domctl_t;
++DEFINE_XEN_GUEST_HANDLE(xen_domctl_t);
++
++#endif /* __XEN_PUBLIC_DOMCTL_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/elfnote.h

index 0360b15,0360b15..970709c
--- 1/include/xen/interface/elfnote.h
--- 2/include/xen/interface/elfnote.h
+++ b/include/xen/interface/elfnote.h
@@@ -3,6 -3,6 +3,24 @@@
    *
    * Definitions used for the Xen ELF notes.
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (c) 2006, Ian Campbell, XenSource Ltd.
    */
   
@@@ -10,7 -10,7 +28,7 @@@
   #define __XEN_PUBLIC_ELFNOTE_H__
   
   /*
-- * The notes should live in a SHT_NOTE segment and have "Xen" in the
++ * The notes should live in a PT_NOTE segment and have "Xen" in the
    * name field.
    *
    * Numeric types are either 4 or 8 bytes depending on the content of
@@@ -22,8 -22,8 +40,6 @@@
   
   /*
    * NAME=VALUE pair (string).
-- *
-- * LEGACY: FEATURES and PAE
    */
   #define XEN_ELFNOTE_INFO           0
   
@@@ -90,7 -90,7 +106,12 @@@
   #define XEN_ELFNOTE_LOADER         8
   
   /*
-- * The kernel supports PAE (x86/32 only, string = "yes" or "no").
++ * The kernel supports PAE (x86/32 only, string = "yes", "no" or
++ * "bimodal").
++ *
++ * For compatibility with Xen 3.0.3 and earlier the "bimodal" setting
++ * may be given as "yes,bimodal" which will cause older Xen to treat
++ * this kernel as PAE.
    *
    * LEGACY: PAE (n.b. The legacy interface included a provision to
    * indicate 'extended-cr3' support allowing L3 page tables to be
@@@ -140,6 -140,6 +161,82 @@@
    */
   #define XEN_ELFNOTE_SUSPEND_CANCEL 14
   
++/*
++ * The (non-default) location the initial phys-to-machine map should be
++ * placed at by the hypervisor (Dom0) or the tools (DomU).
++ * The kernel must be prepared for this mapping to be established using
++ * large pages, despite such otherwise not being available to guests.
++ * The kernel must also be able to handle the page table pages used for
++ * this mapping not being accessible through the initial mapping.
++ * (Only x86-64 supports this at present.)
++ */
++#define XEN_ELFNOTE_INIT_P2M      15
++
++/*
++ * Whether or not the guest can deal with being passed an initrd not
++ * mapped through its initial page tables.
++ */
++#define XEN_ELFNOTE_MOD_START_PFN 16
++
++/*
++ * The number of the highest elfnote defined.
++ */
++#define XEN_ELFNOTE_MAX XEN_ELFNOTE_MOD_START_PFN
++
++/*
++ * System information exported through crash notes.
++ *
++ * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_INFO
++ * note in case of a system crash. This note will contain various
++ * information about the system, see xen/include/xen/elfcore.h.
++ */
++#define XEN_ELFNOTE_CRASH_INFO 0x1000001
++
++/*
++ * System registers exported through crash notes.
++ *
++ * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_REGS
++ * note per cpu in case of a system crash. This note is architecture
++ * specific and will contain registers not saved in the "CORE" note.
++ * See xen/include/xen/elfcore.h for more information.
++ */
++#define XEN_ELFNOTE_CRASH_REGS 0x1000002
++
++
++/*
++ * xen dump-core none note.
++ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_NONE
++ * in its dump file to indicate that the file is xen dump-core
++ * file. This note doesn't have any other information.
++ * See tools/libxc/xc_core.h for more information.
++ */
++#define XEN_ELFNOTE_DUMPCORE_NONE               0x2000000
++
++/*
++ * xen dump-core header note.
++ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_HEADER
++ * in its dump file.
++ * See tools/libxc/xc_core.h for more information.
++ */
++#define XEN_ELFNOTE_DUMPCORE_HEADER             0x2000001
++
++/*
++ * xen dump-core xen version note.
++ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_XEN_VERSION
++ * in its dump file. It contains the xen version obtained via the
++ * XENVER hypercall.
++ * See tools/libxc/xc_core.h for more information.
++ */
++#define XEN_ELFNOTE_DUMPCORE_XEN_VERSION        0x2000002
++
++/*
++ * xen dump-core format version note.
++ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION
++ * in its dump file. It contains a format version identifier.
++ * See tools/libxc/xc_core.h for more information.
++ */
++#define XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION     0x2000003
++
   #endif /* __XEN_PUBLIC_ELFNOTE_H__ */
   
   /*
diff --cc include/xen/interface/event_channel.h

index 2090881,2090881..05df7fa
--- 1/include/xen/interface/event_channel.h
--- 2/include/xen/interface/event_channel.h
+++ b/include/xen/interface/event_channel.h
@@@ -3,6 -3,6 +3,24 @@@
    *
    * Event channels between domains.
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (c) 2003-2004, K A Fraser.
    */
   
@@@ -11,8 -11,8 +29,15 @@@
   
   #include <xen/interface/xen.h>
   
++/*
++ * Prototype for this hypercall is:
++ *  int event_channel_op(int cmd, void *args)
++ * @cmd  == EVTCHNOP_??? (event-channel operation).
++ * @args == Operation-specific extra arguments (NULL if none).
++ */
++
   typedef uint32_t evtchn_port_t;
--DEFINE_GUEST_HANDLE(evtchn_port_t);
++DEFINE_XEN_GUEST_HANDLE(evtchn_port_t);
   
   /*
    * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
@@@ -22,13 -22,13 +47,14 @@@
    *  1. If the caller is unprivileged then <dom> must be DOMID_SELF.
    *  2. <rdom> may be DOMID_SELF, allowing loopback connections.
    */
--#define EVTCHNOP_alloc_unbound          6
++#define EVTCHNOP_alloc_unbound    6
   struct evtchn_alloc_unbound {
--      /* IN parameters */
--      domid_t dom, remote_dom;
--      /* OUT parameters */
--      evtchn_port_t port;
++    /* IN parameters */
++    domid_t dom, remote_dom;
++    /* OUT parameters */
++    evtchn_port_t port;
   };
++typedef struct evtchn_alloc_unbound evtchn_alloc_unbound_t;
   
   /*
    * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
@@@ -41,29 -41,29 +67,35 @@@
    */
   #define EVTCHNOP_bind_interdomain 0
   struct evtchn_bind_interdomain {
--      /* IN parameters. */
--      domid_t remote_dom;
--      evtchn_port_t remote_port;
--      /* OUT parameters. */
--      evtchn_port_t local_port;
++    /* IN parameters. */
++    domid_t remote_dom;
++    evtchn_port_t remote_port;
++    /* OUT parameters. */
++    evtchn_port_t local_port;
   };
++typedef struct evtchn_bind_interdomain evtchn_bind_interdomain_t;
   
   /*
    * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
    * vcpu.
    * NOTES:
-- *  1. A virtual IRQ may be bound to at most one event channel per vcpu.
-- *  2. The allocated event channel is bound to the specified vcpu. The binding
-- *     may not be changed.
++ *  1. Virtual IRQs are classified as per-vcpu or global. See the VIRQ list
++ *     in xen.h for the classification of each VIRQ.
++ *  2. Global VIRQs must be allocated on VCPU0 but can subsequently be
++ *     re-bound via EVTCHNOP_bind_vcpu.
++ *  3. Per-vcpu VIRQs may be bound to at most one event channel per vcpu.
++ *     The allocated event channel is bound to the specified vcpu and the
++ *     binding cannot be changed.
    */
--#define EVTCHNOP_bind_virq      1
++#define EVTCHNOP_bind_virq        1
   struct evtchn_bind_virq {
--      /* IN parameters. */
--      uint32_t virq;
--      uint32_t vcpu;
--      /* OUT parameters. */
--      evtchn_port_t port;
++    /* IN parameters. */
++    uint32_t virq;
++    uint32_t vcpu;
++    /* OUT parameters. */
++    evtchn_port_t port;
   };
++typedef struct evtchn_bind_virq evtchn_bind_virq_t;
   
   /*
    * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
@@@ -71,15 -71,15 +103,16 @@@
    *  1. A physical IRQ may be bound to at most one event channel per domain.
    *  2. Only a sufficiently-privileged domain may bind to a physical IRQ.
    */
--#define EVTCHNOP_bind_pirq      2
++#define EVTCHNOP_bind_pirq        2
   struct evtchn_bind_pirq {
--      /* IN parameters. */
--      uint32_t pirq;
++    /* IN parameters. */
++    uint32_t pirq;
   #define BIND_PIRQ__WILL_SHARE 1
--      uint32_t flags; /* BIND_PIRQ__* */
--      /* OUT parameters. */
--      evtchn_port_t port;
++    uint32_t flags; /* BIND_PIRQ__* */
++    /* OUT parameters. */
++    evtchn_port_t port;
   };
++typedef struct evtchn_bind_pirq evtchn_bind_pirq_t;
   
   /*
    * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
@@@ -87,33 -87,33 +120,36 @@@
    *  1. The allocated event channel is bound to the specified vcpu. The binding
    *     may not be changed.
    */
--#define EVTCHNOP_bind_ipi       7
++#define EVTCHNOP_bind_ipi         7
   struct evtchn_bind_ipi {
--      uint32_t vcpu;
--      /* OUT parameters. */
--      evtchn_port_t port;
++    uint32_t vcpu;
++    /* OUT parameters. */
++    evtchn_port_t port;
   };
++typedef struct evtchn_bind_ipi evtchn_bind_ipi_t;
   
   /*
    * EVTCHNOP_close: Close a local event channel <port>. If the channel is
    * interdomain then the remote end is placed in the unbound state
    * (EVTCHNSTAT_unbound), awaiting a new connection.
    */
--#define EVTCHNOP_close                  3
++#define EVTCHNOP_close            3
   struct evtchn_close {
--      /* IN parameters. */
--      evtchn_port_t port;
++    /* IN parameters. */
++    evtchn_port_t port;
   };
++typedef struct evtchn_close evtchn_close_t;
   
   /*
    * EVTCHNOP_send: Send an event to the remote end of the channel whose local
    * endpoint is <port>.
    */
--#define EVTCHNOP_send           4
++#define EVTCHNOP_send             4
   struct evtchn_send {
--      /* IN parameters. */
--      evtchn_port_t port;
++    /* IN parameters. */
++    evtchn_port_t port;
   };
++typedef struct evtchn_send evtchn_send_t;
   
   /*
    * EVTCHNOP_status: Get the current status of the communication channel which
@@@ -123,75 -123,75 +159,99 @@@
    *  2. Only a sufficiently-privileged domain may obtain the status of an event
    *     channel for which <dom> is not DOMID_SELF.
    */
--#define EVTCHNOP_status                 5
++#define EVTCHNOP_status           5
   struct evtchn_status {
--      /* IN parameters */
--      domid_t  dom;
--      evtchn_port_t port;
--      /* OUT parameters */
--#define EVTCHNSTAT_closed     0  /* Channel is not in use.                 */
--#define EVTCHNSTAT_unbound    1  /* Channel is waiting interdom connection.*/
--#define EVTCHNSTAT_interdomain        2  /* Channel is connected to remote domain. */
--#define EVTCHNSTAT_pirq               3  /* Channel is bound to a phys IRQ line.   */
--#define EVTCHNSTAT_virq               4  /* Channel is bound to a virtual IRQ line */
--#define EVTCHNSTAT_ipi                5  /* Channel is bound to a virtual IPI line */
--      uint32_t status;
--      uint32_t vcpu;             /* VCPU to which this channel is bound.   */
--      union {
--              struct {
--                      domid_t dom;
--              } unbound; /* EVTCHNSTAT_unbound */
--              struct {
--                      domid_t dom;
--                      evtchn_port_t port;
--              } interdomain; /* EVTCHNSTAT_interdomain */
--              uint32_t pirq;      /* EVTCHNSTAT_pirq        */
--              uint32_t virq;      /* EVTCHNSTAT_virq        */
--      } u;
++    /* IN parameters */
++    domid_t  dom;
++    evtchn_port_t port;
++    /* OUT parameters */
++#define EVTCHNSTAT_closed       0  /* Channel is not in use.                 */
++#define EVTCHNSTAT_unbound      1  /* Channel is waiting interdom connection.*/
++#define EVTCHNSTAT_interdomain  2  /* Channel is connected to remote domain. */
++#define EVTCHNSTAT_pirq         3  /* Channel is bound to a phys IRQ line.   */
++#define EVTCHNSTAT_virq         4  /* Channel is bound to a virtual IRQ line */
++#define EVTCHNSTAT_ipi          5  /* Channel is bound to a virtual IPI line */
++    uint32_t status;
++    uint32_t vcpu;                 /* VCPU to which this channel is bound.   */
++    union {
++        struct {
++            domid_t dom;
++        } unbound; /* EVTCHNSTAT_unbound */
++        struct {
++            domid_t dom;
++            evtchn_port_t port;
++        } interdomain; /* EVTCHNSTAT_interdomain */
++        uint32_t pirq;      /* EVTCHNSTAT_pirq        */
++        uint32_t virq;      /* EVTCHNSTAT_virq        */
++    } u;
   };
++typedef struct evtchn_status evtchn_status_t;
   
   /*
    * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
    * event is pending.
    * NOTES:
-- *  1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
-- *     the binding. This binding cannot be changed.
-- *  2. All other channels notify vcpu0 by default. This default is set when
++ *  1. IPI-bound channels always notify the vcpu specified at bind time.
++ *     This binding cannot be changed.
++ *  2. Per-VCPU VIRQ channels always notify the vcpu specified at bind time.
++ *     This binding cannot be changed.
++ *  3. All other channels notify vcpu0 by default. This default is set when
    *     the channel is allocated (a port that is freed and subsequently reused
    *     has its binding reset to vcpu0).
    */
--#define EVTCHNOP_bind_vcpu      8
++#define EVTCHNOP_bind_vcpu        8
   struct evtchn_bind_vcpu {
--      /* IN parameters. */
--      evtchn_port_t port;
--      uint32_t vcpu;
++    /* IN parameters. */
++    evtchn_port_t port;
++    uint32_t vcpu;
   };
++typedef struct evtchn_bind_vcpu evtchn_bind_vcpu_t;
   
   /*
    * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
    * a notification to the appropriate VCPU if an event is pending.
    */
--#define EVTCHNOP_unmask                 9
++#define EVTCHNOP_unmask           9
   struct evtchn_unmask {
--      /* IN parameters. */
--      evtchn_port_t port;
++    /* IN parameters. */
++    evtchn_port_t port;
   };
++typedef struct evtchn_unmask evtchn_unmask_t;
   
++/*
++ * EVTCHNOP_reset: Close all event channels associated with specified domain.
++ * NOTES:
++ *  1. <dom> may be specified as DOMID_SELF.
++ *  2. Only a sufficiently-privileged domain may specify other than DOMID_SELF.
++ */
++#define EVTCHNOP_reset           10
++struct evtchn_reset {
++    /* IN parameters. */
++    domid_t dom;
++};
++typedef struct evtchn_reset evtchn_reset_t;
++
++/*
++ * Argument to event_channel_op_compat() hypercall. Superseded by new
++ * event_channel_op() hypercall since 0x00030202.
++ */
   struct evtchn_op {
--      uint32_t cmd; /* EVTCHNOP_* */
--      union {
--              struct evtchn_alloc_unbound    alloc_unbound;
--              struct evtchn_bind_interdomain bind_interdomain;
--              struct evtchn_bind_virq        bind_virq;
--              struct evtchn_bind_pirq        bind_pirq;
--              struct evtchn_bind_ipi         bind_ipi;
--              struct evtchn_close            close;
--              struct evtchn_send             send;
--              struct evtchn_status           status;
--              struct evtchn_bind_vcpu        bind_vcpu;
--              struct evtchn_unmask           unmask;
--      } u;
++    uint32_t cmd; /* EVTCHNOP_* */
++    union {
++        struct evtchn_alloc_unbound    alloc_unbound;
++        struct evtchn_bind_interdomain bind_interdomain;
++        struct evtchn_bind_virq        bind_virq;
++        struct evtchn_bind_pirq        bind_pirq;
++        struct evtchn_bind_ipi         bind_ipi;
++        struct evtchn_close            close;
++        struct evtchn_send             send;
++        struct evtchn_status           status;
++        struct evtchn_bind_vcpu        bind_vcpu;
++        struct evtchn_unmask           unmask;
++    } u;
   };
   DEFINE_GUEST_HANDLE_STRUCT(evtchn_op);
++typedef struct evtchn_op evtchn_op_t;
++DEFINE_XEN_GUEST_HANDLE(evtchn_op_t);
   
   #endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
diff --cc include/xen/interface/features.h

index b6ca39a,b6ca39a..2a08c84
--- 1/include/xen/interface/features.h
--- 2/include/xen/interface/features.h
+++ b/include/xen/interface/features.h
@@@ -3,6 -3,6 +3,24 @@@
    *
    * Feature flags, reported by XENVER_get_features.
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
    */
   
@@@ -41,6 -41,6 +59,15 @@@
   /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
   #define XENFEAT_mmu_pt_update_preserve_ad  5
   
++/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */
++#define XENFEAT_highmem_assist             6
++
++/*
++ * If set, GNTTABOP_map_grant_ref honors flags to be placed into guest kernel
++ * available pte bits.
++ */
++#define XENFEAT_gnttab_map_avail_bits      7
++
   /* x86: Does this Xen host support the HVM callback vector type? */
   #define XENFEAT_hvm_callback_vector        8
   
diff --cc include/xen/interface/grant_table.h

index 39e5717,39e5717..67216c6
--- 1/include/xen/interface/grant_table.h
--- 2/include/xen/interface/grant_table.h
+++ b/include/xen/interface/grant_table.h
@@@ -85,12 -85,12 +85,26 @@@
    */
   
   /*
++ * Reference to a grant entry in a specified domain's grant table.
++ */
++typedef uint32_t grant_ref_t;
++
++/*
    * A grant table comprises a packed array of grant entries in one or more
    * page frames shared between Xen and a guest.
    * [XEN]: This field is written by Xen and read by the sharing guest.
    * [GST]: This field is written by the guest and read by Xen.
    */
--struct grant_entry {
++
++/*
++ * Version 1 of the grant table entry structure is maintained purely
++ * for backwards compatibility.  New guests should use version 2.
++ */
++#if __XEN_INTERFACE_VERSION__ < 0x0003020a
++#define grant_entry_v1 grant_entry
++#define grant_entry_v1_t grant_entry_t
++#endif
++struct grant_entry_v1 {
       /* GTF_xxx: various type and flag information.  [XEN,GST] */
       uint16_t flags;
       /* The domain being granted foreign privileges. [GST] */
@@@ -101,6 -101,6 +115,7 @@@
        */
       uint32_t frame;
   };
++typedef struct grant_entry_v1 grant_entry_v1_t;
   
   /*
    * Type of grant entry.
@@@ -108,10 -108,10 +123,13 @@@
    *  GTF_permit_access: Allow @domid to map/access @frame.
    *  GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
    *                       to this guest. Xen writes the page number to @frame.
++ *  GTF_transitive: Allow @domid to transitively access a subrange of
++ *                  @trans_grant in @trans_domid.  No mappings are allowed.
    */
   #define GTF_invalid         (0U<<0)
   #define GTF_permit_access   (1U<<0)
   #define GTF_accept_transfer (2U<<0)
++#define GTF_transitive      (3U<<0)
   #define GTF_type_mask       (3U<<0)
   
   /*
@@@ -119,6 -119,6 +137,10 @@@
    *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
    *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
    *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
++ *  GTF_PAT, GTF_PWT, GTF_PCD: (x86) cache attribute flags for the grant [GST]
++ *  GTF_sub_page: Grant access to only a subrange of the page.  @domid
++ *                will only be allowed to copy from the grant, and not
++ *                map it. [GST]
    */
   #define _GTF_readonly       (2)
   #define GTF_readonly        (1U<<_GTF_readonly)
@@@ -126,6 -126,6 +148,14 @@@
   #define GTF_reading         (1U<<_GTF_reading)
   #define _GTF_writing        (4)
   #define GTF_writing         (1U<<_GTF_writing)
++#define _GTF_PWT            (5)
++#define GTF_PWT             (1U<<_GTF_PWT)
++#define _GTF_PCD            (6)
++#define GTF_PCD             (1U<<_GTF_PCD)
++#define _GTF_PAT            (7)
++#define GTF_PAT             (1U<<_GTF_PAT)
++#define _GTF_sub_page       (8)
++#define GTF_sub_page        (1U<<_GTF_sub_page)
   
   /*
    * Subflags for GTF_accept_transfer:
@@@ -142,15 -142,15 +172,87 @@@
   #define _GTF_transfer_completed (3)
   #define GTF_transfer_completed  (1U<<_GTF_transfer_completed)
   
--
--/***********************************
-- * GRANT TABLE QUERIES AND USES
++/*
++ * Version 2 grant table entries.  These fulfil the same role as
++ * version 1 entries, but can represent more complicated operations.
++ * Any given domain will have either a version 1 or a version 2 table,
++ * and every entry in the table will be the same version.
++ *
++ * The interface by which domains use grant references does not depend
++ * on the grant table version in use by the other domain.
    */
++#if __XEN_INTERFACE_VERSION__ >= 0x0003020a
++/*
++ * Version 1 and version 2 grant entries share a common prefix.  The
++ * fields of the prefix are documented as part of struct
++ * grant_entry_v1.
++ */
++struct grant_entry_header {
++    uint16_t flags;
++    domid_t  domid;
++};
++typedef struct grant_entry_header grant_entry_header_t;
   
   /*
-- * Reference to a grant entry in a specified domain's grant table.
++ * Version 2 of the grant entry structure.
++ */
++union grant_entry_v2 {
++    grant_entry_header_t hdr;
++
++    /*
++     * This member is used for V1-style full page grants, where either:
++     *
++     * -- the hdr.flags type is GTF_accept_transfer, or
++     * -- the hdr.flags type is GTF_permit_access and GTF_sub_page is not set.
++     *
++     * In that case, the frame field has the same semantics as the
++     * field of the same name in the V1 entry structure.
++     */
++    struct {
++        grant_entry_header_t hdr;
++        uint32_t pad0;
++        uint64_t frame;
++    } full_page;
++
++    /*
++     * If the grant type is GTF_permit_access and GTF_sub_page is set,
++     * @domid is allowed to access bytes [@page_off,@page_off+@length)
++     * in frame @frame.
++     */
++    struct {
++        grant_entry_header_t hdr;
++        uint16_t page_off;
++        uint16_t length;
++        uint64_t frame;
++    } sub_page;
++
++    /*
++     * If the grant is GTF_transitive, @domid is allowed to use the
++     * grant @gref in domain @trans_domid, as if it was the local
++     * domain.  Obviously, the transitive access must be compatible
++     * with the original grant.
++     *
++     * The current version of Xen does not allow transitive grants
++     * to be mapped.
++     */
++    struct {
++        grant_entry_header_t hdr;
++        domid_t trans_domid;
++        uint16_t pad0;
++        grant_ref_t gref;
++    } transitive;
++
++    uint32_t __spacer[4]; /* Pad to a power of two */
++};
++typedef union grant_entry_v2 grant_entry_v2_t;
++
++typedef uint16_t grant_status_t;
++
++#endif /* __XEN_INTERFACE_VERSION__ */
++
++/***********************************
++ * GRANT TABLE QUERIES AND USES
    */
--typedef uint32_t grant_ref_t;
   
   /*
    * Handle to track a mapping created via a grant reference.
@@@ -187,6 -187,6 +289,8 @@@ struct gnttab_map_grant_ref 
       uint64_t dev_bus_addr;
   };
   DEFINE_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
++typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
++DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
   
   /*
    * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
@@@ -209,6 -209,6 +313,8 @@@ struct gnttab_unmap_grant_ref 
       int16_t  status;              /* GNTST_* */
   };
   DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
++typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
++DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
   
   /*
    * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
@@@ -226,9 -226,9 +332,11 @@@ struct gnttab_setup_table 
       uint32_t nr_frames;
       /* OUT parameters. */
       int16_t  status;              /* GNTST_* */
--    GUEST_HANDLE(ulong) frame_list;
++    XEN_GUEST_HANDLE(ulong) frame_list;
   };
   DEFINE_GUEST_HANDLE_STRUCT(gnttab_setup_table);
++typedef struct gnttab_setup_table gnttab_setup_table_t;
++DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
   
   /*
    * GNTTABOP_dump_table: Dump the contents of the grant table to the
@@@ -242,6 -242,6 +350,8 @@@ struct gnttab_dump_table 
       int16_t status;               /* GNTST_* */
   };
   DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table);
++typedef struct gnttab_dump_table gnttab_dump_table_t;
++DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
   
   /*
    * GNTTABOP_transfer_grant_ref: Transfer <frame> to a foreign domain. The
@@@ -254,13 -254,13 +364,16 @@@
   #define GNTTABOP_transfer                4
   struct gnttab_transfer {
       /* IN parameters. */
--    unsigned long mfn;
++    xen_pfn_t     mfn;
       domid_t       domid;
       grant_ref_t   ref;
       /* OUT parameters. */
       int16_t       status;
   };
   DEFINE_GUEST_HANDLE_STRUCT(gnttab_transfer);
++typedef struct gnttab_transfer gnttab_transfer_t;
++DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
++
   
   /*
    * GNTTABOP_copy: Hypervisor based copy
@@@ -284,24 -284,24 +397,27 @@@
   #define GNTCOPY_source_gref       (1<<_GNTCOPY_source_gref)
   #define _GNTCOPY_dest_gref        (1)
   #define GNTCOPY_dest_gref         (1<<_GNTCOPY_dest_gref)
++#define _GNTCOPY_can_fail         (2)
++#define GNTCOPY_can_fail          (1<<_GNTCOPY_can_fail)
   
   #define GNTTABOP_copy                 5
--struct gnttab_copy {
--      /* IN parameters. */
--      struct {
--              union {
--                      grant_ref_t ref;
--                      unsigned long   gmfn;
--              } u;
--              domid_t  domid;
--              uint16_t offset;
--      } source, dest;
--      uint16_t      len;
--      uint16_t      flags;          /* GNTCOPY_* */
--      /* OUT parameters. */
--      int16_t       status;
--};
++typedef struct gnttab_copy {
++    /* IN parameters. */
++    struct {
++        union {
++            grant_ref_t ref;
++            xen_pfn_t   gmfn;
++        } u;
++        domid_t  domid;
++        uint16_t offset;
++    } source, dest;
++    uint16_t      len;
++    uint16_t      flags;          /* GNTCOPY_* */
++    /* OUT parameters. */
++    int16_t       status;
++} gnttab_copy_t;
   DEFINE_GUEST_HANDLE_STRUCT(gnttab_copy);
++DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t);
   
   /*
    * GNTTABOP_query_size: Query the current and maximum sizes of the shared
@@@ -320,9 -320,9 +436,92 @@@ struct gnttab_query_size 
       int16_t  status;              /* GNTST_* */
   };
   DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
++typedef struct gnttab_query_size gnttab_query_size_t;
++DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t);
   
   /*
-- * Bitfield values for update_pin_status.flags.
++ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings
++ * tracked by <handle> but atomically replace the page table entry with one
++ * pointing to the machine address under <new_addr>.  <new_addr> will be
++ * redirected to the null entry.
++ * NOTES:
++ *  1. The call may fail in an undefined manner if either mapping is not
++ *     tracked by <handle>.
++ *  2. After executing a batch of unmaps, it is guaranteed that no stale
++ *     mappings will remain in the device or host TLBs.
++ */
++#define GNTTABOP_unmap_and_replace    7
++struct gnttab_unmap_and_replace {
++    /* IN parameters. */
++    uint64_t host_addr;
++    uint64_t new_addr;
++    grant_handle_t handle;
++    /* OUT parameters. */
++    int16_t  status;              /* GNTST_* */
++};
++typedef struct gnttab_unmap_and_replace gnttab_unmap_and_replace_t;
++DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t);
++
++#if __XEN_INTERFACE_VERSION__ >= 0x0003020a
++/*
++ * GNTTABOP_set_version: Request a particular version of the grant
++ * table shared table structure.  This operation can only be performed
++ * once in any given domain.  It must be performed before any grants
++ * are activated; otherwise, the domain will be stuck with version 1.
++ * The only defined versions are 1 and 2.
++ */
++#define GNTTABOP_set_version          8
++struct gnttab_set_version {
++    /* IN/OUT parameters */
++    uint32_t version;
++};
++typedef struct gnttab_set_version gnttab_set_version_t;
++DEFINE_XEN_GUEST_HANDLE(gnttab_set_version_t);
++
++
++/*
++ * GNTTABOP_get_status_frames: Get the list of frames used to store grant
++ * status for <dom>. In grant format version 2, the status is separated
++ * from the other shared grant fields to allow more efficient synchronization
++ * using barriers instead of atomic cmpxchg operations.
++ * <nr_frames> specifies the size of the vector <frame_list>.
++ * The frame addresses are returned in the <frame_list>.
++ * Only <nr_frames> addresses are returned, even if the table is larger.
++ * NOTES:
++ *  1. <dom> may be specified as DOMID_SELF.
++ *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
++ */
++#define GNTTABOP_get_status_frames     9
++struct gnttab_get_status_frames {
++    /* IN parameters. */
++    uint32_t nr_frames;
++    domid_t  dom;
++    /* OUT parameters. */
++    int16_t  status;              /* GNTST_* */
++    XEN_GUEST_HANDLE(uint64_t) frame_list;
++};
++typedef struct gnttab_get_status_frames gnttab_get_status_frames_t;
++DEFINE_XEN_GUEST_HANDLE(gnttab_get_status_frames_t);
++
++/*
++ * GNTTABOP_get_version: Get the grant table version which is in
++ * effect for domain <dom>.
++ */
++#define GNTTABOP_get_version          10
++struct gnttab_get_version {
++    /* IN parameters */
++    domid_t dom;
++    uint16_t pad;
++    /* OUT parameters */
++    uint32_t version;
++};
++typedef struct gnttab_get_version gnttab_get_version_t;
++DEFINE_XEN_GUEST_HANDLE(gnttab_get_version_t);
++
++#endif /* __XEN_INTERFACE_VERSION__ */
++
++/*
++ * Bitfield values for gnttab_map_grant_ref.flags.
    */
    /* Map the grant entry for access by I/O devices. */
   #define _GNTMAP_device_map      (0)
@@@ -349,6 -349,6 +548,16 @@@
   #define _GNTMAP_contains_pte    (4)
   #define GNTMAP_contains_pte     (1<<_GNTMAP_contains_pte)
   
++#define _GNTMAP_can_fail        (5)
++#define GNTMAP_can_fail         (1<<_GNTMAP_can_fail)
++
++/*
++ * Bits to be placed in guest kernel available PTE bits (architecture
++ * dependent; only supported when XENFEAT_gnttab_map_avail_bits is set).
++ */
++#define _GNTMAP_guest_avail0    (16)
++#define GNTMAP_guest_avail_mask ((uint32_t)~0 << _GNTMAP_guest_avail0)
++
   /*
    * Values for error status returns. All errors are -ve.
    */
@@@ -362,7 -362,7 +571,9 @@@
   #define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
   #define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
   #define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
--#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary */
++#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary.   */
++#define GNTST_address_too_big (-11) /* transfer page address too large.      */
++#define GNTST_eagain          (-12) /* Could not map at the moment. Retry.   */
   
   #define GNTTABOP_error_msgs {                   \
       "okay",                                     \
@@@ -375,7 -375,7 +586,9 @@@
       "no spare translation slot in the I/O MMU", \
       "permission denied",                        \
       "bad page",                                 \
--    "copy arguments cross page boundary"        \
++    "copy arguments cross page boundary",       \
++    "page address size too large",              \
++    "could not map at the moment, retry"        \
   }
   
   #endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
diff --cc include/xen/interface/hvm/e820.h

index 0000000,0000000..5bdc227

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/hvm/e820.h
@@@ -1,0 -1,0 +1,34 @@@
++
++/*
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_PUBLIC_HVM_E820_H__
++#define __XEN_PUBLIC_HVM_E820_H__
++
++/* E820 location in HVM virtual address space. */
++#define HVM_E820_PAGE        0x00090000
++#define HVM_E820_NR_OFFSET   0x000001E8
++#define HVM_E820_OFFSET      0x000002D0
++
++#define HVM_BELOW_4G_RAM_END        0xF0000000
++#define HVM_BELOW_4G_MMIO_START     HVM_BELOW_4G_RAM_END
++#define HVM_BELOW_4G_MMIO_LENGTH    ((1ULL << 32) - HVM_BELOW_4G_MMIO_START)
++
++#endif /* __XEN_PUBLIC_HVM_E820_H__ */
diff --cc include/xen/interface/hvm/hvm_info_table.h

index 0000000,0000000..bdb5995

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/hvm/hvm_info_table.h
@@@ -1,0 -1,0 +1,75 @@@
++/******************************************************************************
++ * hvm/hvm_info_table.h
++ * 
++ * HVM parameter and information table, written into guest memory map.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
++#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
++
++#define HVM_INFO_PFN         0x09F
++#define HVM_INFO_OFFSET      0x800
++#define HVM_INFO_PADDR       ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
++
++/* Maximum we can support with current vLAPIC ID mapping. */
++#define HVM_MAX_VCPUS        128
++
++struct hvm_info_table {
++    char        signature[8]; /* "HVM INFO" */
++    uint32_t    length;
++    uint8_t     checksum;
++
++    /* Should firmware build ACPI tables? */
++    uint8_t     acpi_enabled;
++
++    /* Should firmware build APIC descriptors (APIC MADT / MP BIOS)? */
++    uint8_t     apic_mode;
++
++    /* How many CPUs does this domain have? */
++    uint32_t    nr_vcpus;
++
++    /*
++     * MEMORY MAP provided by HVM domain builder.
++     * Notes:
++     *  1. page_to_phys(x) = x << 12
++     *  2. If a field is zero, the corresponding range does not exist.
++     */
++    /*
++     *  0x0 to page_to_phys(low_mem_pgend)-1:
++     *    RAM below 4GB (except for VGA hole 0xA0000-0xBFFFF)
++     */
++    uint32_t    low_mem_pgend;
++    /*
++     *  page_to_phys(reserved_mem_pgstart) to 0xFFFFFFFF:
++     *    Reserved for special memory mappings
++     */
++    uint32_t    reserved_mem_pgstart;
++    /*
++     *  0x100000000 to page_to_phys(high_mem_pgend)-1:
++     *    RAM above 4GB
++     */
++    uint32_t    high_mem_pgend;
++
++    /* Bitmap of which CPUs are online at boot time. */
++    uint8_t     vcpu_online[(HVM_MAX_VCPUS + 7)/8];
++};
++
++#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
diff --cc include/xen/interface/hvm/hvm_op.h

index a4827f4,a4827f4..f52b913
--- 1/include/xen/interface/hvm/hvm_op.h
--- 2/include/xen/interface/hvm/hvm_op.h
+++ b/include/xen/interface/hvm/hvm_op.h
@@@ -21,6 -21,6 +21,9 @@@
   #ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
   #define __XEN_PUBLIC_HVM_HVM_OP_H__
   
++#include "../xen.h"
++#include "../trace.h"
++
   /* Get/set subcommands: the second argument of the hypercall is a
    * pointer to a xen_hvm_param struct. */
   #define HVMOP_set_param           0
@@@ -31,16 -31,16 +34,213 @@@ struct xen_hvm_param 
       uint64_t value;    /* IN/OUT */
   };
   DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_param);
++typedef struct xen_hvm_param xen_hvm_param_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_param_t);
++
++/* Set the logical level of one of a domain's PCI INTx wires. */
++#define HVMOP_set_pci_intx_level  2
++struct xen_hvm_set_pci_intx_level {
++    /* Domain to be updated. */
++    domid_t  domid;
++    /* PCI INTx identification in PCI topology (domain:bus:device:intx). */
++    uint8_t  domain, bus, device, intx;
++    /* Assertion level (0 = unasserted, 1 = asserted). */
++    uint8_t  level;
++};
++typedef struct xen_hvm_set_pci_intx_level xen_hvm_set_pci_intx_level_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t);
++
++/* Set the logical level of one of a domain's ISA IRQ wires. */
++#define HVMOP_set_isa_irq_level   3
++struct xen_hvm_set_isa_irq_level {
++    /* Domain to be updated. */
++    domid_t  domid;
++    /* ISA device identification, by ISA IRQ (0-15). */
++    uint8_t  isa_irq;
++    /* Assertion level (0 = unasserted, 1 = asserted). */
++    uint8_t  level;
++};
++typedef struct xen_hvm_set_isa_irq_level xen_hvm_set_isa_irq_level_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t);
++
++#define HVMOP_set_pci_link_route  4
++struct xen_hvm_set_pci_link_route {
++    /* Domain to be updated. */
++    domid_t  domid;
++    /* PCI link identifier (0-3). */
++    uint8_t  link;
++    /* ISA IRQ (1-15), or 0 (disable link). */
++    uint8_t  isa_irq;
++};
++typedef struct xen_hvm_set_pci_link_route xen_hvm_set_pci_link_route_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t);
++
++/* Flushes all VCPU TLBs: @arg must be NULL. */
++#define HVMOP_flush_tlbs          5
++
++typedef enum {
++    HVMMEM_ram_rw,             /* Normal read/write guest RAM */
++    HVMMEM_ram_ro,             /* Read-only; writes are discarded */
++    HVMMEM_mmio_dm,            /* Reads and writes go to the device model */
++} hvmmem_type_t;
++
++/* Following tools-only interfaces may change in future. */
++#if defined(__XEN__) || defined(__XEN_TOOLS__)
++
++/* Track dirty VRAM. */
++#define HVMOP_track_dirty_vram    6
++struct xen_hvm_track_dirty_vram {
++    /* Domain to be tracked. */
++    domid_t  domid;
++    /* First pfn to track. */
++    uint64_aligned_t first_pfn;
++    /* Number of pages to track. */
++    uint64_aligned_t nr;
++    /* OUT variable. */
++    /* Dirty bitmap buffer. */
++    XEN_GUEST_HANDLE_64(uint8) dirty_bitmap;
++};
++typedef struct xen_hvm_track_dirty_vram xen_hvm_track_dirty_vram_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_track_dirty_vram_t);
++
++/* Notify that some pages got modified by the Device Model. */
++#define HVMOP_modified_memory    7
++struct xen_hvm_modified_memory {
++    /* Domain to be updated. */
++    domid_t  domid;
++    /* First pfn. */
++    uint64_aligned_t first_pfn;
++    /* Number of pages. */
++    uint64_aligned_t nr;
++};
++typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t);
++
++#define HVMOP_set_mem_type    8
++/* Notify that a region of memory is to be treated in a specific way. */
++struct xen_hvm_set_mem_type {
++    /* Domain to be updated. */
++    domid_t domid;
++    /* Memory type */
++    uint16_t hvmmem_type;
++    /* Number of pages. */
++    uint32_t nr;
++    /* First pfn. */
++    uint64_aligned_t first_pfn;
++};
++typedef struct xen_hvm_set_mem_type xen_hvm_set_mem_type_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_type_t);
++
++#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
   
   /* Hint from PV drivers for pagetable destruction. */
   #define HVMOP_pagetable_dying       9
   struct xen_hvm_pagetable_dying {
       /* Domain with a pagetable about to be destroyed. */
       domid_t  domid;
++    uint16_t pad[3]; /* align next field on 8-byte boundary */
       /* guest physical address of the toplevel pagetable dying */
--    aligned_u64 gpa;
++    uint64_t gpa;
   };
++DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying);
   typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t;
--DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying_t);
-- 
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_pagetable_dying_t);
++
++/* Get the current Xen time, in nanoseconds since system boot. */
++#define HVMOP_get_time              10
++struct xen_hvm_get_time {
++    uint64_t now;      /* OUT */
++};
++typedef struct xen_hvm_get_time xen_hvm_get_time_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_time_t);
++
++#define HVMOP_xentrace              11
++struct xen_hvm_xentrace {
++    uint16_t event, extra_bytes;
++    uint8_t extra[TRACE_EXTRA_MAX * sizeof(uint32_t)];
++};
++typedef struct xen_hvm_xentrace xen_hvm_xentrace_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_xentrace_t);
++
++/* Following tools-only interfaces may change in future. */
++#if defined(__XEN__) || defined(__XEN_TOOLS__)
++
++#define HVMOP_set_mem_access        12
++typedef enum {
++    HVMMEM_access_n,
++    HVMMEM_access_r,
++    HVMMEM_access_w,
++    HVMMEM_access_rw,
++    HVMMEM_access_x,
++    HVMMEM_access_rx,
++    HVMMEM_access_wx,
++    HVMMEM_access_rwx,
++    HVMMEM_access_rx2rw,       /* Page starts off as r-x, but automatically
++                                * changes to r-w on a write */
++    HVMMEM_access_default      /* Take the domain default */
++} hvmmem_access_t;
++/* Notify that a region of memory is to have specific access types */
++struct xen_hvm_set_mem_access {
++    /* Domain to be updated. */
++    domid_t domid;
++    /* Memory access type */
++    uint16_t hvmmem_access; /* hvmmem_access_t */
++    /* Number of pages, ignored on setting default access */
++    uint32_t nr;
++    /* First pfn, or ~0ull to set the default access for new pages */
++    uint64_aligned_t first_pfn;
++};
++typedef struct xen_hvm_set_mem_access xen_hvm_set_mem_access_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_access_t);
++
++#define HVMOP_get_mem_access        13
++/* Get the specific access type for that region of memory */
++struct xen_hvm_get_mem_access {
++    /* Domain to be queried. */
++    domid_t domid;
++    /* Memory access type: OUT */
++    uint16_t hvmmem_access; /* hvmmem_access_t */
++    /* pfn, or ~0ull for default access for new pages.  IN */
++    uint64_aligned_t pfn;
++};
++typedef struct xen_hvm_get_mem_access xen_hvm_get_mem_access_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_mem_access_t);
++
++#define HVMOP_inject_trap            14
++/* Inject a trap into a VCPU, which will get taken up on the next
++ * scheduling of it. Note that the caller should know enough of the
++ * state of the CPU before injecting, to know what the effect of
++ * injecting the trap will be.
++ */
++struct xen_hvm_inject_trap {
++    /* Domain to be queried. */
++    domid_t domid;
++    /* VCPU */
++    uint32_t vcpuid;
++    /* Trap number */
++    uint32_t trap;
++    /* Error code, or -1 to skip */
++    uint32_t error_code;
++    /* CR2 for page faults */
++    uint64_aligned_t cr2;
++};
++typedef struct xen_hvm_inject_trap xen_hvm_inject_trap_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_inject_trap_t);
++
++#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
++
++#define HVMOP_get_mem_type    15
++/* Return hvmmem_type_t for the specified pfn. */
++struct xen_hvm_get_mem_type {
++    /* Domain to be queried. */
++    domid_t domid;
++    /* OUT variable. */
++    uint16_t mem_type;
++    uint16_t pad[2]; /* align next field on 8-byte boundary */
++    /* IN variable. */
++    uint64_t pfn;
++};
++typedef struct xen_hvm_get_mem_type xen_hvm_get_mem_type_t;
++DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_mem_type_t);
++
   #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
diff --cc include/xen/interface/hvm/ioreq.h

index 0000000,0000000..4022a1d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/hvm/ioreq.h
@@@ -1,0 -1,0 +1,140 @@@
++/*
++ * ioreq.h: I/O request definitions for device models
++ * Copyright (c) 2004, Intel Corporation.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef _IOREQ_H_
++#define _IOREQ_H_
++
++#define IOREQ_READ      1
++#define IOREQ_WRITE     0
++
++#define STATE_IOREQ_NONE        0
++#define STATE_IOREQ_READY       1
++#define STATE_IOREQ_INPROCESS   2
++#define STATE_IORESP_READY      3
++
++#define IOREQ_TYPE_PIO          0 /* pio */
++#define IOREQ_TYPE_COPY         1 /* mmio ops */
++#define IOREQ_TYPE_TIMEOFFSET   7
++#define IOREQ_TYPE_INVALIDATE   8 /* mapcache */
++
++/*
++ * VMExit dispatcher should cooperate with instruction decoder to
++ * prepare this structure and notify service OS and DM by sending
++ * virq
++ */
++struct ioreq {
++    uint64_t addr;          /* physical address */
++    uint64_t data;          /* data (or paddr of data) */
++    uint32_t count;         /* for rep prefixes */
++    uint32_t size;          /* size in bytes */
++    uint32_t vp_eport;      /* evtchn for notifications to/from device model */
++    uint16_t _pad0;
++    uint8_t state:4;
++    uint8_t data_is_ptr:1;  /* if 1, data above is the guest paddr 
++                             * of the real data to use. */
++    uint8_t dir:1;          /* 1=read, 0=write */
++    uint8_t df:1;
++    uint8_t _pad1:1;
++    uint8_t type;           /* I/O type */
++};
++typedef struct ioreq ioreq_t;
++
++struct shared_iopage {
++    struct ioreq vcpu_ioreq[1];
++};
++typedef struct shared_iopage shared_iopage_t;
++
++struct buf_ioreq {
++    uint8_t  type;   /* I/O type                    */
++    uint8_t  pad:1;
++    uint8_t  dir:1;  /* 1=read, 0=write             */
++    uint8_t  size:2; /* 0=>1, 1=>2, 2=>4, 3=>8. If 8, use two buf_ioreqs */
++    uint32_t addr:20;/* physical address            */
++    uint32_t data;   /* data                        */
++};
++typedef struct buf_ioreq buf_ioreq_t;
++
++#define IOREQ_BUFFER_SLOT_NUM     511 /* 8 bytes each, plus 2 4-byte indexes */
++struct buffered_iopage {
++    unsigned int read_pointer;
++    unsigned int write_pointer;
++    buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM];
++}; /* NB. Size of this structure must be no greater than one page. */
++typedef struct buffered_iopage buffered_iopage_t;
++
++#if defined(__ia64__)
++struct pio_buffer {
++    uint32_t page_offset;
++    uint32_t pointer;
++    uint32_t data_end;
++    uint32_t buf_size;
++    void *opaque;
++};
++
++#define PIO_BUFFER_IDE_PRIMARY   0 /* I/O port = 0x1F0 */
++#define PIO_BUFFER_IDE_SECONDARY 1 /* I/O port = 0x170 */
++#define PIO_BUFFER_ENTRY_NUM     2
++struct buffered_piopage {
++    struct pio_buffer pio[PIO_BUFFER_ENTRY_NUM];
++    uint8_t buffer[1];
++};
++#endif /* defined(__ia64__) */
++
++/*
++ * ACPI Control/Event register locations. Location is controlled by a 
++ * version number in HVM_PARAM_ACPI_IOPORTS_LOCATION.
++ */
++
++/* Version 0 (default): Traditional Xen locations. */
++#define ACPI_PM1A_EVT_BLK_ADDRESS_V0 0x1f40
++#define ACPI_PM1A_CNT_BLK_ADDRESS_V0 (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 0x04)
++#define ACPI_PM_TMR_BLK_ADDRESS_V0   (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 0x08)
++#define ACPI_GPE0_BLK_ADDRESS_V0     (ACPI_PM_TMR_BLK_ADDRESS_V0 + 0x20)
++#define ACPI_GPE0_BLK_LEN_V0         0x08
++
++/* Version 1: Locations preferred by modern Qemu. */
++#define ACPI_PM1A_EVT_BLK_ADDRESS_V1 0xb000
++#define ACPI_PM1A_CNT_BLK_ADDRESS_V1 (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 0x04)
++#define ACPI_PM_TMR_BLK_ADDRESS_V1   (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 0x08)
++#define ACPI_GPE0_BLK_ADDRESS_V1     0xafe0
++#define ACPI_GPE0_BLK_LEN_V1         0x04
++
++/* Compatibility definitions for the default location (version 0). */
++#define ACPI_PM1A_EVT_BLK_ADDRESS    ACPI_PM1A_EVT_BLK_ADDRESS_V0
++#define ACPI_PM1A_CNT_BLK_ADDRESS    ACPI_PM1A_CNT_BLK_ADDRESS_V0
++#define ACPI_PM_TMR_BLK_ADDRESS      ACPI_PM_TMR_BLK_ADDRESS_V0
++#define ACPI_GPE0_BLK_ADDRESS        ACPI_GPE0_BLK_ADDRESS_V0
++#define ACPI_GPE0_BLK_LEN            ACPI_GPE0_BLK_LEN_V0
++
++
++#endif /* _IOREQ_H_ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/hvm/params.h

index 1888d8c,1888d8c..e9df0be
--- 1/include/xen/interface/hvm/params.h
--- 2/include/xen/interface/hvm/params.h
+++ b/include/xen/interface/hvm/params.h
@@@ -33,11 -33,11 +33,17 @@@
    * val[63:56] == 1: val[55:0] is a delivery PCI INTx line, as follows:
    *                  Domain = val[47:32], Bus  = val[31:16],
    *                  DevFn  = val[15: 8], IntX = val[ 1: 0]
-- * val[63:56] == 2: val[7:0] is a vector number.
++ * val[63:56] == 2: val[7:0] is a vector number, check for
++ *                  XENFEAT_hvm_callback_vector to know if this delivery
++ *                  method is available.
    * If val == 0 then CPU0 event-channel notifications are not delivered.
    */
   #define HVM_PARAM_CALLBACK_IRQ 0
   
++/*
++ * These are not used by Xen. They are here for convenience of HVM-guest
++ * xenbus implementations.
++ */
   #define HVM_PARAM_STORE_PFN    1
   #define HVM_PARAM_STORE_EVTCHN 2
   
@@@ -47,6 -47,6 +53,19 @@@
   
   #define HVM_PARAM_BUFIOREQ_PFN 6
   
++#ifdef __ia64__
++
++#define HVM_PARAM_NVRAM_FD     7
++#define HVM_PARAM_VHPT_SIZE    8
++#define HVM_PARAM_BUFPIOREQ_PFN       9
++
++#elif defined(__i386__) || defined(__x86_64__)
++
++/* Expose Viridian interfaces to this HVM guest? */
++#define HVM_PARAM_VIRIDIAN     9
++
++#endif
++
   /*
    * Set mode for virtual timers (currently x86 only):
    *  delay_for_missed_ticks (default):
@@@ -90,6 -90,6 +109,37 @@@
   /* Boolean: Enable aligning all periodic vpts to reduce interrupts */
   #define HVM_PARAM_VPT_ALIGN    16
   
--#define HVM_NR_PARAMS          17
++/* Console debug shared memory ring and event channel */
++#define HVM_PARAM_CONSOLE_PFN    17
++#define HVM_PARAM_CONSOLE_EVTCHN 18
++
++/*
++ * Select location of ACPI PM1a and TMR control blocks. Currently two locations
++ * are supported, specified by version 0 or 1 in this parameter:
++ *   - 0: default, use the old addresses
++ *        PM1A_EVT == 0x1f40; PM1A_CNT == 0x1f44; PM_TMR == 0x1f48
++ *   - 1: use the new default qemu addresses
++ *        PM1A_EVT == 0xb000; PM1A_CNT == 0xb004; PM_TMR == 0xb008
++ * You can find these address definitions in <hvm/ioreq.h>
++ */
++#define HVM_PARAM_ACPI_IOPORTS_LOCATION 19
++
++/* Enable blocking memory events, async or sync (pause vcpu until response)
++ * onchangeonly indicates messages only on a change of value */
++#define HVM_PARAM_MEMORY_EVENT_CR0   20
++#define HVM_PARAM_MEMORY_EVENT_CR3   21
++#define HVM_PARAM_MEMORY_EVENT_CR4   22
++#define HVM_PARAM_MEMORY_EVENT_INT3  23
++
++#define HVMPME_MODE_MASK       (3 << 0)
++#define HVMPME_mode_disabled   0
++#define HVMPME_mode_async      1
++#define HVMPME_mode_sync       2
++#define HVMPME_onchangeonly    (1 << 2)
++
++/* Boolean: Enable nestedhvm (hvm only) */
++#define HVM_PARAM_NESTEDHVM    24
++
++#define HVM_NR_PARAMS          25
   
   #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
diff --cc include/xen/interface/hvm/save.h

index 0000000,0000000..8a4c538

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/hvm/save.h
@@@ -1,0 -1,0 +1,110 @@@
++/* 
++ * hvm/save.h
++ *
++ * Structure definitions for HVM state that is held by Xen and must
++ * be saved along with the domain's memory and device-model state.
++ * 
++ * Copyright (c) 2007 XenSource Ltd.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_PUBLIC_HVM_SAVE_H__
++#define __XEN_PUBLIC_HVM_SAVE_H__
++
++/*
++ * Structures in this header *must* have the same layout in 32bit 
++ * and 64bit environments: this means that all fields must be explicitly 
++ * sized types and aligned to their sizes, and the structs must be 
++ * a multiple of eight bytes long.
++ *
++ * Only the state necessary for saving and restoring (i.e. fields 
++ * that are analogous to actual hardware state) should go in this file. 
++ * Internal mechanisms should be kept in Xen-private headers.
++ */
++
++#if !defined(__GNUC__) || defined(__STRICT_ANSI__)
++#error "Anonymous structs/unions are a GNU extension."
++#endif
++
++/* 
++ * Each entry is preceded by a descriptor giving its type and length
++ */
++struct hvm_save_descriptor {
++    uint16_t typecode;          /* Used to demux the various types below */
++    uint16_t instance;          /* Further demux within a type */
++    uint32_t length;            /* In bytes, *not* including this descriptor */
++};
++
++
++/* 
++ * Each entry has a datatype associated with it: for example, the CPU state 
++ * is saved as a HVM_SAVE_TYPE(CPU), which has HVM_SAVE_LENGTH(CPU), 
++ * and is identified by a descriptor with typecode HVM_SAVE_CODE(CPU).
++ * DECLARE_HVM_SAVE_TYPE binds these things together with some type-system
++ * ugliness.
++ */
++
++#ifdef __XEN__
++# define DECLARE_HVM_SAVE_TYPE_COMPAT(_x, _code, _type, _ctype, _fix)     \
++    static inline int __HVM_SAVE_FIX_COMPAT_##_x(void *h) { return _fix(h); } \
++    struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[2];}; \
++    struct __HVM_SAVE_TYPE_COMPAT_##_x { _ctype t; }                   
++
++# define DECLARE_HVM_SAVE_TYPE(_x, _code, _type)                         \
++    static inline int __HVM_SAVE_FIX_COMPAT_##_x(void *h) { BUG(); return -1; } \
++    struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[1];}; \
++    struct __HVM_SAVE_TYPE_COMPAT_##_x { _type t; }                   
++#else
++# define DECLARE_HVM_SAVE_TYPE_COMPAT(_x, _code, _type, _ctype, _fix)     \
++    struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[2];} 
++
++# define DECLARE_HVM_SAVE_TYPE(_x, _code, _type)                         \
++    struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[1];} 
++#endif
++
++#define HVM_SAVE_TYPE(_x) typeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->t)
++#define HVM_SAVE_LENGTH(_x) (sizeof (HVM_SAVE_TYPE(_x)))
++#define HVM_SAVE_CODE(_x) (sizeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->c))
++
++#ifdef __XEN__
++# define HVM_SAVE_TYPE_COMPAT(_x) typeof (((struct __HVM_SAVE_TYPE_COMPAT_##_x *)(0))->t)
++# define HVM_SAVE_LENGTH_COMPAT(_x) (sizeof (HVM_SAVE_TYPE_COMPAT(_x)))
++
++# define HVM_SAVE_HAS_COMPAT(_x) (sizeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->cpt)-1)
++# define HVM_SAVE_FIX_COMPAT(_x, _dst) __HVM_SAVE_FIX_COMPAT_##_x(_dst)
++#endif
++
++/* 
++ * The series of save records is terminated by a zero-type, zero-length 
++ * descriptor.
++ */
++
++struct hvm_save_end {};
++DECLARE_HVM_SAVE_TYPE(END, 0, struct hvm_save_end);
++
++#if defined(__i386__) || defined(__x86_64__)
++#include "../arch-x86/hvm/save.h"
++#elif defined(__ia64__)
++#include "../arch-ia64/hvm/save.h"
++#else
++#error "unsupported architecture"
++#endif
++
++#endif /* __XEN_PUBLIC_HVM_SAVE_H__ */
diff --cc include/xen/interface/io/blkif.h

index 3d5d6db,61e523a..c11a137
--- 1/include/xen/interface/io/blkif.h
--- 2/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@@ -3,6 -3,6 +3,24 @@@
    *
    * Unified block-device I/O interface for Xen guest OSes.
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (c) 2003-2004, Keir Fraser
    */
   
@@@ -24,8 -24,8 +42,10 @@@
    * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
    */
   
--typedef uint16_t blkif_vdev_t;
--typedef uint64_t blkif_sector_t;
++#ifndef blkif_vdev_t
++#define blkif_vdev_t   uint16_t
++#endif
++#define blkif_sector_t uint64_t
   
   /*
    * REQUEST CODES.
@@@ -34,7 -34,7 +54,7 @@@
   #define BLKIF_OP_WRITE             1
   /*
    * Recognised only if "feature-barrier" is present in backend xenbus info.
-- * The "feature_barrier" node contains a boolean indicating whether barrier
++ * The "feature-barrier" node contains a boolean indicating whether barrier
    * requests are likely to succeed or fail. Either way, a barrier request
    * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
    * the underlying block-device hardware. The boolean simply indicates whether
@@@ -43,52 -43,39 +63,104 @@@
    * create the "feature-barrier" node!
    */
   #define BLKIF_OP_WRITE_BARRIER     2
- 
+ +/*
+ + * Recognised if "feature-flush-cache" is present in backend xenbus
+ + * info.  A flush will ask the underlying storage hardware to flush its
+ + * non-volatile caches as appropriate.  The "feature-flush-cache" node
+ + * contains a boolean indicating whether flush requests are likely to
+ + * succeed or fail. Either way, a flush request may fail at any time
+ + * with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying
+ + * block-device hardware. The boolean simply indicates whether or not it
+ + * is worthwhile for the frontend to attempt flushes.  If a backend does
+ + * not recognise BLKIF_OP_WRITE_FLUSH_CACHE, it should *not* create the
+ + * "feature-flush-cache" node!
+ + */
+ +#define BLKIF_OP_FLUSH_DISKCACHE   3
+ +/*
++ * Device specific command packet contained within the request
++ */
++#define BLKIF_OP_PACKET            4
++/*
++ * Recognised only if "feature-trim" is present in backend xenbus info.
++ * The "feature-trim" node contains a boolean indicating whether trim
++ * requests are likely to succeed or fail. Either way, a trim request
++ * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
++ * the underlying block-device hardware. The boolean simply indicates whether
++ * or not it is worthwhile for the frontend to attempt trim requests.
++ * If a backend does not recognise BLKIF_OP_TRIM, it should *not*
++ * create the "feature-trim" node!
++ *
++ * Trim operation is a request for the underlying block device to mark
++ * extents to be erased. Trim operations are passed with sector_number as the
++ * sector index to begin trim operations at and nr_sectors as the number of
++ * sectors to be trimmed. The specified sectors should be trimmed if the
++ * underlying block device supports trim operations, or a BLKIF_RSP_EOPNOTSUPP
++ * should be returned. More information about trim operations at:
++ * http://t13.org/Documents/UploadedDocuments/docs2008/
++ *     e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc
++ */
++#define BLKIF_OP_TRIM              5
+ 
+ /*
    * Maximum scatter/gather segments per request.
-- * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
++ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
    * NB. This could be 12 if the ring indexes weren't stored in the same page.
    */
   #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
   
--struct blkif_request_rw {
--      blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
--      struct blkif_request_segment {
--              grant_ref_t gref;        /* reference to I/O buffer frame        */
--              /* @first_sect: first sector in frame to transfer (inclusive).   */
--              /* @last_sect: last sector in frame to transfer (inclusive).     */
--              uint8_t     first_sect, last_sect;
--      } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++/*
++ * NB. first_sect and last_sect in blkif_request_segment, as well as
++ * sector_number in blkif_request, are always expressed in 512-byte units.
++ * However they must be properly aligned to the real sector size of the
++ * physical disk, which is reported in the "sector-size" node in the backend
++ * xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units.
++ */
++struct blkif_request_segment {
++    grant_ref_t gref;        /* reference to I/O buffer frame        */
++    /* @first_sect: first sector in frame to transfer (inclusive).   */
++    /* @last_sect: last sector in frame to transfer (inclusive).     */
++    uint8_t     first_sect, last_sect;
   };
   
   struct blkif_request {
--      uint8_t        operation;    /* BLKIF_OP_???                         */
--      uint8_t        nr_segments;  /* number of segments                   */
--      blkif_vdev_t   handle;       /* only for read/write requests         */
--      uint64_t       id;           /* private guest value, echoed in resp  */
--      union {
--              struct blkif_request_rw rw;
--      } u;
++    uint8_t        operation;    /* BLKIF_OP_???                         */
++    uint8_t        nr_segments;  /* number of segments                   */
++    blkif_vdev_t   handle;       /* only for read/write requests         */
++    uint64_t       id;           /* private guest value, echoed in resp  */
++#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++    blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
++    struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++#else
++    union {
++        struct blkif_request_rw {
++            blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
++            struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++        } rw;
++    } u;
++#endif
++};
++typedef struct blkif_request blkif_request_t;
++
++/*
++ * Cast to this structure when blkif_request.operation == BLKIF_OP_TRIM
++ * sizeof(struct blkif_request_trim) <= sizeof(struct blkif_request)
++ */
++struct blkif_request_trim {
++    uint8_t        operation;    /* BLKIF_OP_TRIM                        */
++    uint8_t        reserved;     /*                                      */
++    blkif_vdev_t   handle;       /* same as for read/write requests      */
++    uint64_t       id;           /* private guest value, echoed in resp  */
++    blkif_sector_t sector_number;/* start sector idx on disk             */
++    uint64_t       nr_sectors;   /* number of contiguous sectors to trim */
   };
++typedef struct blkif_request_trim blkif_request_trim_t;
   
   struct blkif_response {
--      uint64_t        id;              /* copied from request */
--      uint8_t         operation;       /* copied from request */
--      int16_t         status;          /* BLKIF_RSP_???       */
++    uint64_t        id;              /* copied from request */
++    uint8_t         operation;       /* copied from request */
++    int16_t         status;          /* BLKIF_RSP_???       */
   };
++typedef struct blkif_response blkif_response_t;
   
   /*
    * STATUS RETURN CODES.
diff --cc include/xen/interface/io/cdromif.h

index 0000000,0000000..b691056

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/io/cdromif.h
@@@ -1,0 -1,0 +1,120 @@@
++/******************************************************************************
++ * cdromif.h
++ *
++ * Shared definitions between backend driver and Xen guest Virtual CDROM
++ * block device.
++ *
++ * Copyright (c) 2008, Pat Campell  plc@novell.com
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_PUBLIC_IO_CDROMIF_H__
++#define __XEN_PUBLIC_IO_CDROMIF_H__
++
++/*
++ * Queries backend for CDROM support
++ */
++#define XEN_TYPE_CDROM_SUPPORT         _IO('c', 1)
++
++struct xen_cdrom_support
++{
++      uint32_t type;
++      int8_t ret;                  /* returned, 0 succeeded, -1 error */
++      int8_t err;                  /* returned, backend errno */
++      int8_t supported;            /* returned, 1 supported */
++};
++
++/*
++ * Opens backend device, returns drive geometry or
++ * any encountered errors
++ */
++#define XEN_TYPE_CDROM_OPEN            _IO('c', 2)
++
++struct xen_cdrom_open
++{
++      uint32_t type;
++      int8_t ret;
++      int8_t err;
++      int8_t pad;
++      int8_t media_present;        /* returned */
++      uint32_t sectors;            /* returned */
++      uint32_t sector_size;        /* returned */
++      int32_t payload_offset;      /* offset to backend node name payload */
++};
++
++/*
++ * Queries backend for media changed status
++ */
++#define XEN_TYPE_CDROM_MEDIA_CHANGED   _IO('c', 3)
++
++struct xen_cdrom_media_changed
++{
++      uint32_t type;
++      int8_t ret;
++      int8_t err;
++      int8_t media_changed;        /* returned */
++};
++
++/*
++ * Sends vcd generic CDROM packet to backend, followed
++ * immediately by the vcd_generic_command payload
++ */
++#define XEN_TYPE_CDROM_PACKET          _IO('c', 4)
++
++struct xen_cdrom_packet
++{
++      uint32_t type;
++      int8_t ret;
++      int8_t err;
++      int8_t pad[2];
++      int32_t payload_offset;      /* offset to vcd_generic_command payload */
++};
++
++/* CDROM_PACKET_COMMAND, payload for XEN_TYPE_CDROM_PACKET */
++struct vcd_generic_command
++{
++      uint8_t  cmd[CDROM_PACKET_SIZE];
++      uint8_t  pad[4];
++      uint32_t buffer_offset;
++      uint32_t buflen;
++      int32_t  stat;
++      uint32_t sense_offset;
++      uint8_t  data_direction;
++      uint8_t  pad1[3];
++      int32_t  quiet;
++      int32_t  timeout;
++};
++
++union xen_block_packet
++{
++      uint32_t type;
++      struct xen_cdrom_support xcs;
++      struct xen_cdrom_open xco;
++      struct xen_cdrom_media_changed xcmc;
++      struct xen_cdrom_packet xcp;
++};
++
++#define PACKET_PAYLOAD_OFFSET (sizeof(struct xen_cdrom_packet))
++#define PACKET_SENSE_OFFSET (PACKET_PAYLOAD_OFFSET + sizeof(struct vcd_generic_command))
++#define PACKET_BUFFER_OFFSET (PACKET_SENSE_OFFSET + sizeof(struct request_sense))
++#define MAX_PACKET_DATA (PAGE_SIZE - sizeof(struct xen_cdrom_packet) - \
++            sizeof(struct vcd_generic_command) - sizeof(struct request_sense))
++
++#endif
diff --cc include/xen/interface/io/console.h

index e563de7,e563de7..70906df
--- 1/include/xen/interface/io/console.h
--- 2/include/xen/interface/io/console.h
+++ b/include/xen/interface/io/console.h
@@@ -3,6 -3,6 +3,24 @@@
    *
    * Console I/O interface for Xen guest OSes.
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (c) 2005, Keir Fraser
    */
   
diff --cc include/xen/interface/io/fbif.h

index 974a51e,974a51e..7071586
--- 1/include/xen/interface/io/fbif.h
--- 2/include/xen/interface/io/fbif.h
+++ b/include/xen/interface/io/fbif.h
@@@ -41,12 -41,12 +41,13 @@@
    */
   #define XENFB_TYPE_UPDATE 2
   
--struct xenfb_update {
--      uint8_t type;           /* XENFB_TYPE_UPDATE */
--      int32_t x;              /* source x */
--      int32_t y;              /* source y */
--      int32_t width;          /* rect width */
--      int32_t height;         /* rect height */
++struct xenfb_update
++{
++    uint8_t type;    /* XENFB_TYPE_UPDATE */
++    int32_t x;      /* source x */
++    int32_t y;      /* source y */
++    int32_t width;  /* rect width */
++    int32_t height; /* rect height */
   };
   
   /*
@@@ -55,36 -55,36 +56,58 @@@
    */
   #define XENFB_TYPE_RESIZE 3
   
--struct xenfb_resize {
--      uint8_t type;           /* XENFB_TYPE_RESIZE */
--      int32_t width;          /* width in pixels */
--      int32_t height;         /* height in pixels */
--      int32_t stride;         /* stride in bytes */
--      int32_t depth;          /* depth in bits */
--      int32_t offset;         /* start offset within framebuffer */
++struct xenfb_resize
++{
++    uint8_t type;    /* XENFB_TYPE_RESIZE */
++    int32_t width;   /* width in pixels */
++    int32_t height;  /* height in pixels */
++    int32_t stride;  /* stride in bytes */
++    int32_t depth;   /* depth in bits */
++    int32_t offset;  /* offset of the framebuffer in bytes */
   };
   
   #define XENFB_OUT_EVENT_SIZE 40
   
--union xenfb_out_event {
--      uint8_t type;
--      struct xenfb_update update;
--      struct xenfb_resize resize;
--      char pad[XENFB_OUT_EVENT_SIZE];
++union xenfb_out_event
++{
++    uint8_t type;
++    struct xenfb_update update;
++    struct xenfb_resize resize;
++    char pad[XENFB_OUT_EVENT_SIZE];
   };
   
   /* In events (backend -> frontend) */
   
   /*
    * Frontends should ignore unknown in events.
-- * No in events currently defined.
    */
   
++/*
++ * Framebuffer refresh period advice
++ * Backend sends it to advise the frontend their preferred period of
++ * refresh.  Frontends that keep the framebuffer constantly up-to-date
++ * just ignore it.  Frontends that use the advice should immediately
++ * refresh the framebuffer (and send an update notification event if
++ * those have been requested), then use the update frequency to guide
++ * their periodic refreshes.
++ */
++#define XENFB_TYPE_REFRESH_PERIOD 1
++#define XENFB_NO_REFRESH 0
++
++struct xenfb_refresh_period
++{
++    uint8_t type;    /* XENFB_TYPE_REFRESH_PERIOD */
++    uint32_t period; /* period of refresh, in ms,
++                      * XENFB_NO_REFRESH if no refresh is needed */
++};
++
   #define XENFB_IN_EVENT_SIZE 40
   
--union xenfb_in_event {
--      uint8_t type;
--      char pad[XENFB_IN_EVENT_SIZE];
++union xenfb_in_event
++{
++    uint8_t type;
++    struct xenfb_refresh_period refresh_period;
++    char pad[XENFB_IN_EVENT_SIZE];
   };
   
   /* shared page */
@@@ -93,41 -93,41 +116,46 @@@
   #define XENFB_IN_RING_LEN (XENFB_IN_RING_SIZE / XENFB_IN_EVENT_SIZE)
   #define XENFB_IN_RING_OFFS 1024
   #define XENFB_IN_RING(page) \
--      ((union xenfb_in_event *)((char *)(page) + XENFB_IN_RING_OFFS))
++    ((union xenfb_in_event *)((char *)(page) + XENFB_IN_RING_OFFS))
   #define XENFB_IN_RING_REF(page, idx) \
--      (XENFB_IN_RING((page))[(idx) % XENFB_IN_RING_LEN])
++    (XENFB_IN_RING((page))[(idx) % XENFB_IN_RING_LEN])
   
   #define XENFB_OUT_RING_SIZE 2048
   #define XENFB_OUT_RING_LEN (XENFB_OUT_RING_SIZE / XENFB_OUT_EVENT_SIZE)
   #define XENFB_OUT_RING_OFFS (XENFB_IN_RING_OFFS + XENFB_IN_RING_SIZE)
   #define XENFB_OUT_RING(page) \
--      ((union xenfb_out_event *)((char *)(page) + XENFB_OUT_RING_OFFS))
++    ((union xenfb_out_event *)((char *)(page) + XENFB_OUT_RING_OFFS))
   #define XENFB_OUT_RING_REF(page, idx) \
--      (XENFB_OUT_RING((page))[(idx) % XENFB_OUT_RING_LEN])
--
--struct xenfb_page {
--      uint32_t in_cons, in_prod;
--      uint32_t out_cons, out_prod;
--
--      int32_t width;          /* width of the framebuffer (in pixels) */
--      int32_t height;         /* height of the framebuffer (in pixels) */
--      uint32_t line_length;   /* length of a row of pixels (in bytes) */
--      uint32_t mem_length;    /* length of the framebuffer (in bytes) */
--      uint8_t depth;          /* depth of a pixel (in bits) */
--
--      /*
--       * Framebuffer page directory
--       *
--       * Each directory page holds PAGE_SIZE / sizeof(*pd)
--       * framebuffer pages, and can thus map up to PAGE_SIZE *
--       * PAGE_SIZE / sizeof(*pd) bytes.  With PAGE_SIZE == 4096 and
--       * sizeof(unsigned long) == 4/8, that's 4 Megs 32 bit and 2
--       * Megs 64 bit.  256 directories give enough room for a 512
--       * Meg framebuffer with a max resolution of 12,800x10,240.
--       * Should be enough for a while with room leftover for
--       * expansion.
--       */
--      unsigned long pd[256];
++    (XENFB_OUT_RING((page))[(idx) % XENFB_OUT_RING_LEN])
++
++struct xenfb_page
++{
++    uint32_t in_cons, in_prod;
++    uint32_t out_cons, out_prod;
++
++    int32_t width;          /* the width of the framebuffer (in pixels) */
++    int32_t height;         /* the height of the framebuffer (in pixels) */
++    uint32_t line_length;   /* the length of a row of pixels (in bytes) */
++    uint32_t mem_length;    /* the length of the framebuffer (in bytes) */
++    uint8_t depth;          /* the depth of a pixel (in bits) */
++
++    /*
++     * Framebuffer page directory
++     *
++     * Each directory page holds PAGE_SIZE / sizeof(*pd)
++     * framebuffer pages, and can thus map up to PAGE_SIZE *
++     * PAGE_SIZE / sizeof(*pd) bytes.  With PAGE_SIZE == 4096 and
++     * sizeof(unsigned long) == 4/8, that's 4 Megs 32 bit and 2 Megs
++     * 64 bit.  256 directories give enough room for a 512 Meg
++     * framebuffer with a max resolution of 12,800x10,240.  Should
++     * be enough for a while with room leftover for expansion.
++     */
++#ifndef CONFIG_PARAVIRT_XEN
++    unsigned long pd[256];
++#else
++      /* Two directory pages should be enough for a while. */
++      unsigned long pd[2];
++#endif
   };
   
   /*
diff --cc include/xen/interface/io/fsif.h

index 0000000,0000000..8fc2174

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/io/fsif.h
@@@ -1,0 -1,0 +1,192 @@@
++/******************************************************************************
++ * fsif.h
++ * 
++ * Interface to FS level split device drivers.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2007, Grzegorz Milos, <gm281@cam.ac.uk>.
++ */
++
++#ifndef __XEN_PUBLIC_IO_FSIF_H__
++#define __XEN_PUBLIC_IO_FSIF_H__
++
++#include "ring.h"
++#include "../grant_table.h"
++
++#define REQ_FILE_OPEN        1
++#define REQ_FILE_CLOSE       2
++#define REQ_FILE_READ        3
++#define REQ_FILE_WRITE       4
++#define REQ_STAT             5
++#define REQ_FILE_TRUNCATE    6
++#define REQ_REMOVE           7
++#define REQ_RENAME           8
++#define REQ_CREATE           9
++#define REQ_DIR_LIST        10
++#define REQ_CHMOD           11
++#define REQ_FS_SPACE        12
++#define REQ_FILE_SYNC       13
++
++struct fsif_open_request {
++    grant_ref_t gref;
++};
++
++struct fsif_close_request {
++    uint32_t fd;
++};
++
++struct fsif_read_request {
++    uint32_t fd;
++    int32_t pad;
++    uint64_t len;
++    uint64_t offset;
++    grant_ref_t grefs[1];  /* Variable length */
++};
++
++struct fsif_write_request {
++    uint32_t fd;
++    int32_t pad;
++    uint64_t len;
++    uint64_t offset;
++    grant_ref_t grefs[1];  /* Variable length */
++};
++
++struct fsif_stat_request {
++    uint32_t fd;
++};
++
++/* This structure is a copy of some fields from stat structure, returned
++ * via the ring. */
++struct fsif_stat_response {
++    int32_t  stat_mode;
++    uint32_t stat_uid;
++    uint32_t stat_gid;
++    int32_t  stat_ret;
++    int64_t  stat_size;
++    int64_t  stat_atime;
++    int64_t  stat_mtime;
++    int64_t  stat_ctime;
++};
++
++struct fsif_truncate_request {
++    uint32_t fd;
++    int32_t pad;
++    int64_t length;
++};
++
++struct fsif_remove_request {
++    grant_ref_t gref;
++};
++
++struct fsif_rename_request {
++    uint16_t old_name_offset;
++    uint16_t new_name_offset;
++    grant_ref_t gref;
++};
++
++struct fsif_create_request {
++    int8_t directory;
++    int8_t pad;
++    int16_t pad2;
++    int32_t mode;
++    grant_ref_t gref;
++};
++
++struct fsif_list_request {
++    uint32_t offset;
++    grant_ref_t gref;
++};
++
++#define NR_FILES_SHIFT  0
++#define NR_FILES_SIZE   16   /* 16 bits for the number of files mask */
++#define NR_FILES_MASK   (((1ULL << NR_FILES_SIZE) - 1) << NR_FILES_SHIFT)
++#define ERROR_SIZE      32   /* 32 bits for the error mask */
++#define ERROR_SHIFT     (NR_FILES_SIZE + NR_FILES_SHIFT)
++#define ERROR_MASK      (((1ULL << ERROR_SIZE) - 1) << ERROR_SHIFT)
++#define HAS_MORE_SHIFT  (ERROR_SHIFT + ERROR_SIZE)    
++#define HAS_MORE_FLAG   (1ULL << HAS_MORE_SHIFT)
++
++struct fsif_chmod_request {
++    uint32_t fd;
++    int32_t mode;
++};
++
++struct fsif_space_request {
++    grant_ref_t gref;
++};
++
++struct fsif_sync_request {
++    uint32_t fd;
++};
++
++
++/* FS operation request */
++struct fsif_request {
++    uint8_t type;                 /* Type of the request                  */
++    uint8_t pad;
++    uint16_t id;                  /* Request ID, copied to the response   */
++    uint32_t pad2;
++    union {
++        struct fsif_open_request     fopen;
++        struct fsif_close_request    fclose;
++        struct fsif_read_request     fread;
++        struct fsif_write_request    fwrite;
++        struct fsif_stat_request     fstat;
++        struct fsif_truncate_request ftruncate;
++        struct fsif_remove_request   fremove;
++        struct fsif_rename_request   frename;
++        struct fsif_create_request   fcreate;
++        struct fsif_list_request     flist;
++        struct fsif_chmod_request    fchmod;
++        struct fsif_space_request    fspace;
++        struct fsif_sync_request     fsync;
++    } u;
++};
++typedef struct fsif_request fsif_request_t;
++
++/* FS operation response */
++struct fsif_response {
++    uint16_t id;
++    uint16_t pad1;
++    uint32_t pad2;
++    union {
++        uint64_t ret_val;
++        struct fsif_stat_response fstat;
++    } u;
++};
++
++typedef struct fsif_response fsif_response_t;
++
++#define FSIF_RING_ENTRY_SIZE   64
++
++#define FSIF_NR_READ_GNTS  ((FSIF_RING_ENTRY_SIZE - sizeof(struct fsif_read_request)) /  \
++                                sizeof(grant_ref_t) + 1)
++#define FSIF_NR_WRITE_GNTS ((FSIF_RING_ENTRY_SIZE - sizeof(struct fsif_write_request)) / \
++                                sizeof(grant_ref_t) + 1)
++
++DEFINE_RING_TYPES(fsif, struct fsif_request, struct fsif_response);
++
++#define STATE_INITIALISED     "init"
++#define STATE_READY           "ready"
++#define STATE_CLOSING         "closing"
++#define STATE_CLOSED          "closed"
++
++
++#endif
diff --cc include/xen/interface/io/kbdif.h

index 8066c78,8066c78..39bae65
--- 1/include/xen/interface/io/kbdif.h
--- 2/include/xen/interface/io/kbdif.h
+++ b/include/xen/interface/io/kbdif.h
@@@ -45,34 -45,34 +45,38 @@@
    */
   #define XENKBD_TYPE_POS     4
   
--struct xenkbd_motion {
--      uint8_t type;           /* XENKBD_TYPE_MOTION */
--      int32_t rel_x;          /* relative X motion */
--      int32_t rel_y;          /* relative Y motion */
--      int32_t rel_z;          /* relative Z motion (wheel) */
++struct xenkbd_motion
++{
++    uint8_t type;        /* XENKBD_TYPE_MOTION */
++    int32_t rel_x;       /* relative X motion */
++    int32_t rel_y;       /* relative Y motion */
++    int32_t rel_z;       /* relative Z motion (wheel) */
   };
   
--struct xenkbd_key {
--      uint8_t type;           /* XENKBD_TYPE_KEY */
--      uint8_t pressed;        /* 1 if pressed; 0 otherwise */
--      uint32_t keycode;       /* KEY_* from linux/input.h */
++struct xenkbd_key
++{
++    uint8_t type;         /* XENKBD_TYPE_KEY */
++    uint8_t pressed;      /* 1 if pressed; 0 otherwise */
++    uint32_t keycode;     /* KEY_* from linux/input.h */
   };
   
--struct xenkbd_position {
--      uint8_t type;           /* XENKBD_TYPE_POS */
--      int32_t abs_x;          /* absolute X position (in FB pixels) */
--      int32_t abs_y;          /* absolute Y position (in FB pixels) */
--      int32_t rel_z;          /* relative Z motion (wheel) */
++struct xenkbd_position
++{
++    uint8_t type;        /* XENKBD_TYPE_POS */
++    int32_t abs_x;       /* absolute X position (in FB pixels) */
++    int32_t abs_y;       /* absolute Y position (in FB pixels) */
++    int32_t rel_z;       /* relative Z motion (wheel) */
   };
   
   #define XENKBD_IN_EVENT_SIZE 40
   
--union xenkbd_in_event {
--      uint8_t type;
--      struct xenkbd_motion motion;
--      struct xenkbd_key key;
--      struct xenkbd_position pos;
--      char pad[XENKBD_IN_EVENT_SIZE];
++union xenkbd_in_event
++{
++    uint8_t type;
++    struct xenkbd_motion motion;
++    struct xenkbd_key key;
++    struct xenkbd_position pos;
++    char pad[XENKBD_IN_EVENT_SIZE];
   };
   
   /* Out events (frontend -> backend) */
@@@ -85,9 -85,9 +89,10 @@@
   
   #define XENKBD_OUT_EVENT_SIZE 40
   
--union xenkbd_out_event {
--      uint8_t type;
--      char pad[XENKBD_OUT_EVENT_SIZE];
++union xenkbd_out_event
++{
++    uint8_t type;
++    char pad[XENKBD_OUT_EVENT_SIZE];
   };
   
   /* shared page */
@@@ -96,21 -96,21 +101,22 @@@
   #define XENKBD_IN_RING_LEN (XENKBD_IN_RING_SIZE / XENKBD_IN_EVENT_SIZE)
   #define XENKBD_IN_RING_OFFS 1024
   #define XENKBD_IN_RING(page) \
--      ((union xenkbd_in_event *)((char *)(page) + XENKBD_IN_RING_OFFS))
++    ((union xenkbd_in_event *)((char *)(page) + XENKBD_IN_RING_OFFS))
   #define XENKBD_IN_RING_REF(page, idx) \
--      (XENKBD_IN_RING((page))[(idx) % XENKBD_IN_RING_LEN])
++    (XENKBD_IN_RING((page))[(idx) % XENKBD_IN_RING_LEN])
   
   #define XENKBD_OUT_RING_SIZE 1024
   #define XENKBD_OUT_RING_LEN (XENKBD_OUT_RING_SIZE / XENKBD_OUT_EVENT_SIZE)
   #define XENKBD_OUT_RING_OFFS (XENKBD_IN_RING_OFFS + XENKBD_IN_RING_SIZE)
   #define XENKBD_OUT_RING(page) \
--      ((union xenkbd_out_event *)((char *)(page) + XENKBD_OUT_RING_OFFS))
++    ((union xenkbd_out_event *)((char *)(page) + XENKBD_OUT_RING_OFFS))
   #define XENKBD_OUT_RING_REF(page, idx) \
--      (XENKBD_OUT_RING((page))[(idx) % XENKBD_OUT_RING_LEN])
++    (XENKBD_OUT_RING((page))[(idx) % XENKBD_OUT_RING_LEN])
   
--struct xenkbd_page {
--      uint32_t in_cons, in_prod;
--      uint32_t out_cons, out_prod;
++struct xenkbd_page
++{
++    uint32_t in_cons, in_prod;
++    uint32_t out_cons, out_prod;
   };
   
   #endif
diff --cc include/xen/interface/io/netif.h

index cb94668,cb94668..f513b7f
--- 1/include/xen/interface/io/netif.h
--- 2/include/xen/interface/io/netif.h
+++ b/include/xen/interface/io/netif.h
@@@ -3,6 -3,6 +3,24 @@@
    *
    * Unified network-device I/O interface for Xen guest OSes.
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (c) 2003-2004, Keir Fraser
    */
   
@@@ -47,18 -47,18 +65,21 @@@
   #define _XEN_NETTXF_extra_info                (3)
   #define  XEN_NETTXF_extra_info                (1U<<_XEN_NETTXF_extra_info)
   
--struct xen_netif_tx_request {
++struct netif_tx_request {
       grant_ref_t gref;      /* Reference to buffer page */
       uint16_t offset;       /* Offset within buffer page */
       uint16_t flags;        /* XEN_NETTXF_* */
       uint16_t id;           /* Echoed in response message. */
       uint16_t size;         /* Packet size in bytes.       */
   };
++typedef struct netif_tx_request netif_tx_request_t;
   
--/* Types of xen_netif_extra_info descriptors. */
++/* Types of netif_extra_info descriptors. */
   #define XEN_NETIF_EXTRA_TYPE_NONE     (0)  /* Never used - invalid */
   #define XEN_NETIF_EXTRA_TYPE_GSO      (1)  /* u.gso */
--#define XEN_NETIF_EXTRA_TYPE_MAX      (2)
++#define XEN_NETIF_EXTRA_TYPE_MCAST_ADD        (2)  /* u.mcast */
++#define XEN_NETIF_EXTRA_TYPE_MCAST_DEL        (3)  /* u.mcast */
++#define XEN_NETIF_EXTRA_TYPE_MAX      (4)
   
   /* xen_netif_extra_info flags. */
   #define _XEN_NETIF_EXTRA_FLAG_MORE    (0)
@@@ -71,49 -71,49 +92,68 @@@
    * This structure needs to fit within both netif_tx_request and
    * netif_rx_response for compatibility.
    */
--struct xen_netif_extra_info {
--      uint8_t type;  /* XEN_NETIF_EXTRA_TYPE_* */
--      uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
--
--      union {
--              struct {
--                      /*
--                       * Maximum payload size of each segment. For
--                       * example, for TCP this is just the path MSS.
--                       */
--                      uint16_t size;
--
--                      /*
--                       * GSO type. This determines the protocol of
--                       * the packet and any extra features required
--                       * to segment the packet properly.
--                       */
--                      uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
--
--                      /* Future expansion. */
--                      uint8_t pad;
--
--                      /*
--                       * GSO features. This specifies any extra GSO
--                       * features required to process this packet,
--                       * such as ECN support for TCPv4.
--                       */
--                      uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
--              } gso;
--
--              uint16_t pad[3];
--      } u;
++struct netif_extra_info {
++    uint8_t type;  /* XEN_NETIF_EXTRA_TYPE_* */
++    uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
++
++    union {
++        /*
++         * XEN_NETIF_EXTRA_TYPE_GSO:
++         */
++        struct {
++            /*
++             * Maximum payload size of each segment. For example, for TCP this
++             * is just the path MSS.
++             */
++            uint16_t size;
++
++            /*
++             * GSO type. This determines the protocol of the packet and any
++             * extra features required to segment the packet properly.
++             */
++            uint8_t type; /* XEN_NETIF_GSO_TYPE_* */
++
++            /* Future expansion. */
++            uint8_t pad;
++
++            /*
++             * GSO features. This specifies any extra GSO features required
++             * to process this packet, such as ECN support for TCPv4.
++             */
++            uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
++        } gso;
++
++        /*
++         * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}:
++         * Backend advertises availability via 'feature-multicast-control'
++         * xenbus node containing value '1'.
++         * Frontend requests this feature by advertising
++         * 'request-multicast-control' xenbus node containing value '1'.
++         * If multicast control is requested then multicast flooding is
++         * disabled and the frontend must explicitly register its interest
++         * in multicast groups using dummy transmit requests containing
++         * MCAST_{ADD,DEL} extra-info fragments.
++         */
++        struct {
++            uint8_t addr[6]; /* Address to add/remove. */
++        } mcast;
++
++        uint16_t pad[3];
++    } u;
   };
++typedef struct netif_extra_info netif_extra_info_t;
   
--struct xen_netif_tx_response {
--      uint16_t id;
--      int16_t  status;       /* XEN_NETIF_RSP_* */
++struct netif_tx_response {
++    uint16_t id;
++    int16_t  status;       /* XEN_NETIF_RSP_* */
   };
++typedef struct netif_tx_response netif_tx_response_t;
   
--struct xen_netif_rx_request {
--      uint16_t    id;        /* Echoed in response message.        */
--      grant_ref_t gref;      /* Reference to incoming granted frame */
++struct netif_rx_request {
++    uint16_t    id;        /* Echoed in response message.        */
++    grant_ref_t gref;      /* Reference to incoming granted frame */
   };
++typedef struct netif_rx_request netif_rx_request_t;
   
   /* Packet data has been validated against protocol checksum. */
   #define _XEN_NETRXF_data_validated    (0)
@@@ -135,28 -135,28 +175,39 @@@
   #define _XEN_NETRXF_gso_prefix                (4)
   #define  XEN_NETRXF_gso_prefix                (1U<<_XEN_NETRXF_gso_prefix)
   
--struct xen_netif_rx_response {
++struct netif_rx_response {
       uint16_t id;
       uint16_t offset;       /* Offset in page of start of received packet  */
       uint16_t flags;        /* XEN_NETRXF_* */
       int16_t  status;       /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */
   };
++typedef struct netif_rx_response netif_rx_response_t;
   
   /*
    * Generate netif ring structures and types.
    */
   
++#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++DEFINE_RING_TYPES(netif_tx, struct netif_tx_request, struct netif_tx_response);
++DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response);
++#else
++#define xen_netif_tx_request netif_tx_request
++#define xen_netif_rx_request netif_rx_request
++#define xen_netif_tx_response netif_tx_response
++#define xen_netif_rx_response netif_rx_response
   DEFINE_RING_TYPES(xen_netif_tx,
                   struct xen_netif_tx_request,
                   struct xen_netif_tx_response);
   DEFINE_RING_TYPES(xen_netif_rx,
                   struct xen_netif_rx_request,
                   struct xen_netif_rx_response);
++#define xen_netif_extra_info netif_extra_info
++#endif
   
   #define XEN_NETIF_RSP_DROPPED -2
   #define XEN_NETIF_RSP_ERROR   -1
   #define XEN_NETIF_RSP_OKAY     0
--/* No response: used for auxiliary requests (e.g., xen_netif_extra_info). */
++/* No response: used for auxiliary requests (e.g., netif_extra_info). */
   #define XEN_NETIF_RSP_NULL     1
   
   #endif
diff --cc include/xen/interface/io/protocols.h

index 01fc8ae,01fc8ae..5fbf10c
--- 1/include/xen/interface/io/protocols.h
--- 2/include/xen/interface/io/protocols.h
+++ b/include/xen/interface/io/protocols.h
@@@ -1,10 -1,10 +1,31 @@@
++/******************************************************************************
++ * protocols.h
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
   #ifndef __XEN_PROTOCOLS_H__
   #define __XEN_PROTOCOLS_H__
   
   #define XEN_IO_PROTO_ABI_X86_32     "x86_32-abi"
   #define XEN_IO_PROTO_ABI_X86_64     "x86_64-abi"
   #define XEN_IO_PROTO_ABI_IA64       "ia64-abi"
--#define XEN_IO_PROTO_ABI_POWERPC64  "powerpc64-abi"
   
   #if defined(__i386__)
   # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_32
@@@ -12,8 -12,8 +33,6 @@@
   # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_64
   #elif defined(__ia64__)
   # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_IA64
--#elif defined(__powerpc64__)
--# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_POWERPC64
   #else
   # error arch fixup needed here
   #endif
diff --cc include/xen/interface/io/ring.h

index 75271b9,75271b9..a513beb
--- 1/include/xen/interface/io/ring.h
--- 2/include/xen/interface/io/ring.h
+++ b/include/xen/interface/io/ring.h
@@@ -3,16 -3,16 +3,42 @@@
    *
    * Shared producer-consumer ring macros.
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Tim Deegan and Andrew Warfield November 2004.
    */
   
   #ifndef __XEN_PUBLIC_IO_RING_H__
   #define __XEN_PUBLIC_IO_RING_H__
   
++#include "../xen-compat.h"
++
++#if __XEN_INTERFACE_VERSION__ < 0x00030208
++#define xen_mb()  mb()
++#define xen_rmb() rmb()
++#define xen_wmb() wmb()
++#endif
++
   typedef unsigned int RING_IDX;
   
   /* Round a 32-bit unsigned constant down to the nearest power of two. */
--#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2                : ((_x) & 0x1))
++#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2                  : ((_x) & 0x1))
   #define __RD4(_x)  (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2    : __RD2(_x))
   #define __RD8(_x)  (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4    : __RD4(_x))
   #define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8    : __RD8(_x))
@@@ -31,74 -31,74 +57,86 @@@
   /*
    * The same for passing in an actual pointer instead of a name tag.
    */
--#define __RING_SIZE(_s, _sz)                                          \
--      (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
++#define __RING_SIZE(_s, _sz) \
++    (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
   
   /*
    * Macros to make the correct C datatypes for a new kind of ring.
    *
    * To make a new ring datatype, you need to have two message structures,
-- * let's say struct request, and struct response already defined.
++ * let's say request_t, and response_t already defined.
    *
    * In a header where you want the ring datatype declared, you then do:
    *
-- *     DEFINE_RING_TYPES(mytag, struct request, struct response);
++ *     DEFINE_RING_TYPES(mytag, request_t, response_t);
    *
    * These expand out to give you a set of types, as you can see below.
    * The most important of these are:
    *
-- *     struct mytag_sring      - The shared ring.
-- *     struct mytag_front_ring - The 'front' half of the ring.
-- *     struct mytag_back_ring  - The 'back' half of the ring.
++ *     mytag_sring_t      - The shared ring.
++ *     mytag_front_ring_t - The 'front' half of the ring.
++ *     mytag_back_ring_t  - The 'back' half of the ring.
    *
    * To initialize a ring in your code you need to know the location and size
    * of the shared memory area (PAGE_SIZE, for instance). To initialise
    * the front half:
    *
-- *     struct mytag_front_ring front_ring;
-- *     SHARED_RING_INIT((struct mytag_sring *)shared_page);
-- *     FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page,
-- *                   PAGE_SIZE);
++ *     mytag_front_ring_t front_ring;
++ *     SHARED_RING_INIT((mytag_sring_t *)shared_page);
++ *     FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
    *
    * Initializing the back follows similarly (note that only the front
    * initializes the shared ring):
    *
-- *     struct mytag_back_ring back_ring;
-- *     BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page,
-- *                  PAGE_SIZE);
++ *     mytag_back_ring_t back_ring;
++ *     BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
    */
   
--#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)                   \
--                                                                      \
--/* Shared ring entry */                                                       \
--union __name##_sring_entry {                                          \
--    __req_t req;                                                      \
--    __rsp_t rsp;                                                      \
--};                                                                    \
--                                                                      \
--/* Shared ring page */                                                        \
--struct __name##_sring {                                                       \
--    RING_IDX req_prod, req_event;                                     \
--    RING_IDX rsp_prod, rsp_event;                                     \
--    uint8_t  pad[48];                                                 \
--    union __name##_sring_entry ring[1]; /* variable-length */         \
--};                                                                    \
--                                                                      \
--/* "Front" end's private variables */                                 \
--struct __name##_front_ring {                                          \
--    RING_IDX req_prod_pvt;                                            \
--    RING_IDX rsp_cons;                                                        \
--    unsigned int nr_ents;                                             \
--    struct __name##_sring *sring;                                     \
--};                                                                    \
--                                                                      \
--/* "Back" end's private variables */                                  \
--struct __name##_back_ring {                                           \
--    RING_IDX rsp_prod_pvt;                                            \
--    RING_IDX req_cons;                                                        \
--    unsigned int nr_ents;                                             \
--    struct __name##_sring *sring;                                     \
--};
++#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)                     \
++                                                                        \
++/* Shared ring entry */                                                 \
++union __name##_sring_entry {                                            \
++    __req_t req;                                                        \
++    __rsp_t rsp;                                                        \
++};                                                                      \
++                                                                        \
++/* Shared ring page */                                                  \
++struct __name##_sring {                                                 \
++    RING_IDX req_prod, req_event;                                       \
++    RING_IDX rsp_prod, rsp_event;                                       \
++    union {                                                             \
++        struct {                                                        \
++            uint8_t smartpoll_active;                                   \
++        } netif;                                                        \
++        struct {                                                        \
++            uint8_t msg;                                                \
++        } tapif_user;                                                   \
++        uint8_t pvt_pad[4];                                             \
++    } private;                                                          \
++    uint8_t __pad[44];                                                  \
++    union __name##_sring_entry ring[1]; /* variable-length */           \
++};                                                                      \
++                                                                        \
++/* "Front" end's private variables */                                   \
++struct __name##_front_ring {                                            \
++    RING_IDX req_prod_pvt;                                              \
++    RING_IDX rsp_cons;                                                  \
++    unsigned int nr_ents;                                               \
++    struct __name##_sring *sring;                                       \
++};                                                                      \
++                                                                        \
++/* "Back" end's private variables */                                    \
++struct __name##_back_ring {                                             \
++    RING_IDX rsp_prod_pvt;                                              \
++    RING_IDX req_cons;                                                  \
++    unsigned int nr_ents;                                               \
++    struct __name##_sring *sring;                                       \
++};                                                                      \
++                                                                        \
++/* Syntactic sugar */                                                   \
++typedef struct __name##_sring __name##_sring_t;                         \
++typedef struct __name##_front_ring __name##_front_ring_t;               \
++typedef struct __name##_back_ring __name##_back_ring_t
   
   /*
    * Macros for manipulating rings.
@@@ -116,86 -116,86 +154,95 @@@
    */
   
   /* Initialising empty rings */
--#define SHARED_RING_INIT(_s) do {                                     \
--    (_s)->req_prod  = (_s)->rsp_prod  = 0;                            \
--    (_s)->req_event = (_s)->rsp_event = 1;                            \
--    memset((_s)->pad, 0, sizeof((_s)->pad));                          \
++#define SHARED_RING_INIT(_s) do {                                       \
++    (_s)->req_prod  = (_s)->rsp_prod  = 0;                              \
++    (_s)->req_event = (_s)->rsp_event = 1;                              \
++    (void)memset((_s)->private.pvt_pad, 0, sizeof((_s)->private.pvt_pad)); \
++    (void)memset((_s)->__pad, 0, sizeof((_s)->__pad));                  \
   } while(0)
   
--#define FRONT_RING_INIT(_r, _s, __size) do {                          \
--    (_r)->req_prod_pvt = 0;                                           \
--    (_r)->rsp_cons = 0;                                                       \
--    (_r)->nr_ents = __RING_SIZE(_s, __size);                          \
--    (_r)->sring = (_s);                                                       \
++#define FRONT_RING_INIT(_r, _s, __size) do {                            \
++    (_r)->req_prod_pvt = 0;                                             \
++    (_r)->rsp_cons = 0;                                                 \
++    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
++    (_r)->sring = (_s);                                                 \
   } while (0)
   
--#define BACK_RING_INIT(_r, _s, __size) do {                           \
--    (_r)->rsp_prod_pvt = 0;                                           \
--    (_r)->req_cons = 0;                                                       \
--    (_r)->nr_ents = __RING_SIZE(_s, __size);                          \
--    (_r)->sring = (_s);                                                       \
++#define BACK_RING_INIT(_r, _s, __size) do {                             \
++    (_r)->rsp_prod_pvt = 0;                                             \
++    (_r)->req_cons = 0;                                                 \
++    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
++    (_r)->sring = (_s);                                                 \
   } while (0)
   
   /* Initialize to existing shared indexes -- for recovery */
--#define FRONT_RING_ATTACH(_r, _s, __size) do {                                \
--    (_r)->sring = (_s);                                                       \
--    (_r)->req_prod_pvt = (_s)->req_prod;                              \
--    (_r)->rsp_cons = (_s)->rsp_prod;                                  \
--    (_r)->nr_ents = __RING_SIZE(_s, __size);                          \
++#define FRONT_RING_ATTACH(_r, _s, __size) do {                          \
++    (_r)->sring = (_s);                                                 \
++    (_r)->req_prod_pvt = (_s)->req_prod;                                \
++    (_r)->rsp_cons = (_s)->rsp_prod;                                    \
++    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
   } while (0)
   
--#define BACK_RING_ATTACH(_r, _s, __size) do {                         \
--    (_r)->sring = (_s);                                                       \
--    (_r)->rsp_prod_pvt = (_s)->rsp_prod;                              \
--    (_r)->req_cons = (_s)->req_prod;                                  \
--    (_r)->nr_ents = __RING_SIZE(_s, __size);                          \
++#define BACK_RING_ATTACH(_r, _s, __size) do {                           \
++    (_r)->sring = (_s);                                                 \
++    (_r)->rsp_prod_pvt = (_s)->rsp_prod;                                \
++    (_r)->req_cons = (_s)->req_prod;                                    \
++    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
   } while (0)
   
   /* How big is this ring? */
--#define RING_SIZE(_r)                                                 \
++#define RING_SIZE(_r)                                                   \
       ((_r)->nr_ents)
   
   /* Number of free requests (for use on front side only). */
--#define RING_FREE_REQUESTS(_r)                                                \
++#define RING_FREE_REQUESTS(_r)                                          \
       (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
   
   /* Test if there is an empty slot available on the front ring.
    * (This is only meaningful from the front. )
    */
--#define RING_FULL(_r)                                                 \
++#define RING_FULL(_r)                                                   \
       (RING_FREE_REQUESTS(_r) == 0)
   
   /* Test if there are outstanding messages to be processed on a ring. */
--#define RING_HAS_UNCONSUMED_RESPONSES(_r)                             \
++#define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
       ((_r)->sring->rsp_prod - (_r)->rsp_cons)
   
--#define RING_HAS_UNCONSUMED_REQUESTS(_r)                              \
--    ({                                                                        \
--      unsigned int req = (_r)->sring->req_prod - (_r)->req_cons;      \
--      unsigned int rsp = RING_SIZE(_r) -                              \
--                         ((_r)->req_cons - (_r)->rsp_prod_pvt);       \
--      req < rsp ? req : rsp;                                          \
--    })
++#ifdef __GNUC__
++#define RING_HAS_UNCONSUMED_REQUESTS(_r) ({                             \
++    unsigned int req = (_r)->sring->req_prod - (_r)->req_cons;          \
++    unsigned int rsp = RING_SIZE(_r) -                                  \
++        ((_r)->req_cons - (_r)->rsp_prod_pvt);                          \
++    req < rsp ? req : rsp;                                              \
++})
++#else
++/* Same as above, but without the nice GCC ({ ... }) syntax. */
++#define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
++    ((((_r)->sring->req_prod - (_r)->req_cons) <                        \
++      (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) ?        \
++     ((_r)->sring->req_prod - (_r)->req_cons) :                         \
++     (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt)))
++#endif
   
   /* Direct access to individual ring elements, by index. */
--#define RING_GET_REQUEST(_r, _idx)                                    \
++#define RING_GET_REQUEST(_r, _idx)                                      \
       (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
   
--#define RING_GET_RESPONSE(_r, _idx)                                   \
++#define RING_GET_RESPONSE(_r, _idx)                                     \
       (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
   
   /* Loop termination condition: Would the specified index overflow the ring? */
--#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                         \
++#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                           \
       (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
   
--#define RING_PUSH_REQUESTS(_r) do {                                   \
--    wmb(); /* back sees requests /before/ updated producer index */   \
--    (_r)->sring->req_prod = (_r)->req_prod_pvt;                               \
++#define RING_PUSH_REQUESTS(_r) do {                                     \
++    xen_wmb(); /* back sees requests /before/ updated producer index */ \
++    (_r)->sring->req_prod = (_r)->req_prod_pvt;                         \
   } while (0)
   
--#define RING_PUSH_RESPONSES(_r) do {                                  \
--    wmb(); /* front sees responses /before/ updated producer index */ \
--    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;                               \
++#define RING_PUSH_RESPONSES(_r) do {                                    \
++    xen_wmb(); /* front sees resps /before/ updated producer index */   \
++    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;                         \
   } while (0)
   
   /*
@@@ -228,40 -228,40 +275,40 @@@
    *  field appropriately.
    */
   
--#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do {         \
--    RING_IDX __old = (_r)->sring->req_prod;                           \
--    RING_IDX __new = (_r)->req_prod_pvt;                              \
--    wmb(); /* back sees requests /before/ updated producer index */   \
--    (_r)->sring->req_prod = __new;                                    \
--    mb(); /* back sees new requests /before/ we check req_event */    \
--    (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) <         \
--               (RING_IDX)(__new - __old));                            \
++#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do {           \
++    RING_IDX __old = (_r)->sring->req_prod;                             \
++    RING_IDX __new = (_r)->req_prod_pvt;                                \
++    xen_wmb(); /* back sees requests /before/ updated producer index */ \
++    (_r)->sring->req_prod = __new;                                      \
++    xen_mb(); /* back sees new requests /before/ we check req_event */  \
++    (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) <           \
++                 (RING_IDX)(__new - __old));                            \
   } while (0)
   
--#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do {                \
--    RING_IDX __old = (_r)->sring->rsp_prod;                           \
--    RING_IDX __new = (_r)->rsp_prod_pvt;                              \
--    wmb(); /* front sees responses /before/ updated producer index */ \
--    (_r)->sring->rsp_prod = __new;                                    \
--    mb(); /* front sees new responses /before/ we check rsp_event */  \
--    (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) <         \
--               (RING_IDX)(__new - __old));                            \
++#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do {          \
++    RING_IDX __old = (_r)->sring->rsp_prod;                             \
++    RING_IDX __new = (_r)->rsp_prod_pvt;                                \
++    xen_wmb(); /* front sees resps /before/ updated producer index */   \
++    (_r)->sring->rsp_prod = __new;                                      \
++    xen_mb(); /* front sees new resps /before/ we check rsp_event */    \
++    (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) <           \
++                 (RING_IDX)(__new - __old));                            \
   } while (0)
   
--#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do {           \
--    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                 \
--    if (_work_to_do) break;                                           \
--    (_r)->sring->req_event = (_r)->req_cons + 1;                      \
--    mb();                                                             \
--    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                 \
++#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do {             \
++    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
++    if (_work_to_do) break;                                             \
++    (_r)->sring->req_event = (_r)->req_cons + 1;                        \
++    xen_mb();                                                           \
++    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
   } while (0)
   
--#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do {          \
--    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                        \
--    if (_work_to_do) break;                                           \
--    (_r)->sring->rsp_event = (_r)->rsp_cons + 1;                      \
--    mb();                                                             \
--    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                        \
++#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do {            \
++    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
++    if (_work_to_do) break;                                             \
++    (_r)->sring->rsp_event = (_r)->rsp_cons + 1;                        \
++    xen_mb();                                                           \
++    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
   } while (0)
   
   #endif /* __XEN_PUBLIC_IO_RING_H__ */
diff --cc include/xen/interface/io/tpmif.h

index 0000000,0000000..02ccdab

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/io/tpmif.h
@@@ -1,0 -1,0 +1,77 @@@
++/******************************************************************************
++ * tpmif.h
++ *
++ * TPM I/O interface for Xen guest OSes.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2005, IBM Corporation
++ *
++ * Author: Stefan Berger, stefanb@us.ibm.com
++ * Grant table support: Mahadevan Gomathisankaran
++ *
++ * This code has been derived from tools/libxc/xen/io/netif.h
++ *
++ * Copyright (c) 2003-2004, Keir Fraser
++ */
++
++#ifndef __XEN_PUBLIC_IO_TPMIF_H__
++#define __XEN_PUBLIC_IO_TPMIF_H__
++
++#include "../grant_table.h"
++
++struct tpmif_tx_request {
++    unsigned long addr;   /* Machine address of packet.   */
++    grant_ref_t ref;      /* grant table access reference */
++    uint16_t unused;
++    uint16_t size;        /* Packet size in bytes.        */
++};
++typedef struct tpmif_tx_request tpmif_tx_request_t;
++
++/*
++ * TPMIF_TX_RING_SIZE defines the number of pages the
++ * frontend and backend can exchange (= size of array).
++ */
++typedef uint32_t TPMIF_RING_IDX;
++
++#define TPMIF_TX_RING_SIZE 1
++
++/* This structure must fit in a memory page. */
++
++struct tpmif_ring {
++    struct tpmif_tx_request req;
++};
++typedef struct tpmif_ring tpmif_ring_t;
++
++struct tpmif_tx_interface {
++    struct tpmif_ring ring[TPMIF_TX_RING_SIZE];
++};
++typedef struct tpmif_tx_interface tpmif_tx_interface_t;
++
++#endif
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/io/usbif.h

index 0000000,0000000..6099c29

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/io/usbif.h
@@@ -1,0 -1,0 +1,151 @@@
++/*
++ * usbif.h
++ *
++ * USB I/O interface for Xen guest OSes.
++ *
++ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
++ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_PUBLIC_IO_USBIF_H__
++#define __XEN_PUBLIC_IO_USBIF_H__
++
++#include "ring.h"
++#include "../grant_table.h"
++
++enum usb_spec_version {
++      USB_VER_UNKNOWN = 0,
++      USB_VER_USB11,
++      USB_VER_USB20,
++      USB_VER_USB30,  /* not supported yet */
++};
++
++/*
++ *  USB pipe in usbif_request
++ *
++ *  bits 0-5 are specific bits for virtual USB driver.
++ *  bits 7-31 are standard urb pipe.
++ *
++ *  - port number(NEW):       bits 0-4
++ *                            (USB_MAXCHILDREN is 31)
++ *
++ *  - operation flag(NEW):    bit 5
++ *                            (0 = submit urb,
++ *                             1 = unlink urb)
++ *
++ *  - direction:              bit 7
++ *                            (0 = Host-to-Device [Out],
++ *                             1 = Device-to-Host [In])
++ *
++ *  - device address: bits 8-14
++ *
++ *  - endpoint:               bits 15-18
++ *
++ *  - pipe type:              bits 30-31
++ *                            (00 = isochronous, 01 = interrupt,
++ *                             10 = control, 11 = bulk)
++ */
++#define usbif_pipeportnum(pipe) ((pipe) & 0x1f)
++#define usbif_setportnum_pipe(pipe, portnum) \
++      ((pipe)|(portnum))
++
++#define usbif_pipeunlink(pipe) ((pipe) & 0x20)
++#define usbif_pipesubmit(pipe) (!usbif_pipeunlink(pipe))
++#define usbif_setunlink_pipe(pipe) ((pipe)|(0x20))
++
++#define USBIF_BACK_MAX_PENDING_REQS (128)
++#define USBIF_MAX_SEGMENTS_PER_REQUEST (16)
++
++/*
++ * RING for transferring urbs.
++ */
++struct usbif_request_segment {
++      grant_ref_t gref;
++      uint16_t offset;
++      uint16_t length;
++};
++
++struct usbif_urb_request {
++      uint16_t id; /* request id */
++      uint16_t nr_buffer_segs; /* number of urb->transfer_buffer segments */
++
++      /* basic urb parameter */
++      uint32_t pipe;
++      uint16_t transfer_flags;
++      uint16_t buffer_length;
++      union {
++              uint8_t ctrl[8]; /* setup_packet (Ctrl) */
++
++              struct {
++                      uint16_t interval; /* maximum (1024*8) in usb core */
++                      uint16_t start_frame; /* start frame */
++                      uint16_t number_of_packets; /* number of ISO packet */
++                      uint16_t nr_frame_desc_segs; /* number of iso_frame_desc segments */
++              } isoc;
++
++              struct {
++                      uint16_t interval; /* maximum (1024*8) in usb core */
++                      uint16_t pad[3];
++              } intr;
++
++              struct {
++                      uint16_t unlink_id; /* unlink request id */
++                      uint16_t pad[3];
++              } unlink;
++
++      } u;
++
++      /* urb data segments */
++      struct usbif_request_segment seg[USBIF_MAX_SEGMENTS_PER_REQUEST];
++};
++typedef struct usbif_urb_request usbif_urb_request_t;
++
++struct usbif_urb_response {
++      uint16_t id; /* request id */
++      uint16_t start_frame;  /* start frame (ISO) */
++      int32_t status; /* status (non-ISO) */
++      int32_t actual_length; /* actual transfer length */
++      int32_t error_count; /* number of ISO errors */
++};
++typedef struct usbif_urb_response usbif_urb_response_t;
++
++DEFINE_RING_TYPES(usbif_urb, struct usbif_urb_request, struct usbif_urb_response);
++#define USB_URB_RING_SIZE __CONST_RING_SIZE(usbif_urb, PAGE_SIZE)
++
++/*
++ * RING for notifying connect/disconnect events to frontend
++ */
++struct usbif_conn_request {
++      uint16_t id;
++};
++typedef struct usbif_conn_request usbif_conn_request_t;
++
++struct usbif_conn_response {
++      uint16_t id; /* request id */
++      uint8_t portnum; /* port number */
++      uint8_t speed; /* usb_device_speed */
++};
++typedef struct usbif_conn_response usbif_conn_response_t;
++
++DEFINE_RING_TYPES(usbif_conn, struct usbif_conn_request, struct usbif_conn_response);
++#define USB_CONN_RING_SIZE __CONST_RING_SIZE(usbif_conn, PAGE_SIZE)
++
++#endif /* __XEN_PUBLIC_IO_USBIF_H__ */
diff --cc include/xen/interface/io/vscsiif.h

index 0000000,0000000..3ce2914

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/io/vscsiif.h
@@@ -1,0 -1,0 +1,105 @@@
++/******************************************************************************
++ * vscsiif.h
++ * 
++ * Based on the blkif.h code.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright(c) FUJITSU Limited 2008.
++ */
++
++#ifndef __XEN__PUBLIC_IO_SCSI_H__
++#define __XEN__PUBLIC_IO_SCSI_H__
++
++#include "ring.h"
++#include "../grant_table.h"
++
++/* command between backend and frontend */
++#define VSCSIIF_ACT_SCSI_CDB         1    /* SCSI CDB command */
++#define VSCSIIF_ACT_SCSI_ABORT       2    /* SCSI Device(Lun) Abort*/
++#define VSCSIIF_ACT_SCSI_RESET       3    /* SCSI Device(Lun) Reset*/
++
++
++#define VSCSIIF_BACK_MAX_PENDING_REQS    128
++
++/*
++ * Maximum scatter/gather segments per request.
++ *
++ * Considering the balance between allocating at least 16 "vscsiif_request"
++ * structures on one page (4096 bytes) and the number of scatter/gather
++ * segments needed, we decided to use 26 as a magic number.
++ */
++#define VSCSIIF_SG_TABLESIZE             26
++
++/*
++ * based on Linux kernel 2.6.18
++ */
++#define VSCSIIF_MAX_COMMAND_SIZE         16
++#define VSCSIIF_SENSE_BUFFERSIZE         96
++
++
++struct vscsiif_request {
++    uint16_t rqid;          /* private guest value, echoed in resp  */
++    uint8_t act;            /* command between backend and frontend */
++    uint8_t cmd_len;
++
++    uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE];
++    uint16_t timeout_per_command;     /* The command is issued by twice 
++                                         the value in Backend. */
++    uint16_t channel, id, lun;
++    uint16_t padding;
++    uint8_t sc_data_direction;        /* for DMA_TO_DEVICE(1)
++                                         DMA_FROM_DEVICE(2)
++                                         DMA_NONE(3) requests  */
++    uint8_t nr_segments;              /* Number of pieces of scatter-gather */
++
++    struct scsiif_request_segment {
++        grant_ref_t gref;
++        uint16_t offset;
++        uint16_t length;
++    } seg[VSCSIIF_SG_TABLESIZE];
++    uint32_t reserved[3];
++};
++typedef struct vscsiif_request vscsiif_request_t;
++
++struct vscsiif_response {
++    uint16_t rqid;
++    uint8_t padding;
++    uint8_t sense_len;
++    uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
++    int32_t rslt;
++    uint32_t residual_len;     /* request bufflen - 
++                                  value returned from the physical device */
++    uint32_t reserved[36];
++};
++typedef struct vscsiif_response vscsiif_response_t;
++
++DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response);
++
++
++#endif  /*__XEN__PUBLIC_IO_SCSI_H__*/
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/io/xenbus.h

index 9fda532,9fda532..b17afba
--- 1/include/xen/interface/io/xenbus.h
--- 2/include/xen/interface/io/xenbus.h
+++ b/include/xen/interface/io/xenbus.h
@@@ -36,6 -36,6 +36,7 @@@ enum xenbus_stat
   
         XenbusStateReconfigured  = 8
   };
++typedef enum xenbus_state XenbusState;
   
   #endif /* _XEN_PUBLIC_IO_XENBUS_H */
   
diff --cc include/xen/interface/io/xs_wire.h

index 99fcffb,99fcffb..4521433
--- 1/include/xen/interface/io/xs_wire.h
--- 2/include/xen/interface/io/xs_wire.h
+++ b/include/xen/interface/io/xs_wire.h
@@@ -1,6 -1,6 +1,25 @@@
   /*
    * Details of the "wire" protocol between Xen Store Daemon and client
    * library or guest kernel.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (C) 2005 Rusty Russell IBM Corporation
    */
   
@@@ -26,7 -26,7 +45,10 @@@ enum xsd_sockmsg_typ
       XS_SET_PERMS,
       XS_WATCH_EVENT,
       XS_ERROR,
--    XS_IS_DOMAIN_INTRODUCED
++    XS_IS_DOMAIN_INTRODUCED,
++    XS_RESUME,
++    XS_SET_TARGET,
++    XS_RESTRICT
   };
   
   #define XS_WRITE_NONE "NONE"
@@@ -39,8 -39,8 +61,14 @@@ struct xsd_error
       int errnum;
       const char *errstring;
   };
++#ifdef EINVAL
   #define XSD_ERROR(x) { x, #x }
--static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
++/* LINTED: static unused */
++static struct xsd_errors xsd_errors[]
++#if defined(__GNUC__)
++__attribute__((unused))
++#endif
++    = {
       XSD_ERROR(EINVAL),
       XSD_ERROR(EACCES),
       XSD_ERROR(EEXIST),
@@@ -56,6 -56,6 +84,7 @@@
       XSD_ERROR(EAGAIN),
       XSD_ERROR(EISCONN)
   };
++#endif
   
   struct xsd_sockmsg
   {
@@@ -84,4 -84,4 +113,11 @@@ struct xenstore_domain_interface 
       XENSTORE_RING_IDX rsp_cons, rsp_prod;
   };
   
++/* Violating this is very bad.  See docs/misc/xenstore.txt. */
++#define XENSTORE_PAYLOAD_MAX 4096
++
++/* Violating these just gets you an error back */
++#define XENSTORE_ABS_PATH_MAX 3072
++#define XENSTORE_REL_PATH_MAX 2048
++
   #endif /* _XS_WIRE_H */
diff --cc include/xen/interface/kexec.h

index 0000000,0000000..0425222

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/kexec.h
@@@ -1,0 -1,0 +1,168 @@@
++/******************************************************************************
++ * kexec.h - Public portion
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ * 
++ * Xen port written by:
++ * - Simon 'Horms' Horman <horms@verge.net.au>
++ * - Magnus Damm <magnus@valinux.co.jp>
++ */
++
++#ifndef _XEN_PUBLIC_KEXEC_H
++#define _XEN_PUBLIC_KEXEC_H
++
++
++/* This file describes the Kexec / Kdump hypercall interface for Xen.
++ *
++ * Kexec under vanilla Linux allows a user to reboot the physical machine 
++ * into a new user-specified kernel. The Xen port extends this idea
++ * to allow rebooting of the machine from dom0. When kexec for dom0
++ * is used to reboot,  both the hypervisor and the domains get replaced
++ * with some other kernel. It is possible to kexec between vanilla
++ * Linux and Xen and back again. Xen to Xen works well too.
++ *
++ * The hypercall interface for kexec can be divided into three main
++ * types of hypercall operations:
++ *
++ * 1) Range information:
++ *    This is used by the dom0 kernel to ask the hypervisor about various 
++ *    address information. This information is needed to allow kexec-tools 
++ *    to fill in the ELF headers for /proc/vmcore properly.
++ *
++ * 2) Load and unload of images:
++ *    There are no big surprises here, the kexec binary from kexec-tools
++ *    runs in userspace in dom0. The tool loads/unloads data into the
++ *    dom0 kernel such as new kernel, initramfs and hypervisor. When
++ *    loaded the dom0 kernel performs a load hypercall operation, and
++ *    before releasing all page references the dom0 kernel calls unload.
++ *
++ * 3) Kexec operation:
++ *    This is used to start a previously loaded kernel.
++ */
++
++#include "xen.h"
++
++#if defined(__i386__) || defined(__x86_64__)
++#define KEXEC_XEN_NO_PAGES 17
++#endif
++
++/*
++ * Prototype for this hypercall is:
++ *  int kexec_op(int cmd, void *args)
++ * @cmd  == KEXEC_CMD_... 
++ *          KEXEC operation to perform
++ * @args == Operation-specific extra arguments (NULL if none).
++ */
++
++/*
++ * Kexec supports two types of operation:
++ * - kexec into a regular kernel, very similar to a standard reboot
++ *   - KEXEC_TYPE_DEFAULT is used to specify this type
++ * - kexec into a special "crash kernel", aka kexec-on-panic
++ *   - KEXEC_TYPE_CRASH is used to specify this type
++ *   - parts of our system may be broken at kexec-on-panic time
++ *     - the code should be kept as simple and self-contained as possible
++ */
++
++#define KEXEC_TYPE_DEFAULT 0
++#define KEXEC_TYPE_CRASH   1
++
++
++/* The kexec implementation for Xen allows the user to load two
++ * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
++ * All data needed for a kexec reboot is kept in one xen_kexec_image_t
++ * per "instance". The data mainly consists of machine address lists to pages
++ * together with destination addresses. The data in xen_kexec_image_t
++ * is passed to the "code page" which is one page of code that performs
++ * the final relocations before jumping to the new kernel.
++ */
++ 
++typedef struct xen_kexec_image {
++#if defined(__i386__) || defined(__x86_64__)
++    unsigned long page_list[KEXEC_XEN_NO_PAGES];
++#endif
++#if defined(__ia64__)
++    unsigned long reboot_code_buffer;
++#endif
++    unsigned long indirection_page;
++    unsigned long start_address;
++} xen_kexec_image_t;
++
++/*
++ * Perform kexec having previously loaded a kexec or kdump kernel
++ * as appropriate.
++ * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
++ */
++#define KEXEC_CMD_kexec                 0
++typedef struct xen_kexec_exec {
++    int type;
++} xen_kexec_exec_t;
++
++/*
++ * Load/Unload kernel image for kexec or kdump.
++ * type  == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
++ * image == relocation information for kexec (ignored for unload) [in]
++ */
++#define KEXEC_CMD_kexec_load            1
++#define KEXEC_CMD_kexec_unload          2
++typedef struct xen_kexec_load {
++    int type;
++    xen_kexec_image_t image;
++} xen_kexec_load_t;
++
++#define KEXEC_RANGE_MA_CRASH      0 /* machine address and size of crash area */
++#define KEXEC_RANGE_MA_XEN        1 /* machine address and size of Xen itself */
++#define KEXEC_RANGE_MA_CPU        2 /* machine address and size of a CPU note */
++#define KEXEC_RANGE_MA_XENHEAP    3 /* machine address and size of xenheap
++                                     * Note that although this is adjacent
++                                     * to Xen it exists in a separate EFI
++                                     * region on ia64, and thus needs to be
++                                     * inserted into iomem_machine separately */
++#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* machine address and size of
++                                     * the ia64_boot_param */
++#define KEXEC_RANGE_MA_EFI_MEMMAP 5 /* machine address and size of
++                                     * the EFI Memory Map */
++#define KEXEC_RANGE_MA_VMCOREINFO 6 /* machine address and size of vmcoreinfo */
++
++/*
++ * Find the address and size of certain memory areas
++ * range == KEXEC_RANGE_... [in]
++ * nr    == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in]
++ * size  == number of bytes reserved in window [out]
++ * start == address of the first byte in the window [out]
++ */
++#define KEXEC_CMD_kexec_get_range       3
++typedef struct xen_kexec_range {
++    int range;
++    int nr;
++    unsigned long size;
++    unsigned long start;
++} xen_kexec_range_t;
++
++#endif /* _XEN_PUBLIC_KEXEC_H */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/mem_event.h

index 0000000,0000000..93c824b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/mem_event.h
@@@ -1,0 -1,0 +1,86 @@@
++/******************************************************************************
++ * mem_event.h
++ *
++ * Memory event common structures.
++ *
++ * Copyright (c) 2009 by Citrix Systems, Inc. (Patrick Colp)
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef _XEN_PUBLIC_MEM_EVENT_H
++#define _XEN_PUBLIC_MEM_EVENT_H
++
++#include "xen.h"
++#include "io/ring.h"
++
++/* Memory event type */
++#define MEM_EVENT_TYPE_SHARED   0
++#define MEM_EVENT_TYPE_PAGING   1
++#define MEM_EVENT_TYPE_ACCESS   2
++
++/* Memory event flags */
++#define MEM_EVENT_FLAG_VCPU_PAUSED  (1 << 0)
++#define MEM_EVENT_FLAG_DROP_PAGE    (1 << 1)
++
++/* Reasons for the memory event request */
++#define MEM_EVENT_REASON_UNKNOWN     0    /* typical reason */
++#define MEM_EVENT_REASON_VIOLATION   1    /* access violation, GFN is address */
++#define MEM_EVENT_REASON_CR0         2    /* CR0 was hit: gfn is CR0 value */
++#define MEM_EVENT_REASON_CR3         3    /* CR3 was hit: gfn is CR3 value */
++#define MEM_EVENT_REASON_CR4         4    /* CR4 was hit: gfn is CR4 value */
++#define MEM_EVENT_REASON_INT3        5    /* int3 was hit: gla/gfn are RIP */
++
++typedef struct mem_event_shared_page {
++    uint32_t port;
++} mem_event_shared_page_t;
++
++typedef struct mem_event_st {
++    uint16_t type;
++    uint16_t flags;
++    uint32_t vcpu_id;
++
++    uint64_t gfn;
++    uint64_t offset;
++    uint64_t gla; /* if gla_valid */
++
++    uint32_t p2mt;
++
++    uint16_t access_r:1;
++    uint16_t access_w:1;
++    uint16_t access_x:1;
++    uint16_t gla_valid:1;
++    uint16_t available:12;
++
++    uint16_t reason;
++} mem_event_request_t, mem_event_response_t;
++
++DEFINE_RING_TYPES(mem_event, mem_event_request_t, mem_event_response_t);
++
++#endif
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/memory.h

index eac3ce1,eac3ce1..9208576
--- 1/include/xen/interface/memory.h
--- 2/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@@ -3,13 -3,13 +3,31 @@@
    *
    * Memory reservation and information.
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
    */
   
   #ifndef __XEN_PUBLIC_MEMORY_H__
   #define __XEN_PUBLIC_MEMORY_H__
   
--#include <linux/spinlock.h>
++#include "xen.h"
   
   /*
    * Increase or decrease the specified domain's memory reservation. Returns a
@@@ -19,6 -19,6 +37,26 @@@
   #define XENMEM_increase_reservation 0
   #define XENMEM_decrease_reservation 1
   #define XENMEM_populate_physmap     6
++
++#if __XEN_INTERFACE_VERSION__ >= 0x00030209
++/*
++ * Maximum # bits addressable by the user of the allocated region (e.g., I/O
++ * devices often have a 32-bit limitation even in 64-bit systems). If zero
++ * then the user has no addressing restriction. This field is not used by
++ * XENMEM_decrease_reservation.
++ */
++#define XENMEMF_address_bits(x)     (x)
++#define XENMEMF_get_address_bits(x) ((x) & 0xffu)
++/* NUMA node to allocate from. */
++#define XENMEMF_node(x)     (((x) + 1) << 8)
++#define XENMEMF_get_node(x) ((((x) >> 8) - 1) & 0xffu)
++/* Flag to populate physmap with populate-on-demand entries */
++#define XENMEMF_populate_on_demand (1<<16)
++/* Flag to request allocation only from the node specified */
++#define XENMEMF_exact_node_request  (1<<17)
++#define XENMEMF_exact_node(n) (XENMEMF_node(n) | XENMEMF_exact_node_request)
++#endif
++
   struct xen_memory_reservation {
   
       /*
@@@ -31,28 -31,28 +69,28 @@@
        *   OUT: GMFN bases of extents that were allocated
        *   (NB. This command also updates the mach_to_phys translation table)
        */
--    GUEST_HANDLE(ulong) extent_start;
++    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
   
       /* Number of extents, and size/alignment of each (2^extent_order pages). */
--    unsigned long  nr_extents;
++    xen_ulong_t    nr_extents;
       unsigned int   extent_order;
   
--    /*
--     * Maximum # bits addressable by the user of the allocated region (e.g.,
--     * I/O devices often have a 32-bit limitation even in 64-bit systems). If
--     * zero then the user has no addressing restriction.
--     * This field is not used by XENMEM_decrease_reservation.
--     */
++#if __XEN_INTERFACE_VERSION__ >= 0x00030209
++    /* XENMEMF flags. */
++    unsigned int   mem_flags;
++#else
       unsigned int   address_bits;
++#endif
   
       /*
        * Domain whose reservation is being changed.
        * Unprivileged domains can specify only DOMID_SELF.
        */
       domid_t        domid;
--
   };
   DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
++typedef struct xen_memory_reservation xen_memory_reservation_t;
++DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
   
   /*
    * An atomic exchange of memory pages. If return code is zero then
@@@ -92,10 -92,10 +130,12 @@@ struct xen_memory_exchange 
        *     command will be non-zero.
        *  5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!
        */
--    unsigned long nr_exchanged;
++    xen_ulong_t nr_exchanged;
   };
--
   DEFINE_GUEST_HANDLE_STRUCT(xen_memory_exchange);
++typedef struct xen_memory_exchange xen_memory_exchange_t;
++DEFINE_XEN_GUEST_HANDLE(xen_memory_exchange_t);
++
   /*
    * Returns the maximum machine frame number of mapped RAM in this system.
    * This command always succeeds (it never returns an error code).
@@@ -112,6 -112,6 +152,11 @@@
   #define XENMEM_maximum_reservation  4
   
   /*
++ * Returns the maximum GPFN in use by the guest, or -ve errcode on failure.
++ */
++#define XENMEM_maximum_gpfn         14
++
++/*
    * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
    * mapping table. Architectures which do not have a m2p table do not implement
    * this command.
@@@ -130,7 -130,7 +175,7 @@@ struct xen_machphys_mfn_list 
        * any large discontiguities in the machine address space, 2MB gaps in
        * the machphys table will be represented by an MFN base of zero.
        */
--    GUEST_HANDLE(ulong) extent_start;
++    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
   
       /*
        * Number of extents written to the above array. This will be smaller
@@@ -139,6 -139,6 +184,8 @@@
       unsigned int nr_extents;
   };
   DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
++typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
++DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
   
   /*
    * Returns the location in virtual address space of the machine_to_phys
@@@ -148,10 -148,10 +195,12 @@@
    */
   #define XENMEM_machphys_mapping     12
   struct xen_machphys_mapping {
--    unsigned long v_start, v_end; /* Start and end virtual addresses.   */
--    unsigned long max_mfn;        /* Maximum MFN that can be looked up. */
++    xen_ulong_t v_start, v_end; /* Start and end virtual addresses.   */
++    xen_ulong_t max_mfn;        /* Maximum MFN that can be looked up. */
   };
--DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping_t);
++DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping);
++typedef struct xen_machphys_mapping xen_machphys_mapping_t;
++DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t);
   
   /*
    * Sets the GPFN at which a particular page appears in the specified guest's
@@@ -166,38 -166,38 +215,23 @@@ struct xen_add_to_physmap 
       /* Source mapping space. */
   #define XENMAPSPACE_shared_info 0 /* shared info page */
   #define XENMAPSPACE_grant_table 1 /* grant table page */
++#define XENMAPSPACE_gmfn        2 /* GMFN */
       unsigned int space;
   
++#define XENMAPIDX_grant_table_status 0x80000000
++
       /* Index into source mapping space. */
--    unsigned long idx;
++    xen_ulong_t idx;
   
       /* GPFN where the source mapping page should appear. */
--    unsigned long gpfn;
++    xen_pfn_t     gpfn;
   };
   DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
++typedef struct xen_add_to_physmap xen_add_to_physmap_t;
++DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
   
--/*
-- * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
-- * code on failure. This call only works for auto-translated guests.
-- */
--#define XENMEM_translate_gpfn_list  8
--struct xen_translate_gpfn_list {
--    /* Which domain to translate for? */
--    domid_t domid;
--
--    /* Length of list. */
--    unsigned long nr_gpfns;
--
--    /* List of GPFNs to translate. */
--    GUEST_HANDLE(ulong) gpfn_list;
--
--    /*
--     * Output list to contain MFN translations. May be the same as the input
--     * list (in which case each input GPFN is overwritten with the output MFN).
--     */
--    GUEST_HANDLE(ulong) mfn_list;
--};
--DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
++/*** REMOVED ***/
++/*#define XENMEM_translate_gpfn_list  8*/
   
   /*
    * Returns the pseudo-physical memory map as it was when the domain
@@@ -217,9 -217,9 +251,11 @@@ struct xen_memory_map 
        * Entries in the buffer are in the same format as returned by the
        * BIOS INT 0x15 EAX=0xE820 call.
        */
--    GUEST_HANDLE(void) buffer;
++    XEN_GUEST_HANDLE(void) buffer;
   };
   DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
++typedef struct xen_memory_map xen_memory_map_t;
++DEFINE_XEN_GUEST_HANDLE(xen_memory_map_t);
   
   /*
    * Returns the real physical memory map. Passes the same structure as
@@@ -228,10 -228,10 +264,47 @@@
    */
   #define XENMEM_machine_memory_map   10
   
++/*
++ * Set the pseudo-physical memory map of a domain, as returned by
++ * XENMEM_memory_map.
++ * arg == addr of xen_foreign_memory_map_t.
++ */
++#define XENMEM_set_memory_map       13
++struct xen_foreign_memory_map {
++    domid_t domid;
++    struct xen_memory_map map;
++};
++typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
++DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
++
++#define XENMEM_set_pod_target       16
++#define XENMEM_get_pod_target       17
++struct xen_pod_target {
++    /* IN */
++    uint64_t target_pages;
++    /* OUT */
++    uint64_t tot_pages;
++    uint64_t pod_cache_pages;
++    uint64_t pod_entries;
++    /* IN */
++    domid_t domid;
++};
++typedef struct xen_pod_target xen_pod_target_t;
++
++/*
++ * Get the number of MFNs saved through memory sharing.
++ * The call never fails.
++ */
++#define XENMEM_get_sharing_freed_pages    18
++
++#ifndef CONFIG_XEN
++#include <linux/spinlock.h>
   
   /*
    * Prevent the balloon driver from changing the memory reservation
    * during a driver critical region.
    */
   extern spinlock_t xen_reservation_lock;
++#endif
++
   #endif /* __XEN_PUBLIC_MEMORY_H__ */
diff --cc include/xen/interface/nmi.h

index 0000000,0000000..2fd21d2

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/nmi.h
@@@ -1,0 -1,0 +1,80 @@@
++/******************************************************************************
++ * nmi.h
++ * 
++ * NMI callback registration and reason codes.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
++ */
++
++#ifndef __XEN_PUBLIC_NMI_H__
++#define __XEN_PUBLIC_NMI_H__
++
++#include "xen.h"
++
++/*
++ * NMI reason codes:
++ * Currently these are x86-specific, stored in arch_shared_info.nmi_reason.
++ */
++ /* I/O-check error reported via ISA port 0x61, bit 6. */
++#define _XEN_NMIREASON_io_error     0
++#define XEN_NMIREASON_io_error      (1UL << _XEN_NMIREASON_io_error)
++ /* Parity error reported via ISA port 0x61, bit 7. */
++#define _XEN_NMIREASON_parity_error 1
++#define XEN_NMIREASON_parity_error  (1UL << _XEN_NMIREASON_parity_error)
++ /* Unknown hardware-generated NMI. */
++#define _XEN_NMIREASON_unknown      2
++#define XEN_NMIREASON_unknown       (1UL << _XEN_NMIREASON_unknown)
++
++/*
++ * long nmi_op(unsigned int cmd, void *arg)
++ * NB. All ops return zero on success, else a negative error code.
++ */
++
++/*
++ * Register NMI callback for this (calling) VCPU. Currently this only makes
++ * sense for domain 0, vcpu 0. All other callers will receive EINVAL.
++ * arg == pointer to xennmi_callback structure.
++ */
++#define XENNMI_register_callback   0
++struct xennmi_callback {
++    unsigned long handler_address;
++    unsigned long pad;
++};
++typedef struct xennmi_callback xennmi_callback_t;
++DEFINE_XEN_GUEST_HANDLE(xennmi_callback_t);
++
++/*
++ * Deregister NMI callback for this (calling) VCPU.
++ * arg == NULL.
++ */
++#define XENNMI_unregister_callback 1
++
++#endif /* __XEN_PUBLIC_NMI_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/physdev.h

index 534cac8,534cac8..f8b6921
--- 1/include/xen/interface/physdev.h
--- 2/include/xen/interface/physdev.h
+++ b/include/xen/interface/physdev.h
@@@ -21,6 -21,6 +21,8 @@@
   #ifndef __XEN_PUBLIC_PHYSDEV_H__
   #define __XEN_PUBLIC_PHYSDEV_H__
   
++#include "xen.h"
++
   /*
    * Prototype for this hypercall is:
    *  int physdev_op(int cmd, void *args)
@@@ -37,6 -37,6 +39,23 @@@ struct physdev_eoi 
         /* IN */
         uint32_t irq;
   };
++typedef struct physdev_eoi physdev_eoi_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_eoi_t);
++
++/*
++ * Register a shared page for the hypervisor to indicate whether the guest
++ * must issue PHYSDEVOP_eoi. The semantics of PHYSDEVOP_eoi change slightly
++ * once the guest has used this function, in that the associated event channel
++ * will automatically get unmasked. The page registered is used as a bit
++ * array indexed by Xen's PIRQ value.
++ */
++#define PHYSDEVOP_pirq_eoi_gmfn         17
++struct physdev_pirq_eoi_gmfn {
++    /* IN */
++    xen_pfn_t gmfn;
++};
++typedef struct physdev_pirq_eoi_gmfn physdev_pirq_eoi_gmfn_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_pirq_eoi_gmfn_t);
   
   /*
    * Query the status of an IRQ line.
@@@ -49,6 -49,6 +68,8 @@@ struct physdev_irq_status_query 
         /* OUT */
         uint32_t flags; /* XENIRQSTAT_* */
   };
++typedef struct physdev_irq_status_query physdev_irq_status_query_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_irq_status_query_t);
   
   /* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
   #define _XENIRQSTAT_needs_eoi (0)
@@@ -67,6 -67,6 +88,8 @@@ struct physdev_set_iopl 
         /* IN */
         uint32_t iopl;
   };
++typedef struct physdev_set_iopl physdev_set_iopl_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_set_iopl_t);
   
   /*
    * Set the current VCPU's I/O-port permissions bitmap.
@@@ -75,9 -75,9 +98,15 @@@
   #define PHYSDEVOP_set_iobitmap                 7
   struct physdev_set_iobitmap {
         /* IN */
++#if __XEN_INTERFACE_VERSION__ >= 0x00030205
++      XEN_GUEST_HANDLE(uint8) bitmap;
++#else
         uint8_t * bitmap;
++#endif
         uint32_t nr_ports;
   };
++typedef struct physdev_set_iobitmap physdev_set_iobitmap_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_set_iobitmap_t);
   
   /*
    * Read or write an IO-APIC register.
@@@ -92,6 -92,6 +121,8 @@@ struct physdev_apic 
         /* IN or OUT */
         uint32_t value;
   };
++typedef struct physdev_apic physdev_apic_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_apic_t);
   
   /*
    * Allocate or free a physical upcall vector for the specified IRQ line.
@@@ -105,6 -105,6 +136,8 @@@ struct physdev_irq 
         /* IN or OUT */
         uint32_t vector;
   };
++typedef struct physdev_irq physdev_irq_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
   
   #define MAP_PIRQ_TYPE_MSI             0x0
   #define MAP_PIRQ_TYPE_GSI             0x1
@@@ -128,6 -128,6 +161,8 @@@ struct physdev_map_pirq 
       /* IN */
       uint64_t table_base;
   };
++typedef struct physdev_map_pirq physdev_map_pirq_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_map_pirq_t);
   
   #define PHYSDEVOP_unmap_pirq          14
   struct physdev_unmap_pirq {
@@@ -135,6 -135,6 +170,8 @@@
       /* IN */
       int pirq;
   };
++typedef struct physdev_unmap_pirq physdev_unmap_pirq_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_unmap_pirq_t);
   
   #define PHYSDEVOP_manage_pci_add      15
   #define PHYSDEVOP_manage_pci_remove   16
@@@ -143,6 -143,6 +180,17 @@@ struct physdev_manage_pci 
         uint8_t bus;
         uint8_t devfn;
   };
++typedef struct physdev_manage_pci physdev_manage_pci_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_t);
++
++#define PHYSDEVOP_restore_msi            19
++struct physdev_restore_msi {
++    /* IN */
++    uint8_t bus;
++    uint8_t devfn;
++};
++typedef struct physdev_restore_msi physdev_restore_msi_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_restore_msi_t);
   
   #define PHYSDEVOP_manage_pci_add_ext  20
   struct physdev_manage_pci_ext {
@@@ -156,6 -156,6 +204,8 @@@
                 uint8_t devfn;
         } physfn;
   };
++typedef struct physdev_manage_pci_ext physdev_manage_pci_ext_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_ext_t);
   
   /*
    * Argument to physdev_op_compat() hypercall. Superceded by new physdev_op()
@@@ -171,6 -171,6 +221,8 @@@ struct physdev_op 
                 struct physdev_irq                   irq_op;
         } u;
   };
++typedef struct physdev_op physdev_op_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_op_t);
   
   #define PHYSDEVOP_setup_gsi    21
   struct physdev_setup_gsi {
@@@ -181,12 -181,12 +233,10 @@@
       uint8_t polarity;
       /* IN */
   };
++typedef struct physdev_setup_gsi physdev_setup_gsi_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_setup_gsi_t);
   
--#define PHYSDEVOP_get_nr_pirqs    22
--struct physdev_nr_pirqs {
--    /* OUT */
--    uint32_t nr_pirqs;
--};
++/* leave PHYSDEVOP 22 free */
   
   /* type is MAP_PIRQ_TYPE_GSI or MAP_PIRQ_TYPE_MSI
    * the hypercall returns a free pirq */
@@@ -198,6 -198,6 +248,9 @@@ struct physdev_get_free_pirq 
       uint32_t pirq;
   };
   
++typedef struct physdev_get_free_pirq physdev_get_free_pirq_t;
++DEFINE_XEN_GUEST_HANDLE(physdev_get_free_pirq_t);
++
   /*
    * Notify that some PIRQ-bound event channels have been unmasked.
    * ** This command is obsolete since interface version 0x00030202 and is **
diff --cc include/xen/interface/platform.h

index 0000000,0000000..1a519f4

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/platform.h
@@@ -1,0 -1,0 +1,402 @@@
++/******************************************************************************
++ * platform.h
++ * 
++ * Hardware platform operations. Intended for use by domain-0 kernel.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2002-2006, K Fraser
++ */
++
++#ifndef __XEN_PUBLIC_PLATFORM_H__
++#define __XEN_PUBLIC_PLATFORM_H__
++
++#include "xen.h"
++
++#define XENPF_INTERFACE_VERSION 0x03000001
++
++/*
++ * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
++ * 1 January, 1970 if the current system time was <system_time>.
++ */
++#define XENPF_settime             17
++struct xenpf_settime {
++    /* IN variables. */
++    uint32_t secs;
++    uint32_t nsecs;
++    uint64_t system_time;
++};
++typedef struct xenpf_settime xenpf_settime_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t);
++
++/*
++ * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
++ * On x86, @type is an architecture-defined MTRR memory type.
++ * On success, returns the MTRR that was used (@reg) and a handle that can
++ * be passed to XENPF_del_memtype to accurately tear down the new setting.
++ * (x86-specific).
++ */
++#define XENPF_add_memtype         31
++struct xenpf_add_memtype {
++    /* IN variables. */
++    xen_pfn_t mfn;
++    uint64_t nr_mfns;
++    uint32_t type;
++    /* OUT variables. */
++    uint32_t handle;
++    uint32_t reg;
++};
++typedef struct xenpf_add_memtype xenpf_add_memtype_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_add_memtype_t);
++
++/*
++ * Tear down an existing memory-range type. If @handle is remembered then it
++ * should be passed in to accurately tear down the correct setting (in case
++ * of overlapping memory regions with differing types). If it is not known
++ * then @handle should be set to zero. In all cases @reg must be set.
++ * (x86-specific).
++ */
++#define XENPF_del_memtype         32
++struct xenpf_del_memtype {
++    /* IN variables. */
++    uint32_t handle;
++    uint32_t reg;
++};
++typedef struct xenpf_del_memtype xenpf_del_memtype_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_del_memtype_t);
++
++/* Read current type of an MTRR (x86-specific). */
++#define XENPF_read_memtype        33
++struct xenpf_read_memtype {
++    /* IN variables. */
++    uint32_t reg;
++    /* OUT variables. */
++    xen_pfn_t mfn;
++    uint64_t nr_mfns;
++    uint32_t type;
++};
++typedef struct xenpf_read_memtype xenpf_read_memtype_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_read_memtype_t);
++
++#define XENPF_microcode_update    35
++struct xenpf_microcode_update {
++    /* IN variables. */
++    XEN_GUEST_HANDLE(const_void) data;/* Pointer to microcode data */
++    uint32_t length;                  /* Length of microcode data. */
++};
++typedef struct xenpf_microcode_update xenpf_microcode_update_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_microcode_update_t);
++
++#define XENPF_platform_quirk      39
++#define QUIRK_NOIRQBALANCING      1 /* Do not restrict IO-APIC RTE targets */
++#define QUIRK_IOAPIC_BAD_REGSEL   2 /* IO-APIC REGSEL forgets its value    */
++#define QUIRK_IOAPIC_GOOD_REGSEL  3 /* IO-APIC REGSEL behaves properly     */
++struct xenpf_platform_quirk {
++    /* IN variables. */
++    uint32_t quirk_id;
++};
++typedef struct xenpf_platform_quirk xenpf_platform_quirk_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_platform_quirk_t);
++
++#define XENPF_firmware_info       50
++#define XEN_FW_DISK_INFO          1 /* from int 13 AH=08/41/48 */
++#define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */
++#define XEN_FW_VBEDDC_INFO        3 /* from int 10 AX=4f15 */
++struct xenpf_firmware_info {
++    /* IN variables. */
++    uint32_t type;                            /* XEN_FW_??? */
++    uint32_t index;
++    /* OUT variables. */
++    union {
++        struct {
++            /* Int13, Fn41: Check Extensions Present. */
++            uint8_t device;                   /* %dl: bios device number */
++            uint8_t version;                  /* %ah: major version      */
++            uint16_t interface_support;       /* %cx: support bitmap     */
++            /* Int13, Fn08: Legacy Get Device Parameters. */
++            uint16_t legacy_max_cylinder;     /* %cl[7:6]:%ch: max cyl # */
++            uint8_t legacy_max_head;          /* %dh: max head #         */
++            uint8_t legacy_sectors_per_track; /* %cl[5:0]: max sector #  */
++            /* Int13, Fn48: Get Device Parameters (as filled into %ds:%esi). */
++            /* NB. First uint16_t of buffer must be set to buffer size.      */
++            XEN_GUEST_HANDLE(void) edd_params;
++        } disk_info; /* XEN_FW_DISK_INFO */
++        struct {
++            uint8_t device;                   /* bios device number  */
++            uint32_t mbr_signature;           /* offset 0x1b8 in mbr */
++        } disk_mbr_signature; /* XEN_FW_DISK_MBR_SIGNATURE */
++        struct {
++            /* Int10, AX=4F15: Get EDID info. */
++            uint8_t capabilities;
++            uint8_t edid_transfer_time;
++            /* must refer to 128-byte buffer */
++            XEN_GUEST_HANDLE(uint8) edid;
++        } vbeddc_info; /* XEN_FW_VBEDDC_INFO */
++    } u;
++};
++typedef struct xenpf_firmware_info xenpf_firmware_info_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_firmware_info_t);
++
++#define XENPF_enter_acpi_sleep    51
++struct xenpf_enter_acpi_sleep {
++    /* IN variables */
++    uint16_t pm1a_cnt_val;      /* PM1a control value. */
++    uint16_t pm1b_cnt_val;      /* PM1b control value. */
++    uint32_t sleep_state;       /* Which state to enter (Sn). */
++    uint32_t flags;             /* Must be zero. */
++};
++typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_enter_acpi_sleep_t);
++
++#define XENPF_change_freq         52
++struct xenpf_change_freq {
++    /* IN variables */
++    uint32_t flags; /* Must be zero. */
++    uint32_t cpu;   /* Physical cpu. */
++    uint64_t freq;  /* New frequency (Hz). */
++};
++typedef struct xenpf_change_freq xenpf_change_freq_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_change_freq_t);
++
++/*
++ * Get idle times (nanoseconds since boot) for physical CPUs specified in the
++ * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is
++ * indexed by CPU number; only entries with the corresponding @cpumap_bitmap
++ * bit set are written to. On return, @cpumap_bitmap is modified so that any
++ * non-existent CPUs are cleared. Such CPUs have their @idletime array entry
++ * cleared.
++ */
++#define XENPF_getidletime         53
++struct xenpf_getidletime {
++    /* IN/OUT variables */
++    /* IN: CPUs to interrogate; OUT: subset of IN which are present */
++    XEN_GUEST_HANDLE(uint8) cpumap_bitmap;
++    /* IN variables */
++    /* Size of cpumap bitmap. */
++    uint32_t cpumap_nr_cpus;
++    /* Must be indexable for every cpu in cpumap_bitmap. */
++    XEN_GUEST_HANDLE(uint64) idletime;
++    /* OUT variables */
++    /* System time when the idletime snapshots were taken. */
++    uint64_t now;
++};
++typedef struct xenpf_getidletime xenpf_getidletime_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t);
++
++#define XENPF_set_processor_pminfo      54
++
++/* ability bits */
++#define XEN_PROCESSOR_PM_CX   1
++#define XEN_PROCESSOR_PM_PX   2
++#define XEN_PROCESSOR_PM_TX   4
++
++/* cmd type */
++#define XEN_PM_CX   0
++#define XEN_PM_PX   1
++#define XEN_PM_TX   2
++
++/* Px sub info type */
++#define XEN_PX_PCT   1
++#define XEN_PX_PSS   2
++#define XEN_PX_PPC   4
++#define XEN_PX_PSD   8
++
++struct xen_power_register {
++    uint32_t     space_id;
++    uint32_t     bit_width;
++    uint32_t     bit_offset;
++    uint32_t     access_size;
++    uint64_t     address;
++};
++
++struct xen_processor_csd {
++    uint32_t    domain;      /* domain number of one dependent group */
++    uint32_t    coord_type;  /* coordination type */
++    uint32_t    num;         /* number of processors in same domain */
++};
++typedef struct xen_processor_csd xen_processor_csd_t;
++DEFINE_XEN_GUEST_HANDLE(xen_processor_csd_t);
++
++struct xen_processor_cx {
++    struct xen_power_register  reg; /* GAS for Cx trigger register */
++    uint8_t     type;     /* cstate value, c0: 0, c1: 1, ... */
++    uint32_t    latency;  /* worst latency (us) to enter/exit this cstate */
++    uint32_t    power;    /* average power consumption(mW) */
++    uint32_t    dpcnt;    /* number of dependency entries */
++    XEN_GUEST_HANDLE(xen_processor_csd_t) dp; /* NULL if no dependency */
++};
++typedef struct xen_processor_cx xen_processor_cx_t;
++DEFINE_XEN_GUEST_HANDLE(xen_processor_cx_t);
++
++struct xen_processor_flags {
++    uint32_t bm_control:1;
++    uint32_t bm_check:1;
++    uint32_t has_cst:1;
++    uint32_t power_setup_done:1;
++    uint32_t bm_rld_set:1;
++};
++
++struct xen_processor_power {
++    uint32_t count;  /* number of C state entries in array below */
++    struct xen_processor_flags flags;  /* global flags of this processor */
++    XEN_GUEST_HANDLE(xen_processor_cx_t) states; /* supported c states */
++};
++
++struct xen_pct_register {
++    uint8_t  descriptor;
++    uint16_t length;
++    uint8_t  space_id;
++    uint8_t  bit_width;
++    uint8_t  bit_offset;
++    uint8_t  reserved;
++    uint64_t address;
++};
++
++struct xen_processor_px {
++    uint64_t core_frequency; /* megahertz */
++    uint64_t power;      /* milliWatts */
++    uint64_t transition_latency; /* microseconds */
++    uint64_t bus_master_latency; /* microseconds */
++    uint64_t control;        /* control value */
++    uint64_t status;     /* success indicator */
++};
++typedef struct xen_processor_px xen_processor_px_t;
++DEFINE_XEN_GUEST_HANDLE(xen_processor_px_t);
++
++struct xen_psd_package {
++    uint64_t num_entries;
++    uint64_t revision;
++    uint64_t domain;
++    uint64_t coord_type;
++    uint64_t num_processors;
++};
++
++struct xen_processor_performance {
++    uint32_t flags;     /* flag for Px sub info type */
++    uint32_t platform_limit;  /* Platform limitation on freq usage */
++    struct xen_pct_register control_register;
++    struct xen_pct_register status_register;
++    uint32_t state_count;     /* total available performance states */
++    XEN_GUEST_HANDLE(xen_processor_px_t) states;
++    struct xen_psd_package domain_info;
++    uint32_t shared_type;     /* coordination type of this processor */
++};
++typedef struct xen_processor_performance xen_processor_performance_t;
++DEFINE_XEN_GUEST_HANDLE(xen_processor_performance_t);
++
++struct xenpf_set_processor_pminfo {
++    /* IN variables */
++    uint32_t id;    /* ACPI CPU ID */
++    uint32_t type;  /* {XEN_PM_CX, XEN_PM_PX} */
++    union {
++        struct xen_processor_power          power;/* Cx: _CST/_CSD */
++        struct xen_processor_performance    perf; /* Px: _PPC/_PCT/_PSS/_PSD */
++    } u;
++};
++typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_set_processor_pminfo_t);
++
++#define XENPF_get_cpuinfo 55
++struct xenpf_pcpuinfo {
++    /* IN */
++    uint32_t xen_cpuid;
++    /* OUT */
++    /* The maximum cpu_id that is present */
++    uint32_t max_present;
++#define XEN_PCPU_FLAGS_ONLINE   1
++    /* Corresponding xen_cpuid is not present */
++#define XEN_PCPU_FLAGS_INVALID  2
++    uint32_t flags;
++    uint32_t apic_id;
++    uint32_t acpi_id;
++};
++typedef struct xenpf_pcpuinfo xenpf_pcpuinfo_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_pcpuinfo_t);
++
++#define XENPF_cpu_online    56
++#define XENPF_cpu_offline   57
++struct xenpf_cpu_ol
++{
++    uint32_t cpuid;
++};
++typedef struct xenpf_cpu_ol xenpf_cpu_ol_t;
++DEFINE_XEN_GUEST_HANDLE(xenpf_cpu_ol_t);
++
++#define XENPF_cpu_hotadd    58
++struct xenpf_cpu_hotadd
++{
++      uint32_t apic_id;
++      uint32_t acpi_id;
++      uint32_t pxm;
++};
++
++#define XENPF_mem_hotadd    59
++struct xenpf_mem_hotadd
++{
++    uint64_t spfn;
++    uint64_t epfn;
++    uint32_t pxm;
++    uint32_t flags;
++};
++
++#define XENPF_get_cpu_freq        ('N' << 24)
++struct xenpf_get_cpu_freq {
++    /* IN variables */
++    uint32_t vcpu;
++    /* OUT variables */
++    uint32_t freq; /* in kHz */
++};
++
++struct xen_platform_op {
++    uint32_t cmd;
++    uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
++    union {
++        struct xenpf_settime           settime;
++        struct xenpf_add_memtype       add_memtype;
++        struct xenpf_del_memtype       del_memtype;
++        struct xenpf_read_memtype      read_memtype;
++        struct xenpf_microcode_update  microcode;
++        struct xenpf_platform_quirk    platform_quirk;
++        struct xenpf_firmware_info     firmware_info;
++        struct xenpf_enter_acpi_sleep  enter_acpi_sleep;
++        struct xenpf_change_freq       change_freq;
++        struct xenpf_getidletime       getidletime;
++        struct xenpf_set_processor_pminfo set_pminfo;
++        struct xenpf_pcpuinfo          pcpu_info;
++        struct xenpf_cpu_ol            cpu_ol;
++        struct xenpf_cpu_hotadd        cpu_add;
++        struct xenpf_mem_hotadd        mem_add;
++        struct xenpf_get_cpu_freq      get_cpu_freq;
++        uint8_t                        pad[128];
++    } u;
++};
++typedef struct xen_platform_op xen_platform_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_platform_op_t);
++
++#endif /* __XEN_PUBLIC_PLATFORM_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/sched.h

index dd55dac,dd55dac..41ec624
--- 1/include/xen/interface/sched.h
--- 2/include/xen/interface/sched.h
+++ b/include/xen/interface/sched.h
@@@ -3,6 -3,6 +3,24 @@@
    *
    * Scheduler state interactions
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
    */
   
@@@ -13,17 -13,17 +31,17 @@@
   
   /*
    * The prototype for this hypercall is:
-- *  long sched_op_new(int cmd, void *arg)
++ *  long sched_op(int cmd, void *arg)
    * @cmd == SCHEDOP_??? (scheduler operation).
    * @arg == Operation-specific extra argument(s), as described below.
    *
-- * **NOTE**:
-- * Versions of Xen prior to 3.0.2 provide only the following legacy version
++ * Versions of Xen prior to 3.0.2 provided only the following legacy version
    * of this hypercall, supporting only the commands yield, block and shutdown:
    *  long sched_op(int cmd, unsigned long arg)
    * @cmd == SCHEDOP_??? (scheduler operation).
    * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
    *      == SHUTDOWN_* code (SCHEDOP_shutdown)
++ * This legacy version is available to new guests as sched_op_compat().
    */
   
   /*
@@@ -50,6 -50,6 +68,8 @@@ struct sched_shutdown 
       unsigned int reason; /* SHUTDOWN_* */
   };
   DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
++typedef struct sched_shutdown sched_shutdown_t;
++DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t);
   
   /*
    * Poll a set of event-channel ports. Return when one or more are pending. An
@@@ -58,11 -58,11 +78,13 @@@
    */
   #define SCHEDOP_poll        3
   struct sched_poll {
--    GUEST_HANDLE(evtchn_port_t) ports;
++    XEN_GUEST_HANDLE(evtchn_port_t) ports;
       unsigned int nr_ports;
       uint64_t timeout;
   };
   DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
++typedef struct sched_poll sched_poll_t;
++DEFINE_XEN_GUEST_HANDLE(sched_poll_t);
   
   /*
    * Declare a shutdown for another domain. The main use of this function is
@@@ -75,6 -75,6 +97,8 @@@ struct sched_remote_shutdown 
       domid_t domain_id;         /* Remote domain ID */
       unsigned int reason;       /* SHUTDOWN_xxx reason */
   };
++typedef struct sched_remote_shutdown sched_remote_shutdown_t;
++DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t);
   
   /*
    * Latch a shutdown code, so that when the domain later shuts down it
@@@ -96,6 -96,6 +120,8 @@@ struct sched_watchdog 
       uint32_t id;                /* watchdog ID */
       uint32_t timeout;           /* timeout */
   };
++typedef struct sched_watchdog sched_watchdog_t;
++DEFINE_XEN_GUEST_HANDLE(sched_watchdog_t);
   
   /*
    * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
diff --cc include/xen/interface/sysctl.h

index 0000000,0000000..c10a85d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/sysctl.h
@@@ -1,0 -1,0 +1,637 @@@
++/******************************************************************************
++ * sysctl.h
++ * 
++ * System management operations. For use by node control stack.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2002-2006, K Fraser
++ */
++
++#ifndef __XEN_PUBLIC_SYSCTL_H__
++#define __XEN_PUBLIC_SYSCTL_H__
++
++#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
++#error "sysctl operations are intended for use by node control tools only"
++#endif
++
++#include "xen.h"
++#include "domctl.h"
++
++#define XEN_SYSCTL_INTERFACE_VERSION 0x00000008
++
++/*
++ * Read console content from Xen buffer ring.
++ */
++/* XEN_SYSCTL_readconsole */
++struct xen_sysctl_readconsole {
++    /* IN: Non-zero -> clear after reading. */
++    uint8_t clear;
++    /* IN: Non-zero -> start index specified by @index field. */
++    uint8_t incremental;
++    uint8_t pad0, pad1;
++    /*
++     * IN:  Start index for consuming from ring buffer (if @incremental);
++     * OUT: End index after consuming from ring buffer.
++     */
++    uint32_t index; 
++    /* IN: Virtual address to write console data. */
++    XEN_GUEST_HANDLE_64(char) buffer;
++    /* IN: Size of buffer; OUT: Bytes written to buffer. */
++    uint32_t count;
++};
++typedef struct xen_sysctl_readconsole xen_sysctl_readconsole_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_readconsole_t);
++
++/* Get trace buffers machine base address */
++/* XEN_SYSCTL_tbuf_op */
++struct xen_sysctl_tbuf_op {
++    /* IN variables */
++#define XEN_SYSCTL_TBUFOP_get_info     0
++#define XEN_SYSCTL_TBUFOP_set_cpu_mask 1
++#define XEN_SYSCTL_TBUFOP_set_evt_mask 2
++#define XEN_SYSCTL_TBUFOP_set_size     3
++#define XEN_SYSCTL_TBUFOP_enable       4
++#define XEN_SYSCTL_TBUFOP_disable      5
++    uint32_t cmd;
++    /* IN/OUT variables */
++    struct xenctl_cpumap cpu_mask;
++    uint32_t             evt_mask;
++    /* OUT variables */
++    uint64_aligned_t buffer_mfn;
++    uint32_t size;  /* Also an IN variable! */
++};
++typedef struct xen_sysctl_tbuf_op xen_sysctl_tbuf_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_op_t);
++
++/*
++ * Get physical information about the host machine
++ */
++/* XEN_SYSCTL_physinfo */
++ /* (x86) The platform supports HVM guests. */
++#define _XEN_SYSCTL_PHYSCAP_hvm          0
++#define XEN_SYSCTL_PHYSCAP_hvm           (1u<<_XEN_SYSCTL_PHYSCAP_hvm)
++ /* (x86) The platform supports HVM-guest direct access to I/O devices. */
++#define _XEN_SYSCTL_PHYSCAP_hvm_directio 1
++#define XEN_SYSCTL_PHYSCAP_hvm_directio  (1u<<_XEN_SYSCTL_PHYSCAP_hvm_directio)
++struct xen_sysctl_physinfo {
++    uint32_t threads_per_core;
++    uint32_t cores_per_socket;
++    uint32_t nr_cpus;     /* # CPUs currently online */
++    uint32_t max_cpu_id;  /* Largest possible CPU ID on this host */
++    uint32_t nr_nodes;    /* # nodes currently online */
++    uint32_t max_node_id; /* Largest possible node ID on this host */
++    uint32_t cpu_khz;
++    uint64_aligned_t total_pages;
++    uint64_aligned_t free_pages;
++    uint64_aligned_t scrub_pages;
++    uint32_t hw_cap[8];
++
++    /* XEN_SYSCTL_PHYSCAP_??? */
++    uint32_t capabilities;
++};
++typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t);
++
++/*
++ * Get the ID of the current scheduler.
++ */
++/* XEN_SYSCTL_sched_id */
++struct xen_sysctl_sched_id {
++    /* OUT variable */
++    uint32_t sched_id;
++};
++typedef struct xen_sysctl_sched_id xen_sysctl_sched_id_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_sched_id_t);
++
++/* Interface for controlling Xen software performance counters. */
++/* XEN_SYSCTL_perfc_op */
++/* Sub-operations: */
++#define XEN_SYSCTL_PERFCOP_reset 1   /* Reset all counters to zero. */
++#define XEN_SYSCTL_PERFCOP_query 2   /* Get perfctr information. */
++struct xen_sysctl_perfc_desc {
++    char         name[80];             /* name of perf counter */
++    uint32_t     nr_vals;              /* number of values for this counter */
++};
++typedef struct xen_sysctl_perfc_desc xen_sysctl_perfc_desc_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t);
++typedef uint32_t xen_sysctl_perfc_val_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t);
++
++struct xen_sysctl_perfc_op {
++    /* IN variables. */
++    uint32_t       cmd;                /*  XEN_SYSCTL_PERFCOP_??? */
++    /* OUT variables. */
++    uint32_t       nr_counters;       /*  number of counters description  */
++    uint32_t       nr_vals;           /*  number of values  */
++    /* counter information (or NULL) */
++    XEN_GUEST_HANDLE_64(xen_sysctl_perfc_desc_t) desc;
++    /* counter values (or NULL) */
++    XEN_GUEST_HANDLE_64(xen_sysctl_perfc_val_t) val;
++};
++typedef struct xen_sysctl_perfc_op xen_sysctl_perfc_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_op_t);
++
++/* XEN_SYSCTL_getdomaininfolist */
++struct xen_sysctl_getdomaininfolist {
++    /* IN variables. */
++    domid_t               first_domain;
++    uint32_t              max_domains;
++    XEN_GUEST_HANDLE_64(xen_domctl_getdomaininfo_t) buffer;
++    /* OUT variables. */
++    uint32_t              num_domains;
++};
++typedef struct xen_sysctl_getdomaininfolist xen_sysctl_getdomaininfolist_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getdomaininfolist_t);
++
++/* Inject debug keys into Xen. */
++/* XEN_SYSCTL_debug_keys */
++struct xen_sysctl_debug_keys {
++    /* IN variables. */
++    XEN_GUEST_HANDLE_64(char) keys;
++    uint32_t nr_keys;
++};
++typedef struct xen_sysctl_debug_keys xen_sysctl_debug_keys_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_debug_keys_t);
++
++/* Get physical CPU information. */
++/* XEN_SYSCTL_getcpuinfo */
++struct xen_sysctl_cpuinfo {
++    uint64_aligned_t idletime;
++};
++typedef struct xen_sysctl_cpuinfo xen_sysctl_cpuinfo_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpuinfo_t); 
++struct xen_sysctl_getcpuinfo {
++    /* IN variables. */
++    uint32_t max_cpus;
++    XEN_GUEST_HANDLE_64(xen_sysctl_cpuinfo_t) info;
++    /* OUT variables. */
++    uint32_t nr_cpus;
++}; 
++typedef struct xen_sysctl_getcpuinfo xen_sysctl_getcpuinfo_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getcpuinfo_t); 
++
++/* XEN_SYSCTL_availheap */
++struct xen_sysctl_availheap {
++    /* IN variables. */
++    uint32_t min_bitwidth;  /* Smallest address width (zero if don't care). */
++    uint32_t max_bitwidth;  /* Largest address width (zero if don't care). */
++    int32_t  node;          /* NUMA node of interest (-1 for all nodes). */
++    /* OUT variables. */
++    uint64_aligned_t avail_bytes;/* Bytes available in the specified region. */
++};
++typedef struct xen_sysctl_availheap xen_sysctl_availheap_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_availheap_t);
++
++/* XEN_SYSCTL_get_pmstat */
++struct pm_px_val {
++    uint64_aligned_t freq;        /* Px core frequency */
++    uint64_aligned_t residency;   /* Px residency time */
++    uint64_aligned_t count;       /* Px transition count */
++};
++typedef struct pm_px_val pm_px_val_t;
++DEFINE_XEN_GUEST_HANDLE(pm_px_val_t);
++
++struct pm_px_stat {
++    uint8_t total;        /* total Px states */
++    uint8_t usable;       /* usable Px states */
++    uint8_t last;         /* last Px state */
++    uint8_t cur;          /* current Px state */
++    XEN_GUEST_HANDLE_64(uint64) trans_pt;   /* Px transition table */
++    XEN_GUEST_HANDLE_64(pm_px_val_t) pt;
++};
++typedef struct pm_px_stat pm_px_stat_t;
++DEFINE_XEN_GUEST_HANDLE(pm_px_stat_t);
++
++struct pm_cx_stat {
++    uint32_t nr;    /* entry nr in triggers & residencies, including C0 */
++    uint32_t last;  /* last Cx state */
++    uint64_aligned_t idle_time;                 /* idle time from boot */
++    XEN_GUEST_HANDLE_64(uint64) triggers;    /* Cx trigger counts */
++    XEN_GUEST_HANDLE_64(uint64) residencies; /* Cx residencies */
++    uint64_aligned_t pc3;
++    uint64_aligned_t pc6;
++    uint64_aligned_t pc7;
++    uint64_aligned_t cc3;
++    uint64_aligned_t cc6;
++};
++
++struct xen_sysctl_get_pmstat {
++#define PMSTAT_CATEGORY_MASK 0xf0
++#define PMSTAT_PX            0x10
++#define PMSTAT_CX            0x20
++#define PMSTAT_get_max_px    (PMSTAT_PX | 0x1)
++#define PMSTAT_get_pxstat    (PMSTAT_PX | 0x2)
++#define PMSTAT_reset_pxstat  (PMSTAT_PX | 0x3)
++#define PMSTAT_get_max_cx    (PMSTAT_CX | 0x1)
++#define PMSTAT_get_cxstat    (PMSTAT_CX | 0x2)
++#define PMSTAT_reset_cxstat  (PMSTAT_CX | 0x3)
++    uint32_t type;
++    uint32_t cpuid;
++    union {
++        struct pm_px_stat getpx;
++        struct pm_cx_stat getcx;
++        /* other struct for tx, etc */
++    } u;
++};
++typedef struct xen_sysctl_get_pmstat xen_sysctl_get_pmstat_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_get_pmstat_t);
++
++/* XEN_SYSCTL_cpu_hotplug */
++struct xen_sysctl_cpu_hotplug {
++    /* IN variables */
++    uint32_t cpu;   /* Physical cpu. */
++#define XEN_SYSCTL_CPU_HOTPLUG_ONLINE  0
++#define XEN_SYSCTL_CPU_HOTPLUG_OFFLINE 1
++    uint32_t op;    /* hotplug opcode */
++};
++typedef struct xen_sysctl_cpu_hotplug xen_sysctl_cpu_hotplug_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpu_hotplug_t);
++
++/*
++ * Get/set Xen power management settings, including:
++ * 1. cpufreq governors and related parameters
++ */
++/* XEN_SYSCTL_pm_op */
++struct xen_userspace {
++    uint32_t scaling_setspeed;
++};
++typedef struct xen_userspace xen_userspace_t;
++
++struct xen_ondemand {
++    uint32_t sampling_rate_max;
++    uint32_t sampling_rate_min;
++
++    uint32_t sampling_rate;
++    uint32_t up_threshold;
++};
++typedef struct xen_ondemand xen_ondemand_t;
++
++/* 
++ * The cpufreq parameter names in this structure are the
++ * same as the corresponding sysfs file names in native Linux.
++ */
++#define CPUFREQ_NAME_LEN 16
++struct xen_get_cpufreq_para {
++    /* IN/OUT variable */
++    uint32_t cpu_num;
++    uint32_t freq_num;
++    uint32_t gov_num;
++
++    /* for all governors */
++    /* OUT variable */
++    XEN_GUEST_HANDLE_64(uint32) affected_cpus;
++    XEN_GUEST_HANDLE_64(uint32) scaling_available_frequencies;
++    XEN_GUEST_HANDLE_64(char)   scaling_available_governors;
++    char scaling_driver[CPUFREQ_NAME_LEN];
++
++    uint32_t cpuinfo_cur_freq;
++    uint32_t cpuinfo_max_freq;
++    uint32_t cpuinfo_min_freq;
++    uint32_t scaling_cur_freq;
++
++    char scaling_governor[CPUFREQ_NAME_LEN];
++    uint32_t scaling_max_freq;
++    uint32_t scaling_min_freq;
++
++    /* for specific governor */
++    union {
++        struct  xen_userspace userspace;
++        struct  xen_ondemand ondemand;
++    } u;
++
++    int32_t turbo_enabled;
++};
++
++struct xen_set_cpufreq_gov {
++    char scaling_governor[CPUFREQ_NAME_LEN];
++};
++
++struct xen_set_cpufreq_para {
++    #define SCALING_MAX_FREQ           1
++    #define SCALING_MIN_FREQ           2
++    #define SCALING_SETSPEED           3
++    #define SAMPLING_RATE              4
++    #define UP_THRESHOLD               5
++
++    uint32_t ctrl_type;
++    uint32_t ctrl_value;
++};
++
++struct xen_sysctl_pm_op {
++    #define PM_PARA_CATEGORY_MASK      0xf0
++    #define CPUFREQ_PARA               0x10
++
++    /* cpufreq command type */
++    #define GET_CPUFREQ_PARA           (CPUFREQ_PARA | 0x01)
++    #define SET_CPUFREQ_GOV            (CPUFREQ_PARA | 0x02)
++    #define SET_CPUFREQ_PARA           (CPUFREQ_PARA | 0x03)
++    #define GET_CPUFREQ_AVGFREQ        (CPUFREQ_PARA | 0x04)
++
++    /* set/reset scheduler power saving option */
++    #define XEN_SYSCTL_pm_op_set_sched_opt_smt    0x21
++
++    /* cpuidle max_cstate access command */
++    #define XEN_SYSCTL_pm_op_get_max_cstate       0x22
++    #define XEN_SYSCTL_pm_op_set_max_cstate       0x23
++
++    /* set scheduler migration cost value */
++    #define XEN_SYSCTL_pm_op_set_vcpu_migration_delay   0x24
++    #define XEN_SYSCTL_pm_op_get_vcpu_migration_delay   0x25
++
++    /* enable/disable turbo mode when in dbs governor */
++    #define XEN_SYSCTL_pm_op_enable_turbo               0x26
++    #define XEN_SYSCTL_pm_op_disable_turbo              0x27
++
++    uint32_t cmd;
++    uint32_t cpuid;
++    union {
++        struct xen_get_cpufreq_para get_para;
++        struct xen_set_cpufreq_gov  set_gov;
++        struct xen_set_cpufreq_para set_para;
++        uint64_aligned_t get_avgfreq;
++        uint32_t                    set_sched_opt_smt;
++        uint32_t                    get_max_cstate;
++        uint32_t                    set_max_cstate;
++        uint32_t                    get_vcpu_migration_delay;
++        uint32_t                    set_vcpu_migration_delay;
++    } u;
++};
++
++/* XEN_SYSCTL_page_offline_op */
++struct xen_sysctl_page_offline_op {
++    /* IN: range of page to be offlined */
++#define sysctl_page_offline     1
++#define sysctl_page_online      2
++#define sysctl_query_page_offline  3
++    uint32_t cmd;
++    uint32_t start;
++    uint32_t end;
++    /* OUT: result of page offline request */
++    /*
++     * bit 0~15: result flags
++     * bit 16~31: owner
++     */
++    XEN_GUEST_HANDLE(uint32) status;
++};
++
++#define PG_OFFLINE_STATUS_MASK    (0xFFUL)
++
++/* The result is invalid, i.e. HV does not handle it */
++#define PG_OFFLINE_INVALID   (0x1UL << 0)
++
++#define PG_OFFLINE_OFFLINED  (0x1UL << 1)
++#define PG_OFFLINE_PENDING   (0x1UL << 2)
++#define PG_OFFLINE_FAILED    (0x1UL << 3)
++
++#define PG_ONLINE_FAILED     PG_OFFLINE_FAILED
++#define PG_ONLINE_ONLINED    PG_OFFLINE_OFFLINED
++
++#define PG_OFFLINE_STATUS_OFFLINED              (0x1UL << 1)
++#define PG_OFFLINE_STATUS_ONLINE                (0x1UL << 2)
++#define PG_OFFLINE_STATUS_OFFLINE_PENDING       (0x1UL << 3)
++#define PG_OFFLINE_STATUS_BROKEN                (0x1UL << 4)
++
++#define PG_OFFLINE_MISC_MASK    (0xFFUL << 4)
++
++/* valid when PG_OFFLINE_FAILED or PG_OFFLINE_PENDING */
++#define PG_OFFLINE_XENPAGE   (0x1UL << 8)
++#define PG_OFFLINE_DOM0PAGE  (0x1UL << 9)
++#define PG_OFFLINE_ANONYMOUS (0x1UL << 10)
++#define PG_OFFLINE_NOT_CONV_RAM   (0x1UL << 11)
++#define PG_OFFLINE_OWNED     (0x1UL << 12)
++
++#define PG_OFFLINE_BROKEN    (0x1UL << 13)
++#define PG_ONLINE_BROKEN     PG_OFFLINE_BROKEN
++
++#define PG_OFFLINE_OWNER_SHIFT 16
++
++/* XEN_SYSCTL_lockprof_op */
++/* Sub-operations: */
++#define XEN_SYSCTL_LOCKPROF_reset 1   /* Reset all profile data to zero. */
++#define XEN_SYSCTL_LOCKPROF_query 2   /* Get lock profile information. */
++/* Record-type: */
++#define LOCKPROF_TYPE_GLOBAL      0   /* global lock, idx meaningless */
++#define LOCKPROF_TYPE_PERDOM      1   /* per-domain lock, idx is domid */
++#define LOCKPROF_TYPE_N           2   /* number of types */
++struct xen_sysctl_lockprof_data {
++    char     name[40];     /* lock name (may include up to 2 %d specifiers) */
++    int32_t  type;         /* LOCKPROF_TYPE_??? */
++    int32_t  idx;          /* index (e.g. domain id) */
++    uint64_aligned_t lock_cnt;     /* # of locking succeeded */
++    uint64_aligned_t block_cnt;    /* # of wait for lock */
++    uint64_aligned_t lock_time;    /* nsecs lock held */
++    uint64_aligned_t block_time;   /* nsecs waited for lock */
++};
++typedef struct xen_sysctl_lockprof_data xen_sysctl_lockprof_data_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_data_t);
++struct xen_sysctl_lockprof_op {
++    /* IN variables. */
++    uint32_t       cmd;               /* XEN_SYSCTL_LOCKPROF_??? */
++    uint32_t       max_elem;          /* size of output buffer */
++    /* OUT variables (query only). */
++    uint32_t       nr_elem;           /* number of elements available */
++    uint64_aligned_t time;            /* nsecs of profile measurement */
++    /* profile information (or NULL) */
++    XEN_GUEST_HANDLE_64(xen_sysctl_lockprof_data_t) data;
++};
++typedef struct xen_sysctl_lockprof_op xen_sysctl_lockprof_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_op_t);
++
++/* XEN_SYSCTL_topologyinfo */
++#define INVALID_TOPOLOGY_ID  (~0U)
++struct xen_sysctl_topologyinfo {
++    /*
++     * IN: maximum addressable entry in the caller-provided arrays.
++     * OUT: largest cpu identifier in the system.
++     * If OUT is greater than IN then the arrays are truncated!
++     * If OUT is less than IN then the array tails are not written by sysctl.
++     */
++    uint32_t max_cpu_index;
++
++    /*
++     * If not NULL, these arrays are filled with core/socket/node identifier
++     * for each cpu.
++     * If a cpu has no core/socket/node information (e.g., cpu not present) 
++     * then the sentinel value ~0u is written to each array.
++     * The number of array elements written by the sysctl is:
++     *   min(@max_cpu_index_IN,@max_cpu_index_OUT)+1
++     */
++    XEN_GUEST_HANDLE_64(uint32) cpu_to_core;
++    XEN_GUEST_HANDLE_64(uint32) cpu_to_socket;
++    XEN_GUEST_HANDLE_64(uint32) cpu_to_node;
++};
++typedef struct xen_sysctl_topologyinfo xen_sysctl_topologyinfo_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_topologyinfo_t);
++
++/* XEN_SYSCTL_numainfo */
++struct xen_sysctl_numainfo {
++    /*
++     * IN: maximum addressable entry in the caller-provided arrays.
++     * OUT: largest node identifier in the system.
++     * If OUT is greater than IN then the arrays are truncated!
++     */
++    uint32_t max_node_index;
++
++    /* NB. Entries are 0 if node is not present. */
++    XEN_GUEST_HANDLE_64(uint64) node_to_memsize;
++    XEN_GUEST_HANDLE_64(uint64) node_to_memfree;
++
++    /*
++     * Array, of size (max_node_index+1)^2, listing memory access distances
++     * between nodes. If an entry has no node distance information (e.g., node 
++     * not present) then the value ~0u is written.
++     * 
++     * Note that the array rows must be indexed by multiplying by the minimum 
++     * of the caller-provided max_node_index and the returned value of
++     * max_node_index. That is, if the largest node index in the system is
++     * smaller than the caller can handle, a smaller 2-d array is constructed
++     * within the space provided by the caller. When this occurs, trailing
++     * space provided by the caller is not modified. If the largest node index
++     * in the system is larger than the caller can handle, then a 2-d array of
++     * the maximum size handleable by the caller is constructed.
++     */
++    XEN_GUEST_HANDLE_64(uint32) node_to_node_distance;
++};
++typedef struct xen_sysctl_numainfo xen_sysctl_numainfo_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_numainfo_t);
++
++/* XEN_SYSCTL_cpupool_op */
++#define XEN_SYSCTL_CPUPOOL_OP_CREATE                1  /* C */
++#define XEN_SYSCTL_CPUPOOL_OP_DESTROY               2  /* D */
++#define XEN_SYSCTL_CPUPOOL_OP_INFO                  3  /* I */
++#define XEN_SYSCTL_CPUPOOL_OP_ADDCPU                4  /* A */
++#define XEN_SYSCTL_CPUPOOL_OP_RMCPU                 5  /* R */
++#define XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN            6  /* M */
++#define XEN_SYSCTL_CPUPOOL_OP_FREEINFO              7  /* F */
++#define XEN_SYSCTL_CPUPOOL_PAR_ANY     0xFFFFFFFF
++struct xen_sysctl_cpupool_op {
++    uint32_t op;          /* IN */
++    uint32_t cpupool_id;  /* IN: CDIARM OUT: CI */
++    uint32_t sched_id;    /* IN: C      OUT: I  */
++    uint32_t domid;       /* IN: M              */
++    uint32_t cpu;         /* IN: AR             */
++    uint32_t n_dom;       /*            OUT: I  */
++    struct xenctl_cpumap cpumap; /*     OUT: IF */
++};
++typedef struct xen_sysctl_cpupool_op xen_sysctl_cpupool_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpupool_op_t);
++
++#define ARINC653_MAX_DOMAINS_PER_SCHEDULE   64
++/*
++ * This structure is used to pass a new ARINC653 schedule from a
++ * privileged domain (ie dom0) to Xen.
++ */
++struct xen_sysctl_arinc653_schedule {
++    /* major_frame holds the time for the new schedule's major frame
++     * in nanoseconds. */
++    uint64_aligned_t     major_frame;
++    /* num_sched_entries holds how many of the entries in the
++     * sched_entries[] array are valid. */
++    uint8_t     num_sched_entries;
++    /* The sched_entries array holds the actual schedule entries. */
++    struct {
++        /* dom_handle must match a domain's UUID */
++        xen_domain_handle_t dom_handle;
++        /* If a domain has multiple VCPUs, vcpu_id specifies which one
++         * this schedule entry applies to. It should be set to 0 if
++         * there is only one VCPU for the domain. */
++        unsigned int vcpu_id;
++        /* runtime specifies the amount of time that should be allocated
++         * to this VCPU per major frame. It is specified in nanoseconds */
++        uint64_aligned_t runtime;
++    } sched_entries[ARINC653_MAX_DOMAINS_PER_SCHEDULE];
++};
++typedef struct xen_sysctl_arinc653_schedule xen_sysctl_arinc653_schedule_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_arinc653_schedule_t);
++
++/* XEN_SYSCTL_scheduler_op */
++/* Set or get info? */
++#define XEN_SYSCTL_SCHEDOP_putinfo 0
++#define XEN_SYSCTL_SCHEDOP_getinfo 1
++struct xen_sysctl_scheduler_op {
++    uint32_t cpupool_id; /* Cpupool whose scheduler is to be targeted. */
++    uint32_t sched_id;   /* XEN_SCHEDULER_* (domctl.h) */
++    uint32_t cmd;        /* XEN_SYSCTL_SCHEDOP_* */
++    union {
++        struct xen_sysctl_sched_arinc653 {
++            XEN_GUEST_HANDLE_64(xen_sysctl_arinc653_schedule_t) schedule;
++        } sched_arinc653;
++    } u;
++};
++typedef struct xen_sysctl_scheduler_op xen_sysctl_scheduler_op_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_scheduler_op_t);
++
++struct xen_sysctl {
++    uint32_t cmd;
++#define XEN_SYSCTL_readconsole                    1
++#define XEN_SYSCTL_tbuf_op                        2
++#define XEN_SYSCTL_physinfo                       3
++#define XEN_SYSCTL_sched_id                       4
++#define XEN_SYSCTL_perfc_op                       5
++#define XEN_SYSCTL_getdomaininfolist              6
++#define XEN_SYSCTL_debug_keys                     7
++#define XEN_SYSCTL_getcpuinfo                     8
++#define XEN_SYSCTL_availheap                      9
++#define XEN_SYSCTL_get_pmstat                    10
++#define XEN_SYSCTL_cpu_hotplug                   11
++#define XEN_SYSCTL_pm_op                         12
++#define XEN_SYSCTL_page_offline_op               14
++#define XEN_SYSCTL_lockprof_op                   15
++#define XEN_SYSCTL_topologyinfo                  16 
++#define XEN_SYSCTL_numainfo                      17
++#define XEN_SYSCTL_cpupool_op                    18
++#define XEN_SYSCTL_scheduler_op                  19
++    uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
++    union {
++        struct xen_sysctl_readconsole       readconsole;
++        struct xen_sysctl_tbuf_op           tbuf_op;
++        struct xen_sysctl_physinfo          physinfo;
++        struct xen_sysctl_topologyinfo      topologyinfo;
++        struct xen_sysctl_numainfo          numainfo;
++        struct xen_sysctl_sched_id          sched_id;
++        struct xen_sysctl_perfc_op          perfc_op;
++        struct xen_sysctl_getdomaininfolist getdomaininfolist;
++        struct xen_sysctl_debug_keys        debug_keys;
++        struct xen_sysctl_getcpuinfo        getcpuinfo;
++        struct xen_sysctl_availheap         availheap;
++        struct xen_sysctl_get_pmstat        get_pmstat;
++        struct xen_sysctl_cpu_hotplug       cpu_hotplug;
++        struct xen_sysctl_pm_op             pm_op;
++        struct xen_sysctl_page_offline_op   page_offline;
++        struct xen_sysctl_lockprof_op       lockprof_op;
++        struct xen_sysctl_cpupool_op        cpupool_op;
++        struct xen_sysctl_scheduler_op      scheduler_op;
++        uint8_t                             pad[128];
++    } u;
++};
++typedef struct xen_sysctl xen_sysctl_t;
++DEFINE_XEN_GUEST_HANDLE(xen_sysctl_t);
++
++#endif /* __XEN_PUBLIC_SYSCTL_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/tmem.h

index 0000000,0000000..74bd1c6

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/tmem.h
@@@ -1,0 -1,0 +1,148 @@@
++/******************************************************************************
++ * tmem.h
++ * 
++ * Guest OS interface to Xen Transcendent Memory.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2004, K A Fraser
++ */
++
++#ifndef __XEN_PUBLIC_TMEM_H__
++#define __XEN_PUBLIC_TMEM_H__
++
++#include "xen.h"
++
++/* version of ABI */
++#define TMEM_SPEC_VERSION          1
++
++/* Commands to HYPERVISOR_tmem_op() */
++#define TMEM_CONTROL               0
++#define TMEM_NEW_POOL              1
++#define TMEM_DESTROY_POOL          2
++#define TMEM_NEW_PAGE              3
++#define TMEM_PUT_PAGE              4
++#define TMEM_GET_PAGE              5
++#define TMEM_FLUSH_PAGE            6
++#define TMEM_FLUSH_OBJECT          7
++#define TMEM_READ                  8
++#define TMEM_WRITE                 9
++#define TMEM_XCHG                 10
++
++/* Privileged commands to HYPERVISOR_tmem_op() */
++#define TMEM_AUTH                 101 
++#define TMEM_RESTORE_NEW          102
++
++/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */
++#define TMEMC_THAW                   0
++#define TMEMC_FREEZE                 1
++#define TMEMC_FLUSH                  2
++#define TMEMC_DESTROY                3
++#define TMEMC_LIST                   4
++#define TMEMC_SET_WEIGHT             5
++#define TMEMC_SET_CAP                6
++#define TMEMC_SET_COMPRESS           7
++#define TMEMC_QUERY_FREEABLE_MB      8
++#define TMEMC_SAVE_BEGIN             10
++#define TMEMC_SAVE_GET_VERSION       11
++#define TMEMC_SAVE_GET_MAXPOOLS      12
++#define TMEMC_SAVE_GET_CLIENT_WEIGHT 13
++#define TMEMC_SAVE_GET_CLIENT_CAP    14
++#define TMEMC_SAVE_GET_CLIENT_FLAGS  15
++#define TMEMC_SAVE_GET_POOL_FLAGS    16
++#define TMEMC_SAVE_GET_POOL_NPAGES   17
++#define TMEMC_SAVE_GET_POOL_UUID     18
++#define TMEMC_SAVE_GET_NEXT_PAGE     19
++#define TMEMC_SAVE_GET_NEXT_INV      20
++#define TMEMC_SAVE_END               21
++#define TMEMC_RESTORE_BEGIN          30
++#define TMEMC_RESTORE_PUT_PAGE       32
++#define TMEMC_RESTORE_FLUSH_PAGE     33
++
++/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
++#define TMEM_POOL_PERSIST          1
++#define TMEM_POOL_SHARED           2
++#define TMEM_POOL_PRECOMPRESSED    4
++#define TMEM_POOL_PAGESIZE_SHIFT   4
++#define TMEM_POOL_PAGESIZE_MASK  0xf
++#define TMEM_POOL_VERSION_SHIFT   24
++#define TMEM_POOL_VERSION_MASK  0xff
++#define TMEM_POOL_RESERVED_BITS  0x00ffff00
++
++/* Bits for client flags (save/restore) */
++#define TMEM_CLIENT_COMPRESS       1
++#define TMEM_CLIENT_FROZEN         2
++
++/* Special errno values */
++#define EFROZEN                 1000
++#define EEMPTY                  1001
++
++
++#ifndef __ASSEMBLY__
++typedef xen_pfn_t tmem_cli_mfn_t;
++typedef XEN_GUEST_HANDLE(char) tmem_cli_va_t;
++struct tmem_op {
++    uint32_t cmd;
++    int32_t pool_id;
++    union {
++        struct {
++            uint64_t uuid[2];
++            uint32_t flags;
++            uint32_t arg1;
++        } creat; /* for cmd == TMEM_NEW_POOL, TMEM_AUTH, TMEM_RESTORE_NEW */
++        struct { 
++            uint32_t subop;
++            uint32_t cli_id;
++            uint32_t arg1;
++            uint32_t arg2;
++            uint64_t oid[3];
++            tmem_cli_va_t buf;
++        } ctrl; /* for cmd == TMEM_CONTROL */
++        struct {
++            
++            uint64_t oid[3];
++            uint32_t index;
++            uint32_t tmem_offset;
++            uint32_t pfn_offset;
++            uint32_t len;
++            tmem_cli_mfn_t cmfn; /* client machine page frame */
++        } gen; /* for all other cmd ("generic") */
++    } u;
++};
++typedef struct tmem_op tmem_op_t;
++DEFINE_XEN_GUEST_HANDLE(tmem_op_t);
++
++struct tmem_handle {
++    uint32_t pool_id;
++    uint32_t index;
++    uint64_t oid[3];
++};
++#endif
++
++#endif /* __XEN_PUBLIC_TMEM_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/trace.h

index 0000000,0000000..9c321e7

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/trace.h
@@@ -1,0 -1,0 +1,231 @@@
++/******************************************************************************
++ * include/public/trace.h
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Mark Williamson, (C) 2004 Intel Research Cambridge
++ * Copyright (C) 2005 Bin Ren
++ */
++
++#ifndef __XEN_PUBLIC_TRACE_H__
++#define __XEN_PUBLIC_TRACE_H__
++
++#define TRACE_EXTRA_MAX    7
++#define TRACE_EXTRA_SHIFT 28
++
++/* Trace classes */
++#define TRC_CLS_SHIFT 16
++#define TRC_GEN      0x0001f000    /* General trace            */
++#define TRC_SCHED    0x0002f000    /* Xen Scheduler trace      */
++#define TRC_DOM0OP   0x0004f000    /* Xen DOM0 operation trace */
++#define TRC_HVM      0x0008f000    /* Xen HVM trace            */
++#define TRC_MEM      0x0010f000    /* Xen memory trace         */
++#define TRC_PV       0x0020f000    /* Xen PV traces            */
++#define TRC_SHADOW   0x0040f000    /* Xen shadow tracing       */
++#define TRC_PM       0x0080f000    /* Xen power management trace */
++#define TRC_GUEST    0x0800f000    /* Guest-generated traces   */
++#define TRC_ALL      0x0ffff000
++#define TRC_HD_TO_EVENT(x) ((x)&0x0fffffff)
++#define TRC_HD_CYCLE_FLAG (1UL<<31)
++#define TRC_HD_INCLUDES_CYCLE_COUNT(x) ( !!( (x) & TRC_HD_CYCLE_FLAG ) )
++#define TRC_HD_EXTRA(x)    (((x)>>TRACE_EXTRA_SHIFT)&TRACE_EXTRA_MAX)
++
++/* Trace subclasses */
++#define TRC_SUBCLS_SHIFT 12
++
++/* trace subclasses for SVM */
++#define TRC_HVM_ENTRYEXIT 0x00081000   /* VMENTRY and #VMEXIT       */
++#define TRC_HVM_HANDLER   0x00082000   /* various HVM handlers      */
++
++#define TRC_SCHED_MIN       0x00021000   /* Just runstate changes */
++#define TRC_SCHED_CLASS     0x00022000   /* Scheduler-specific    */
++#define TRC_SCHED_VERBOSE   0x00028000   /* More inclusive scheduling */
++
++/* Trace events per class */
++#define TRC_LOST_RECORDS        (TRC_GEN + 1)
++#define TRC_TRACE_WRAP_BUFFER  (TRC_GEN + 2)
++#define TRC_TRACE_CPU_CHANGE    (TRC_GEN + 3)
++#define TRC_TRACE_IRQ           (TRC_GEN + 4)
++
++#define TRC_SCHED_RUNSTATE_CHANGE   (TRC_SCHED_MIN + 1)
++#define TRC_SCHED_CONTINUE_RUNNING  (TRC_SCHED_MIN + 2)
++#define TRC_SCHED_DOM_ADD        (TRC_SCHED_VERBOSE +  1)
++#define TRC_SCHED_DOM_REM        (TRC_SCHED_VERBOSE +  2)
++#define TRC_SCHED_SLEEP          (TRC_SCHED_VERBOSE +  3)
++#define TRC_SCHED_WAKE           (TRC_SCHED_VERBOSE +  4)
++#define TRC_SCHED_YIELD          (TRC_SCHED_VERBOSE +  5)
++#define TRC_SCHED_BLOCK          (TRC_SCHED_VERBOSE +  6)
++#define TRC_SCHED_SHUTDOWN       (TRC_SCHED_VERBOSE +  7)
++#define TRC_SCHED_CTL            (TRC_SCHED_VERBOSE +  8)
++#define TRC_SCHED_ADJDOM         (TRC_SCHED_VERBOSE +  9)
++#define TRC_SCHED_SWITCH         (TRC_SCHED_VERBOSE + 10)
++#define TRC_SCHED_S_TIMER_FN     (TRC_SCHED_VERBOSE + 11)
++#define TRC_SCHED_T_TIMER_FN     (TRC_SCHED_VERBOSE + 12)
++#define TRC_SCHED_DOM_TIMER_FN   (TRC_SCHED_VERBOSE + 13)
++#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED_VERBOSE + 14)
++#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED_VERBOSE + 15)
++#define TRC_SCHED_SHUTDOWN_CODE  (TRC_SCHED_VERBOSE + 16)
++
++#define TRC_MEM_PAGE_GRANT_MAP      (TRC_MEM + 1)
++#define TRC_MEM_PAGE_GRANT_UNMAP    (TRC_MEM + 2)
++#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3)
++#define TRC_MEM_SET_P2M_ENTRY       (TRC_MEM + 4)
++#define TRC_MEM_DECREASE_RESERVATION (TRC_MEM + 5)
++#define TRC_MEM_POD_POPULATE        (TRC_MEM + 16)
++#define TRC_MEM_POD_ZERO_RECLAIM    (TRC_MEM + 17)
++#define TRC_MEM_POD_SUPERPAGE_SPLINTER (TRC_MEM + 18)
++
++
++#define TRC_PV_HYPERCALL             (TRC_PV +  1)
++#define TRC_PV_TRAP                  (TRC_PV +  3)
++#define TRC_PV_PAGE_FAULT            (TRC_PV +  4)
++#define TRC_PV_FORCED_INVALID_OP     (TRC_PV +  5)
++#define TRC_PV_EMULATE_PRIVOP        (TRC_PV +  6)
++#define TRC_PV_EMULATE_4GB           (TRC_PV +  7)
++#define TRC_PV_MATH_STATE_RESTORE    (TRC_PV +  8)
++#define TRC_PV_PAGING_FIXUP          (TRC_PV +  9)
++#define TRC_PV_GDT_LDT_MAPPING_FAULT (TRC_PV + 10)
++#define TRC_PV_PTWR_EMULATION        (TRC_PV + 11)
++#define TRC_PV_PTWR_EMULATION_PAE    (TRC_PV + 12)
++  /* Indicates that addresses in trace record are 64 bits */
++#define TRC_64_FLAG               (0x100) 
++
++#define TRC_SHADOW_NOT_SHADOW                 (TRC_SHADOW +  1)
++#define TRC_SHADOW_FAST_PROPAGATE             (TRC_SHADOW +  2)
++#define TRC_SHADOW_FAST_MMIO                  (TRC_SHADOW +  3)
++#define TRC_SHADOW_FALSE_FAST_PATH            (TRC_SHADOW +  4)
++#define TRC_SHADOW_MMIO                       (TRC_SHADOW +  5)
++#define TRC_SHADOW_FIXUP                      (TRC_SHADOW +  6)
++#define TRC_SHADOW_DOMF_DYING                 (TRC_SHADOW +  7)
++#define TRC_SHADOW_EMULATE                    (TRC_SHADOW +  8)
++#define TRC_SHADOW_EMULATE_UNSHADOW_USER      (TRC_SHADOW +  9)
++#define TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ    (TRC_SHADOW + 10)
++#define TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED (TRC_SHADOW + 11)
++#define TRC_SHADOW_WRMAP_BF                   (TRC_SHADOW + 12)
++#define TRC_SHADOW_PREALLOC_UNPIN             (TRC_SHADOW + 13)
++#define TRC_SHADOW_RESYNC_FULL                (TRC_SHADOW + 14)
++#define TRC_SHADOW_RESYNC_ONLY                (TRC_SHADOW + 15)
++
++/* trace events per subclass */
++#define TRC_HVM_NESTEDFLAG      (0x400)
++#define TRC_HVM_VMENTRY         (TRC_HVM_ENTRYEXIT + 0x01)
++#define TRC_HVM_VMEXIT          (TRC_HVM_ENTRYEXIT + 0x02)
++#define TRC_HVM_VMEXIT64        (TRC_HVM_ENTRYEXIT + TRC_64_FLAG + 0x02)
++#define TRC_HVM_PF_XEN          (TRC_HVM_HANDLER + 0x01)
++#define TRC_HVM_PF_XEN64        (TRC_HVM_HANDLER + TRC_64_FLAG + 0x01)
++#define TRC_HVM_PF_INJECT       (TRC_HVM_HANDLER + 0x02)
++#define TRC_HVM_PF_INJECT64     (TRC_HVM_HANDLER + TRC_64_FLAG + 0x02)
++#define TRC_HVM_INJ_EXC         (TRC_HVM_HANDLER + 0x03)
++#define TRC_HVM_INJ_VIRQ        (TRC_HVM_HANDLER + 0x04)
++#define TRC_HVM_REINJ_VIRQ      (TRC_HVM_HANDLER + 0x05)
++#define TRC_HVM_IO_READ         (TRC_HVM_HANDLER + 0x06)
++#define TRC_HVM_IO_WRITE        (TRC_HVM_HANDLER + 0x07)
++#define TRC_HVM_CR_READ         (TRC_HVM_HANDLER + 0x08)
++#define TRC_HVM_CR_READ64       (TRC_HVM_HANDLER + TRC_64_FLAG + 0x08)
++#define TRC_HVM_CR_WRITE        (TRC_HVM_HANDLER + 0x09)
++#define TRC_HVM_CR_WRITE64      (TRC_HVM_HANDLER + TRC_64_FLAG + 0x09)
++#define TRC_HVM_DR_READ         (TRC_HVM_HANDLER + 0x0A)
++#define TRC_HVM_DR_WRITE        (TRC_HVM_HANDLER + 0x0B)
++#define TRC_HVM_MSR_READ        (TRC_HVM_HANDLER + 0x0C)
++#define TRC_HVM_MSR_WRITE       (TRC_HVM_HANDLER + 0x0D)
++#define TRC_HVM_CPUID           (TRC_HVM_HANDLER + 0x0E)
++#define TRC_HVM_INTR            (TRC_HVM_HANDLER + 0x0F)
++#define TRC_HVM_NMI             (TRC_HVM_HANDLER + 0x10)
++#define TRC_HVM_SMI             (TRC_HVM_HANDLER + 0x11)
++#define TRC_HVM_VMMCALL         (TRC_HVM_HANDLER + 0x12)
++#define TRC_HVM_HLT             (TRC_HVM_HANDLER + 0x13)
++#define TRC_HVM_INVLPG          (TRC_HVM_HANDLER + 0x14)
++#define TRC_HVM_INVLPG64        (TRC_HVM_HANDLER + TRC_64_FLAG + 0x14)
++#define TRC_HVM_MCE             (TRC_HVM_HANDLER + 0x15)
++#define TRC_HVM_IOPORT_READ     (TRC_HVM_HANDLER + 0x16)
++#define TRC_HVM_IOMEM_READ      (TRC_HVM_HANDLER + 0x17)
++#define TRC_HVM_CLTS            (TRC_HVM_HANDLER + 0x18)
++#define TRC_HVM_LMSW            (TRC_HVM_HANDLER + 0x19)
++#define TRC_HVM_LMSW64          (TRC_HVM_HANDLER + TRC_64_FLAG + 0x19)
++#define TRC_HVM_RDTSC           (TRC_HVM_HANDLER + 0x1a)
++#define TRC_HVM_INTR_WINDOW     (TRC_HVM_HANDLER + 0x20)
++#define TRC_HVM_NPF             (TRC_HVM_HANDLER + 0x21)
++
++#define TRC_HVM_IOPORT_WRITE    (TRC_HVM_HANDLER + 0x216)
++#define TRC_HVM_IOMEM_WRITE     (TRC_HVM_HANDLER + 0x217)
++
++/* trace subclasses for power management */
++#define TRC_PM_FREQ     0x00801000      /* xen cpu freq events */
++#define TRC_PM_IDLE     0x00802000      /* xen cpu idle events */
++
++/* trace events per subclass */
++#define TRC_PM_FREQ_CHANGE      (TRC_PM_FREQ + 0x01)
++#define TRC_PM_IDLE_ENTRY       (TRC_PM_IDLE + 0x01)
++#define TRC_PM_IDLE_EXIT        (TRC_PM_IDLE + 0x02)
++
++/* This structure represents a single trace buffer record. */
++struct t_rec {
++    uint32_t event:28;
++    uint32_t extra_u32:3;         /* # entries in trailing extra_u32[] array */
++    uint32_t cycles_included:1;   /* u.cycles or u.no_cycles? */
++    union {
++        struct {
++            uint32_t cycles_lo, cycles_hi; /* cycle counter timestamp */
++            uint32_t extra_u32[7];         /* event data items */
++        } cycles;
++        struct {
++            uint32_t extra_u32[7];         /* event data items */
++        } nocycles;
++    } u;
++};
++
++/*
++ * This structure contains the metadata for a single trace buffer.  The cons
++ * and prod fields are offsets into the records that follow this header.
++ */
++struct t_buf {
++    /* Assume the data buffer size is X.  X is generally not a power of 2.
++     * CONS and PROD are incremented modulo (2*X):
++     *     0 <= cons < 2*X
++     *     0 <= prod < 2*X
++     * This is done because addition modulo X breaks at 2^32 when X is not a
++     * power of 2:
++     *     (((2^32 - 1) % X) + 1) % X != (2^32) % X
++     */
++    uint32_t cons;   /* Offset of next item to be consumed by control tools. */
++    uint32_t prod;   /* Offset of next item to be produced by Xen.           */
++    /*  Records follow immediately after the meta-data header.    */
++};
++
++/* Structure used to pass MFNs to the trace buffers back to trace consumers.
++ * Offset is an offset into the mapped structure where the mfn list will be held.
++ * MFNs will be at ((unsigned long *)(t_info))+(t_info->mfn_offset[cpu]).
++ */
++struct t_info {
++    uint16_t tbuf_size; /* Size in pages of each trace buffer */
++    uint16_t mfn_offset[];  /* Offset within t_info structure of the page list per cpu */
++    /* MFN lists immediately after the header */
++};
++
++#endif /* __XEN_PUBLIC_TRACE_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/vcpu.h

index 87e6f8a,87e6f8a..a4f2af0
--- 1/include/xen/interface/vcpu.h
--- 2/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@@ -27,11 -27,11 +27,13 @@@
   #ifndef __XEN_PUBLIC_VCPU_H__
   #define __XEN_PUBLIC_VCPU_H__
   
++#include "xen.h"
++
   /*
    * Prototype for this hypercall is:
-- *    int vcpu_op(int cmd, int vcpuid, void *extra_args)
-- * @cmd                  == VCPUOP_??? (VCPU operation).
-- * @vcpuid       == VCPU to operate on.
++ *  int vcpu_op(int cmd, int vcpuid, void *extra_args)
++ * @cmd        == VCPUOP_??? (VCPU operation).
++ * @vcpuid     == VCPU to operate on.
    * @extra_args == Operation-specific extra arguments (NULL if none).
    */
   
@@@ -40,52 -40,52 +42,54 @@@
    * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
    *
    * @extra_arg == pointer to vcpu_guest_context structure containing initial
-- *                             state for the VCPU.
++ *               state for the VCPU.
    */
--#define VCPUOP_initialise                      0
++#define VCPUOP_initialise            0
   
   /*
    * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
    * if the VCPU has not been initialised (VCPUOP_initialise).
    */
--#define VCPUOP_up                                      1
++#define VCPUOP_up                    1
   
   /*
    * Bring down a VCPU (i.e., make it non-runnable).
    * There are a few caveats that callers should observe:
-- *    1. This operation may return, and VCPU_is_up may return false, before the
-- *       VCPU stops running (i.e., the command is asynchronous). It is a good
-- *       idea to ensure that the VCPU has entered a non-critical loop before
-- *       bringing it down. Alternatively, this operation is guaranteed
-- *       synchronous if invoked by the VCPU itself.
-- *    2. After a VCPU is initialised, there is currently no way to drop all its
-- *       references to domain memory. Even a VCPU that is down still holds
-- *       memory references via its pagetable base pointer and GDT. It is good
-- *       practise to move a VCPU onto an 'idle' or default page table, LDT and
-- *       GDT before bringing it down.
++ *  1. This operation may return, and VCPU_is_up may return false, before the
++ *     VCPU stops running (i.e., the command is asynchronous). It is a good
++ *     idea to ensure that the VCPU has entered a non-critical loop before
++ *     bringing it down. Alternatively, this operation is guaranteed
++ *     synchronous if invoked by the VCPU itself.
++ *  2. After a VCPU is initialised, there is currently no way to drop all its
++ *     references to domain memory. Even a VCPU that is down still holds
++ *     memory references via its pagetable base pointer and GDT. It is good
++ *     practice to move a VCPU onto an 'idle' or default page table, LDT and
++ *     GDT before bringing it down.
    */
--#define VCPUOP_down                                    2
++#define VCPUOP_down                  2
   
   /* Returns 1 if the given VCPU is up. */
--#define VCPUOP_is_up                           3
++#define VCPUOP_is_up                 3
   
   /*
    * Return information about the state and running time of a VCPU.
    * @extra_arg == pointer to vcpu_runstate_info structure.
    */
--#define VCPUOP_get_runstate_info       4
++#define VCPUOP_get_runstate_info     4
   struct vcpu_runstate_info {
--              /* VCPU's current state (RUNSTATE_*). */
--              int              state;
--              /* When was current state entered (system time, ns)? */
--              uint64_t state_entry_time;
--              /*
--               * Time spent in each RUNSTATE_* (ns). The sum of these times is
--               * guaranteed not to drift from system time.
--               */
--              uint64_t time[4];
++    /* VCPU's current state (RUNSTATE_*). */
++    int      state;
++    /* When was current state entered (system time, ns)? */
++    uint64_t state_entry_time;
++    /*
++     * Time spent in each RUNSTATE_* (ns). The sum of these times is
++     * guaranteed not to drift from system time.
++     */
++    uint64_t time[4];
   };
   DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
++typedef struct vcpu_runstate_info vcpu_runstate_info_t;
++DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t);
   
   /* VCPU is currently running on a physical CPU. */
   #define RUNSTATE_running  0
@@@ -108,47 -108,47 +112,54 @@@
    * Register a shared memory area from which the guest may obtain its own
    * runstate information without needing to execute a hypercall.
    * Notes:
-- *    1. The registered address may be virtual or physical, depending on the
-- *       platform. The virtual address should be registered on x86 systems.
-- *    2. Only one shared area may be registered per VCPU. The shared area is
-- *       updated by the hypervisor each time the VCPU is scheduled. Thus
-- *       runstate.state will always be RUNSTATE_running and
-- *       runstate.state_entry_time will indicate the system time at which the
-- *       VCPU was last scheduled to run.
++ *  1. The registered address may be virtual or physical or guest handle,
++ *     depending on the platform. Virtual address or guest handle should be
++ *     registered on x86 systems.
++ *  2. Only one shared area may be registered per VCPU. The shared area is
++ *     updated by the hypervisor each time the VCPU is scheduled. Thus
++ *     runstate.state will always be RUNSTATE_running and
++ *     runstate.state_entry_time will indicate the system time at which the
++ *     VCPU was last scheduled to run.
    * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
    */
   #define VCPUOP_register_runstate_memory_area 5
   struct vcpu_register_runstate_memory_area {
--              union {
--                              GUEST_HANDLE(vcpu_runstate_info) h;
--                              struct vcpu_runstate_info *v;
--                              uint64_t p;
--              } addr;
++    union {
++        XEN_GUEST_HANDLE(vcpu_runstate_info_t) h;
++        struct vcpu_runstate_info *v;
++        uint64_t p;
++    } addr;
   };
++typedef struct vcpu_register_runstate_memory_area vcpu_register_runstate_memory_area_t;
++DEFINE_XEN_GUEST_HANDLE(vcpu_register_runstate_memory_area_t);
   
   /*
    * Set or stop a VCPU's periodic timer. Every VCPU has one periodic timer
    * which can be set via these commands. Periods smaller than one millisecond
    * may not be supported.
    */
--#define VCPUOP_set_periodic_timer      6 /* arg == vcpu_set_periodic_timer_t */
--#define VCPUOP_stop_periodic_timer     7 /* arg == NULL */
++#define VCPUOP_set_periodic_timer    6 /* arg == vcpu_set_periodic_timer_t */
++#define VCPUOP_stop_periodic_timer   7 /* arg == NULL */
   struct vcpu_set_periodic_timer {
--              uint64_t period_ns;
++    uint64_t period_ns;
   };
   DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer);
++typedef struct vcpu_set_periodic_timer vcpu_set_periodic_timer_t;
++DEFINE_XEN_GUEST_HANDLE(vcpu_set_periodic_timer_t);
   
   /*
    * Set or stop a VCPU's single-shot timer. Every VCPU has one single-shot
    * timer which can be set via these commands.
    */
--#define VCPUOP_set_singleshot_timer    8 /* arg == vcpu_set_singleshot_timer_t */
++#define VCPUOP_set_singleshot_timer  8 /* arg == vcpu_set_singleshot_timer_t */
   #define VCPUOP_stop_singleshot_timer 9 /* arg == NULL */
   struct vcpu_set_singleshot_timer {
--              uint64_t timeout_abs_ns;
--              uint32_t flags;                    /* VCPU_SSHOTTMR_??? */
++    uint64_t timeout_abs_ns;   /* Absolute system time value in nanoseconds. */
++    uint32_t flags;            /* VCPU_SSHOTTMR_??? */
   };
   DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer);
++typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t;
++DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t);
   
   /* Flags to VCPUOP_set_singleshot_timer. */
    /* Require the timeout to be in the future (return -ETIME if it's passed). */
@@@ -161,13 -161,13 +172,63 @@@
    * structure in a convenient place, such as in a per-cpu data area.
    * The pointer need not be page aligned, but the structure must not
    * cross a page boundary.
++ *
++ * This may be called only once per vcpu.
    */
--#define VCPUOP_register_vcpu_info   10  /* arg == struct vcpu_info */
++#define VCPUOP_register_vcpu_info   10  /* arg == vcpu_register_vcpu_info_t */
   struct vcpu_register_vcpu_info {
       uint64_t mfn;    /* mfn of page to place vcpu_info */
       uint32_t offset; /* offset within page */
       uint32_t rsvd;   /* unused */
   };
   DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
++typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t;
++DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t);
++
++/* Send an NMI to the specified VCPU. @extra_arg == NULL. */
++#define VCPUOP_send_nmi             11
++
++/*
++ * Get the physical ID information for a pinned vcpu's underlying physical
++ * processor.  The physical ID information is architecture-specific.
++ * On x86: id[31:0]=apic_id, id[63:32]=acpi_id.
++ * This command returns -EINVAL if it is not a valid operation for this VCPU.
++ */
++#define VCPUOP_get_physid           12 /* arg == vcpu_get_physid_t */
++struct vcpu_get_physid {
++    uint64_t phys_id;
++};
++typedef struct vcpu_get_physid vcpu_get_physid_t;
++DEFINE_XEN_GUEST_HANDLE(vcpu_get_physid_t);
++#define xen_vcpu_physid_to_x86_apicid(physid) ((uint32_t)(physid))
++#define xen_vcpu_physid_to_x86_acpiid(physid) ((uint32_t)((physid) >> 32))
++
++/*
++ * Register a memory location to get a secondary copy of the vcpu time
++ * parameters.  The master copy still exists as part of the vcpu shared
++ * memory area, and this secondary copy is updated whenever the master copy
++ * is updated (and using the same versioning scheme for synchronisation).
++ *
++ * The intent is that this copy may be mapped (RO) into userspace so
++ * that usermode can compute system time using the time info and the
++ * tsc.  Usermode will see an array of vcpu_time_info structures, one
++ * for each vcpu, and choose the right one by an existing mechanism
++ * which allows it to get the current vcpu number (such as via a
++ * segment limit).  It can then apply the normal algorithm to compute
++ * system time from the tsc.
++ *
++ * @extra_arg == pointer to vcpu_register_time_memory_area structure.
++ */
++#define VCPUOP_register_vcpu_time_memory_area   13
++DEFINE_XEN_GUEST_HANDLE(vcpu_time_info_t);
++struct vcpu_register_time_memory_area {
++    union {
++        XEN_GUEST_HANDLE(vcpu_time_info_t) h;
++        struct vcpu_time_info *v;
++        uint64_t p;
++    } addr;
++};
++typedef struct vcpu_register_time_memory_area vcpu_register_time_memory_area_t;
++DEFINE_XEN_GUEST_HANDLE(vcpu_register_time_memory_area_t);
   
   #endif /* __XEN_PUBLIC_VCPU_H__ */
diff --cc include/xen/interface/version.h

index e8b6519,e8b6519..da54fd8
--- 1/include/xen/interface/version.h
--- 2/include/xen/interface/version.h
+++ b/include/xen/interface/version.h
@@@ -3,6 -3,6 +3,24 @@@
    *
    * Xen version, type, and compile information.
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
    * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
    */
@@@ -10,17 -10,17 +28,18 @@@
   #ifndef __XEN_PUBLIC_VERSION_H__
   #define __XEN_PUBLIC_VERSION_H__
   
--/* NB. All ops return zero on success, except XENVER_version. */
++/* NB. All ops return zero on success, except XENVER_{version,pagesize} */
   
   /* arg == NULL; returns major:minor (16:16). */
   #define XENVER_version      0
   
   /* arg == xen_extraversion_t. */
   #define XENVER_extraversion 1
++typedef char xen_extraversion_t[16];
   struct xen_extraversion {
--    char extraversion[16];
++    xen_extraversion_t extraversion;
   };
--#define XEN_EXTRAVERSION_LEN (sizeof(struct xen_extraversion))
++#define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t))
   
   /* arg == xen_compile_info_t. */
   #define XENVER_compile_info 2
@@@ -30,29 -30,29 +49,34 @@@ struct xen_compile_info 
       char compile_domain[32];
       char compile_date[32];
   };
++typedef struct xen_compile_info xen_compile_info_t;
   
   #define XENVER_capabilities 3
++typedef char xen_capabilities_info_t[1024];
   struct xen_capabilities_info {
--    char info[1024];
++    xen_capabilities_info_t info;
   };
--#define XEN_CAPABILITIES_INFO_LEN (sizeof(struct xen_capabilities_info))
++#define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t))
   
   #define XENVER_changeset 4
++typedef char xen_changeset_info_t[64];
   struct xen_changeset_info {
--    char info[64];
++    xen_changeset_info_t info;
   };
--#define XEN_CHANGESET_INFO_LEN (sizeof(struct xen_changeset_info))
++#define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t))
   
   #define XENVER_platform_parameters 5
   struct xen_platform_parameters {
       unsigned long virt_start;
   };
++typedef struct xen_platform_parameters xen_platform_parameters_t;
   
   #define XENVER_get_features 6
   struct xen_feature_info {
       unsigned int submap_idx;    /* IN: which 32-bit submap to return */
       uint32_t     submap;        /* OUT: 32-bit submap */
   };
++typedef struct xen_feature_info xen_feature_info_t;
   
   /* Declares the features reported by XENVER_get_features. */
   #include "features.h"
@@@ -60,4 -60,4 +84,10 @@@
   /* arg == NULL; returns host memory page size. */
   #define XENVER_pagesize 7
   
++/* arg == xen_domain_handle_t. */
++#define XENVER_guest_handle 8
++
++#define XENVER_commandline 9
++typedef char xen_commandline_t[1024];
++
   #endif /* __XEN_PUBLIC_VERSION_H__ */
diff --cc include/xen/interface/xen-compat.h

index 0000000,0000000..2e38003

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/xen-compat.h
@@@ -1,0 -1,0 +1,44 @@@
++/******************************************************************************
++ * xen-compat.h
++ * 
++ * Guest OS interface to Xen.  Compatibility layer.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2006, Christian Limpach
++ */
++
++#ifndef __XEN_PUBLIC_XEN_COMPAT_H__
++#define __XEN_PUBLIC_XEN_COMPAT_H__
++
++#define __XEN_LATEST_INTERFACE_VERSION__ 0x0003020a
++
++#if defined(__XEN__) || defined(__XEN_TOOLS__)
++/* Xen is built with matching headers and implements the latest interface. */
++#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__
++#elif !defined(__XEN_INTERFACE_VERSION__)
++/* Guests which do not specify a version get the legacy interface. */
++#define __XEN_INTERFACE_VERSION__ 0x00000000
++#endif
++
++#if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__
++#error "These header files do not support the requested interface version."
++#endif
++
++#endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */
diff --cc include/xen/interface/xen.h

index 70213b4,b33257b..ad5708c
--- 1/include/xen/interface/xen.h
--- 2/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@@ -3,35 -3,35 +3,71 @@@
    *
    * Guest OS interface to Xen.
    *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
    * Copyright (c) 2004, K A Fraser
    */
   
   #ifndef __XEN_PUBLIC_XEN_H__
   #define __XEN_PUBLIC_XEN_H__
   
--#include <asm/xen/interface.h>
++#include "xen-compat.h"
++#ifdef CONFIG_PARAVIRT_XEN
   #include <asm/pvclock-abi.h>
++#endif
   
--/*
-- * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
-- */
++#if defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
++#include <asm/xen/interface.h>
++#elif defined(__i386__) || defined(__x86_64__)
++#include "arch-x86/xen.h"
++#elif defined(__ia64__)
++#include "arch-ia64.h"
++#else
++#error "Unsupported architecture"
++#endif
++
++#ifndef __ASSEMBLY__
++/* Guest handles for primitive C types. */
++DEFINE_XEN_GUEST_HANDLE(char);
++__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
++DEFINE_XEN_GUEST_HANDLE(int);
++__DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
++DEFINE_XEN_GUEST_HANDLE(long);
++__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
++DEFINE_XEN_GUEST_HANDLE(void);
++
++DEFINE_XEN_GUEST_HANDLE(uint64_t);
++DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
++#endif
   
   /*
-- * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
-- *         EAX = return value
-- *         (argument registers may be clobbered on return)
-- * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6.
-- *         RAX = return value
-- *         (argument registers not clobbered on return; RCX, R11 are)
++ * HYPERCALLS
    */
++
   #define __HYPERVISOR_set_trap_table        0
   #define __HYPERVISOR_mmu_update            1
   #define __HYPERVISOR_set_gdt               2
   #define __HYPERVISOR_stack_switch          3
   #define __HYPERVISOR_set_callbacks         4
   #define __HYPERVISOR_fpu_taskswitch        5
--#define __HYPERVISOR_sched_op_compat       6
--#define __HYPERVISOR_dom0_op               7
++#define __HYPERVISOR_sched_op_compat       6 /* compat since 0x00030101 */
++#define __HYPERVISOR_platform_op           7
   #define __HYPERVISOR_set_debugreg          8
   #define __HYPERVISOR_get_debugreg          9
   #define __HYPERVISOR_update_descriptor    10
@@@ -39,10 -39,10 +75,10 @@@
   #define __HYPERVISOR_multicall            13
   #define __HYPERVISOR_update_va_mapping    14
   #define __HYPERVISOR_set_timer_op         15
--#define __HYPERVISOR_event_channel_op_compat 16
++#define __HYPERVISOR_event_channel_op_compat 16 /* compat since 0x00030202 */
   #define __HYPERVISOR_xen_version          17
   #define __HYPERVISOR_console_io           18
--#define __HYPERVISOR_physdev_op_compat    19
++#define __HYPERVISOR_physdev_op_compat    19 /* compat since 0x00030202 */
   #define __HYPERVISOR_grant_table_op       20
   #define __HYPERVISOR_vm_assist            21
   #define __HYPERVISOR_update_va_mapping_otherdomain 22
@@@ -50,15 -50,14 +86,18 @@@
   #define __HYPERVISOR_vcpu_op              24
   #define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
   #define __HYPERVISOR_mmuext_op            26
--#define __HYPERVISOR_acm_op               27
++#define __HYPERVISOR_xsm_op               27
   #define __HYPERVISOR_nmi_op               28
--#define __HYPERVISOR_sched_op             29
++#define __HYPERVISOR_sched_op_new         29
   #define __HYPERVISOR_callback_op          30
   #define __HYPERVISOR_xenoprof_op          31
   #define __HYPERVISOR_event_channel_op     32
   #define __HYPERVISOR_physdev_op           33
   #define __HYPERVISOR_hvm_op               34
++#define __HYPERVISOR_sysctl               35
++#define __HYPERVISOR_domctl               36
++#define __HYPERVISOR_kexec_op             37
+ +#define __HYPERVISOR_tmem_op              38
   
   /* Architecture-specific hypercall definitions. */
   #define __HYPERVISOR_arch_0               48
@@@ -71,15 -70,15 +110,49 @@@
   #define __HYPERVISOR_arch_7               55
   
   /*
++ * HYPERCALL COMPATIBILITY.
++ */
++
++/* New sched_op hypercall introduced in 0x00030101. */
++#if __XEN_INTERFACE_VERSION__ < 0x00030101 || (defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H))
++#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat
++#else
++#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_new
++#endif
++
++/* New event-channel and physdev hypercalls introduced in 0x00030202. */
++#if __XEN_INTERFACE_VERSION__ < 0x00030202
++#undef __HYPERVISOR_event_channel_op
++#define __HYPERVISOR_event_channel_op __HYPERVISOR_event_channel_op_compat
++#undef __HYPERVISOR_physdev_op
++#define __HYPERVISOR_physdev_op __HYPERVISOR_physdev_op_compat
++#endif
++
++/* New platform_op hypercall introduced in 0x00030204. */
++#if __XEN_INTERFACE_VERSION__ < 0x00030204 || (defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H))
++#define __HYPERVISOR_dom0_op __HYPERVISOR_platform_op
++#endif
++
++/*
    * VIRTUAL INTERRUPTS
    *
    * Virtual interrupts that a guest OS may receive from Xen.
++ *
++ * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a
++ * global VIRQ. The former can be bound once per VCPU and cannot be re-bound.
++ * The latter can be allocated only once per guest: they must initially be
++ * allocated to VCPU0 but can subsequently be re-bound.
    */
--#define VIRQ_TIMER      0  /* Timebase update, and/or requested timeout.  */
--#define VIRQ_DEBUG      1  /* Request guest to dump debug info.           */
--#define VIRQ_CONSOLE    2  /* (DOM0) Bytes received on emergency console. */
--#define VIRQ_DOM_EXC    3  /* (DOM0) Exceptional event for some domain.   */
--#define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
++#define VIRQ_TIMER      0  /* V. Timebase update, and/or requested timeout.  */
++#define VIRQ_DEBUG      1  /* V. Request guest to dump debug info.           */
++#define VIRQ_CONSOLE    2  /* G. (DOM0) Bytes received on emergency console. */
++#define VIRQ_DOM_EXC    3  /* G. (DOM0) Exceptional event for some domain.   */
++#define VIRQ_TBUF       4  /* G. (DOM0) Trace buffer has records available.  */
++#define VIRQ_DEBUGGER   6  /* G. (DOM0) A domain has paused for debugging.   */
++#define VIRQ_XENOPROF   7  /* V. XenOprofile interrupt: new sample available */
++#define VIRQ_CON_RING   8  /* G. (DOM0) Bytes received on console            */
++#define VIRQ_PCPU_STATE 9  /* G. (DOM0) PCPU state changed                   */
++#define VIRQ_MEM_EVENT  10 /* G. (DOM0) A memory event has occurred          */
   
   /* Architecture-specific VIRQ definitions. */
   #define VIRQ_ARCH_0    16
@@@ -92,19 -91,19 +165,28 @@@
   #define VIRQ_ARCH_7    23
   
   #define NR_VIRQS       24
++
   /*
-- * MMU-UPDATE REQUESTS
++ * HYPERVISOR_mmu_update(reqs, count, pdone, foreigndom)
    *
-- * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
-- * A foreigndom (FD) can be specified (or DOMID_SELF for none).
-- * Where the FD has some effect, it is described below.
-- * ptr[1:0] specifies the appropriate MMU_* command.
++ * @reqs is an array of mmu_update_t structures ((ptr, val) pairs).
++ * @count is the length of the above array.
++ * @pdone is an output parameter indicating number of completed operations
++ * @foreigndom[15:0]: FD, the expected owner of data pages referenced in this
++ *                    hypercall invocation. Can be DOMID_SELF.
++ * @foreigndom[31:16]: PFD, the expected owner of pagetable pages referenced
++ *                     in this hypercall invocation. The value of this field
++ *                     (x) encodes the PFD as follows:
++ *                     x == 0 => PFD == DOMID_SELF
++ *                     x != 0 => PFD == x - 1
    *
++ * Sub-commands: ptr[1:0] specifies the appropriate MMU_* command.
++ * -------------
    * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
-- * Updates an entry in a page table. If updating an L1 table, and the new
-- * table entry is valid/present, the mapped frame must belong to the FD, if
-- * an FD has been specified. If attempting to map an I/O page then the
-- * caller assumes the privilege of the FD.
++ * Updates an entry in a page table belonging to PFD. If updating an L1 table,
++ * and the new table entry is valid/present, the mapped frame must belong to
++ * FD. If attempting to map an I/O page then the caller assumes the privilege
++ * of the FD.
    * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
    * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
    * ptr[:2]  -- Machine address of the page-table entry to modify.
@@@ -120,8 -119,8 +202,8 @@@
    * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed
    * with those in @val.
    */
--#define MMU_NORMAL_PT_UPDATE      0 /* checked '*ptr = val'. ptr is MA.       */
--#define MMU_MACHPHYS_UPDATE       1 /* ptr = MA of frame to modify entry for  */
++#define MMU_NORMAL_PT_UPDATE      0 /* checked '*ptr = val'. ptr is MA.      */
++#define MMU_MACHPHYS_UPDATE       1 /* ptr = MA of frame to modify entry for */
   #define MMU_PT_UPDATE_PRESERVE_AD 2 /* atomically: *ptr = val | (*ptr&(A|D)) */
   
   /*
@@@ -164,9 -163,9 +246,23 @@@
    * cmd: MMUEXT_FLUSH_CACHE
    * No additional arguments. Writes back and flushes cache contents.
    *
++ * cmd: MMUEXT_FLUSH_CACHE_GLOBAL
++ * No additional arguments. Writes back and flushes cache contents
++ * on all CPUs in the system.
++ *
    * cmd: MMUEXT_SET_LDT
    * linear_addr: Linear address of LDT base (NB. must be page-aligned).
    * nr_ents: Number of entries in LDT.
++ *
++ * cmd: MMUEXT_CLEAR_PAGE
++ * mfn: Machine frame number to be cleared.
++ *
++ * cmd: MMUEXT_COPY_PAGE
++ * mfn: Machine frame number of the destination page.
++ * src_mfn: Machine frame number of the source page.
++ *
++ * cmd: MMUEXT_[UN]MARK_SUPER
++ * mfn: Machine frame number of head of superpage to be [un]marked.
    */
   #define MMUEXT_PIN_L1_TABLE      0
   #define MMUEXT_PIN_L2_TABLE      1
@@@ -183,24 -182,24 +279,38 @@@
   #define MMUEXT_FLUSH_CACHE      12
   #define MMUEXT_SET_LDT          13
   #define MMUEXT_NEW_USER_BASEPTR 15
++#define MMUEXT_CLEAR_PAGE       16
++#define MMUEXT_COPY_PAGE        17
++#define MMUEXT_FLUSH_CACHE_GLOBAL 18
++#define MMUEXT_MARK_SUPER       19
++#define MMUEXT_UNMARK_SUPER     20
   
   #ifndef __ASSEMBLY__
   struct mmuext_op {
--      unsigned int cmd;
--      union {
--              /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
--              unsigned long mfn;
--              /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
--              unsigned long linear_addr;
--      } arg1;
--      union {
--              /* SET_LDT */
--              unsigned int nr_ents;
--              /* TLB_FLUSH_MULTI, INVLPG_MULTI */
--              void *vcpumask;
--      } arg2;
++    unsigned int cmd;
++    union {
++        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
++         * CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */
++        xen_pfn_t     mfn;
++        /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
++        unsigned long linear_addr;
++    } arg1;
++    union {
++        /* SET_LDT */
++        unsigned int nr_ents;
++        /* TLB_FLUSH_MULTI, INVLPG_MULTI */
++#if __XEN_INTERFACE_VERSION__ >= 0x00030205
++        XEN_GUEST_HANDLE(const_void) vcpumask;
++#else
++        const void *vcpumask;
++#endif
++        /* COPY_PAGE */
++        xen_pfn_t src_mfn;
++    } arg2;
   };
   DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
++typedef struct mmuext_op mmuext_op_t;
++DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
   #endif
   
   /* These are passed as 'flags' to update_va_mapping. They can be ORed. */
@@@ -225,11 -224,11 +335,24 @@@
    */
   #define VMASST_CMD_enable                0
   #define VMASST_CMD_disable               1
++
++/* x86/32 guests: simulate full 4GB segment limits. */
   #define VMASST_TYPE_4gb_segments         0
++
++/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
   #define VMASST_TYPE_4gb_segments_notify  1
++
++/*
++ * x86 guests: support writes to bottom-level PTEs.
++ * NB1. Page-directory entries cannot be written.
++ * NB2. Guest must continue to remove all writable mappings of PTEs.
++ */
   #define VMASST_TYPE_writable_pagetables  2
++
++/* x86/PAE guests: support PDPTs above 4GB. */
   #define VMASST_TYPE_pae_extended_cr3     3
--#define MAX_VMASST_TYPE 3
++
++#define MAX_VMASST_TYPE                  3
   
   #ifndef __ASSEMBLY__
   
@@@ -261,6 -260,6 +384,16 @@@ typedef uint16_t domid_t
   #define DOMID_XEN  (0x7FF2U)
   
   /*
++ * DOMID_COW is used as the owner of sharable pages */
++#define DOMID_COW  (0x7FF3U)
++
++/* DOMID_INVALID is used to identify pages with unknown owner. */
++#define DOMID_INVALID (0x7FF4U)
++
++/* Idle domain. */
++#define DOMID_IDLE (0x7FFFU)
++
++/*
    * Send an array of these to HYPERVISOR_mmu_update().
    * NB. The fields are natural pointer/address size for this architecture.
    */
@@@ -269,6 -268,6 +402,8 @@@ struct mmu_update 
       uint64_t val;       /* New contents of PTE.    */
   };
   DEFINE_GUEST_HANDLE_STRUCT(mmu_update);
++typedef struct mmu_update mmu_update_t;
++DEFINE_XEN_GUEST_HANDLE(mmu_update_t);
   
   /*
    * Send an array of these to HYPERVISOR_multicall().
@@@ -276,10 -275,10 +411,16 @@@
    */
   struct multicall_entry {
       unsigned long op;
++#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++    unsigned long result;
++#else
       long result;
++#endif
       unsigned long args[6];
   };
   DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
++typedef struct multicall_entry multicall_entry_t;
++DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
   
   /*
    * Event channel endpoints per domain:
@@@ -288,173 -287,173 +429,274 @@@
   #define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
   
   struct vcpu_time_info {
--      /*
--       * Updates to the following values are preceded and followed
--       * by an increment of 'version'. The guest can therefore
--       * detect updates by looking for changes to 'version'. If the
--       * least-significant bit of the version number is set then an
--       * update is in progress and the guest must wait to read a
--       * consistent set of values.  The correct way to interact with
--       * the version number is similar to Linux's seqlock: see the
--       * implementations of read_seqbegin/read_seqretry.
--       */
--      uint32_t version;
--      uint32_t pad0;
--      uint64_t tsc_timestamp;   /* TSC at last update of time vals.  */
--      uint64_t system_time;     /* Time, in nanosecs, since boot.    */
--      /*
--       * Current system time:
--       *   system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
--       * CPU frequency (Hz):
--       *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
--       */
--      uint32_t tsc_to_system_mul;
--      int8_t   tsc_shift;
--      int8_t   pad1[3];
++    /*
++     * Updates to the following values are preceded and followed by an
++     * increment of 'version'. The guest can therefore detect updates by
++     * looking for changes to 'version'. If the least-significant bit of
++     * the version number is set then an update is in progress and the guest
++     * must wait to read a consistent set of values.
++     * The correct way to interact with the version number is similar to
++     * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry.
++     */
++    uint32_t version;
++    uint32_t pad0;
++    uint64_t tsc_timestamp;   /* TSC at last update of time vals.  */
++    uint64_t system_time;     /* Time, in nanosecs, since boot.    */
++    /*
++     * Current system time:
++     *   system_time +
++     *   ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)
++     * CPU frequency (Hz):
++     *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
++     */
++    uint32_t tsc_to_system_mul;
++    int8_t   tsc_shift;
++    int8_t   pad1[3];
   }; /* 32 bytes */
++typedef struct vcpu_time_info vcpu_time_info_t;
   
   struct vcpu_info {
--      /*
--       * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
--       * a pending notification for a particular VCPU. It is then cleared
--       * by the guest OS /before/ checking for pending work, thus avoiding
--       * a set-and-check race. Note that the mask is only accessed by Xen
--       * on the CPU that is currently hosting the VCPU. This means that the
--       * pending and mask flags can be updated by the guest without special
--       * synchronisation (i.e., no need for the x86 LOCK prefix).
--       * This may seem suboptimal because if the pending flag is set by
--       * a different CPU then an IPI may be scheduled even when the mask
--       * is set. However, note:
--       *  1. The task of 'interrupt holdoff' is covered by the per-event-
--       *     channel mask bits. A 'noisy' event that is continually being
--       *     triggered can be masked at source at this very precise
--       *     granularity.
--       *  2. The main purpose of the per-VCPU mask is therefore to restrict
--       *     reentrant execution: whether for concurrency control, or to
--       *     prevent unbounded stack usage. Whatever the purpose, we expect
--       *     that the mask will be asserted only for short periods at a time,
--       *     and so the likelihood of a 'spurious' IPI is suitably small.
--       * The mask is read before making an event upcall to the guest: a
--       * non-zero mask therefore guarantees that the VCPU will not receive
--       * an upcall activation. The mask is cleared when the VCPU requests
--       * to block: this avoids wakeup-waiting races.
--       */
--      uint8_t evtchn_upcall_pending;
--      uint8_t evtchn_upcall_mask;
--      unsigned long evtchn_pending_sel;
--      struct arch_vcpu_info arch;
--      struct pvclock_vcpu_time_info time;
++    /*
++     * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
++     * a pending notification for a particular VCPU. It is then cleared
++     * by the guest OS /before/ checking for pending work, thus avoiding
++     * a set-and-check race. Note that the mask is only accessed by Xen
++     * on the CPU that is currently hosting the VCPU. This means that the
++     * pending and mask flags can be updated by the guest without special
++     * synchronisation (i.e., no need for the x86 LOCK prefix).
++     * This may seem suboptimal because if the pending flag is set by
++     * a different CPU then an IPI may be scheduled even when the mask
++     * is set. However, note:
++     *  1. The task of 'interrupt holdoff' is covered by the per-event-
++     *     channel mask bits. A 'noisy' event that is continually being
++     *     triggered can be masked at source at this very precise
++     *     granularity.
++     *  2. The main purpose of the per-VCPU mask is therefore to restrict
++     *     reentrant execution: whether for concurrency control, or to
++     *     prevent unbounded stack usage. Whatever the purpose, we expect
++     *     that the mask will be asserted only for short periods at a time,
++     *     and so the likelihood of a 'spurious' IPI is suitably small.
++     * The mask is read before making an event upcall to the guest: a
++     * non-zero mask therefore guarantees that the VCPU will not receive
++     * an upcall activation. The mask is cleared when the VCPU requests
++     * to block: this avoids wakeup-waiting races.
++     */
++    uint8_t evtchn_upcall_pending;
++    uint8_t evtchn_upcall_mask;
++    unsigned long evtchn_pending_sel;
++    struct arch_vcpu_info arch;
++#ifdef CONFIG_PARAVIRT_XEN
++    struct pvclock_vcpu_time_info time;
++#else
++    struct vcpu_time_info time;
++#endif
   }; /* 64 bytes (x86) */
++#ifndef __XEN__
++typedef struct vcpu_info vcpu_info_t;
++#endif
   
   /*
    * Xen/kernel shared data -- pointer provided in start_info.
-- * NB. We expect that this struct is smaller than a page.
++ *
++ * This structure is defined to be both smaller than a page, and the
++ * only data on the shared page, but may vary in actual size even within
++ * compatible Xen versions; guests should not rely on the size
++ * of this structure remaining constant.
    */
   struct shared_info {
--      struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
--
--      /*
--       * A domain can create "event channels" on which it can send and receive
--       * asynchronous event notifications. There are three classes of event that
--       * are delivered by this mechanism:
--       *  1. Bi-directional inter- and intra-domain connections. Domains must
--       *     arrange out-of-band to set up a connection (usually by allocating
--       *     an unbound 'listener' port and avertising that via a storage service
--       *     such as xenstore).
--       *  2. Physical interrupts. A domain with suitable hardware-access
--       *     privileges can bind an event-channel port to a physical interrupt
--       *     source.
--       *  3. Virtual interrupts ('events'). A domain can bind an event-channel
--       *     port to a virtual interrupt source, such as the virtual-timer
--       *     device or the emergency console.
--       *
--       * Event channels are addressed by a "port index". Each channel is
--       * associated with two bits of information:
--       *  1. PENDING -- notifies the domain that there is a pending notification
--       *     to be processed. This bit is cleared by the guest.
--       *  2. MASK -- if this bit is clear then a 0->1 transition of PENDING
--       *     will cause an asynchronous upcall to be scheduled. This bit is only
--       *     updated by the guest. It is read-only within Xen. If a channel
--       *     becomes pending while the channel is masked then the 'edge' is lost
--       *     (i.e., when the channel is unmasked, the guest must manually handle
--       *     pending notifications as no upcall will be scheduled by Xen).
--       *
--       * To expedite scanning of pending notifications, any 0->1 pending
--       * transition on an unmasked channel causes a corresponding bit in a
--       * per-vcpu selector word to be set. Each bit in the selector covers a
--       * 'C long' in the PENDING bitfield array.
--       */
--      unsigned long evtchn_pending[sizeof(unsigned long) * 8];
--      unsigned long evtchn_mask[sizeof(unsigned long) * 8];
--
--      /*
--       * Wallclock time: updated only by control software. Guests should base
--       * their gettimeofday() syscall on this wallclock-base value.
--       */
--      struct pvclock_wall_clock wc;
--
--      struct arch_shared_info arch;
++    struct vcpu_info vcpu_info[XEN_LEGACY_MAX_VCPUS];
++
++    /*
++     * A domain can create "event channels" on which it can send and receive
++     * asynchronous event notifications. There are three classes of event that
++     * are delivered by this mechanism:
++     *  1. Bi-directional inter- and intra-domain connections. Domains must
++     *     arrange out-of-band to set up a connection (usually by allocating
++     *     an unbound 'listener' port and advertising that via a storage service
++     *     such as xenstore).
++     *  2. Physical interrupts. A domain with suitable hardware-access
++     *     privileges can bind an event-channel port to a physical interrupt
++     *     source.
++     *  3. Virtual interrupts ('events'). A domain can bind an event-channel
++     *     port to a virtual interrupt source, such as the virtual-timer
++     *     device or the emergency console.
++     *
++     * Event channels are addressed by a "port index". Each channel is
++     * associated with two bits of information:
++     *  1. PENDING -- notifies the domain that there is a pending notification
++     *     to be processed. This bit is cleared by the guest.
++     *  2. MASK -- if this bit is clear then a 0->1 transition of PENDING
++     *     will cause an asynchronous upcall to be scheduled. This bit is only
++     *     updated by the guest. It is read-only within Xen. If a channel
++     *     becomes pending while the channel is masked then the 'edge' is lost
++     *     (i.e., when the channel is unmasked, the guest must manually handle
++     *     pending notifications as no upcall will be scheduled by Xen).
++     *
++     * To expedite scanning of pending notifications, any 0->1 pending
++     * transition on an unmasked channel causes a corresponding bit in a
++     * per-vcpu selector word to be set. Each bit in the selector covers a
++     * 'C long' in the PENDING bitfield array.
++     */
++    unsigned long evtchn_pending[sizeof(unsigned long) * 8];
++    unsigned long evtchn_mask[sizeof(unsigned long) * 8];
++
++    /*
++     * Wallclock time: updated only by control software. Guests should base
++     * their gettimeofday() syscall on this wallclock-base value.
++     */
++#ifdef CONFIG_PARAVIRT_XEN
++    struct pvclock_wall_clock wc;
++#else
++    uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
++    uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
++    uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
++#endif
++
++    struct arch_shared_info arch;
   
   };
++#ifndef __XEN__
++typedef struct shared_info shared_info_t;
++#endif
   
   /*
-- * Start-of-day memory layout for the initial domain (DOM0):
++ * Start-of-day memory layout:
    *  1. The domain is started within contiguous virtual-memory region.
-- *  2. The contiguous region begins and ends on an aligned 4MB boundary.
-- *  3. The region start corresponds to the load address of the OS image.
-- *     If the load address is not 4MB aligned then the address is rounded down.
-- *  4. This the order of bootstrap elements in the initial virtual region:
++ *  2. The contiguous region ends on an aligned 4MB boundary.
++ *  3. This is the order of bootstrap elements in the initial virtual region:
    *      a. relocated kernel image
    *      b. initial ram disk              [mod_start, mod_len]
    *      c. list of allocated page frames [mfn_list, nr_pages]
++ *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
    *      d. start_info_t structure        [register ESI (x86)]
    *      e. bootstrap page tables         [pt_base, CR3 (x86)]
    *      f. bootstrap stack               [register ESP (x86)]
-- *  5. Bootstrap elements are packed together, but each is 4kB-aligned.
-- *  6. The initial ram disk may be omitted.
-- *  7. The list of page frames forms a contiguous 'pseudo-physical' memory
++ *  4. Bootstrap elements are packed together, but each is 4kB-aligned.
++ *  5. The initial ram disk may be omitted.
++ *  6. The list of page frames forms a contiguous 'pseudo-physical' memory
    *     layout for the domain. In particular, the bootstrap virtual-memory
    *     region is a 1:1 mapping to the first section of the pseudo-physical map.
-- *  8. All bootstrap elements are mapped read-writable for the guest OS. The
++ *  7. All bootstrap elements are mapped read-writable for the guest OS. The
    *     only exception is the bootstrap page table, which is mapped read-only.
-- *  9. There is guaranteed to be at least 512kB padding after the final
++ *  8. There is guaranteed to be at least 512kB padding after the final
    *     bootstrap element. If necessary, the bootstrap virtual region is
    *     extended by an extra 4MB to ensure this.
    */
   
   #define MAX_GUEST_CMDLINE 1024
   struct start_info {
--      /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME.    */
--      char magic[32];             /* "xen-<version>-<platform>".            */
--      unsigned long nr_pages;     /* Total pages allocated to this domain.  */
--      unsigned long shared_info;  /* MACHINE address of shared info struct. */
--      uint32_t flags;             /* SIF_xxx flags.                         */
--      unsigned long store_mfn;    /* MACHINE page number of shared page.    */
--      uint32_t store_evtchn;      /* Event channel for store communication. */
--      union {
--              struct {
--                      unsigned long mfn;  /* MACHINE page number of console page.   */
--                      uint32_t  evtchn;   /* Event channel for console page.        */
--              } domU;
--              struct {
--                      uint32_t info_off;  /* Offset of console_info struct.         */
--                      uint32_t info_size; /* Size of console_info struct from start.*/
--              } dom0;
--      } console;
--      /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME).     */
--      unsigned long pt_base;      /* VIRTUAL address of page directory.     */
--      unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames.       */
--      unsigned long mfn_list;     /* VIRTUAL address of page-frame list.    */
--      unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
--      unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
--      int8_t cmd_line[MAX_GUEST_CMDLINE];
++    /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME.    */
++    char magic[32];             /* "xen-<version>-<platform>".            */
++    unsigned long nr_pages;     /* Total pages allocated to this domain.  */
++    unsigned long shared_info;  /* MACHINE address of shared info struct. */
++    uint32_t flags;             /* SIF_xxx flags.                         */
++    xen_pfn_t store_mfn;        /* MACHINE page number of shared page.    */
++    uint32_t store_evtchn;      /* Event channel for store communication. */
++    union {
++        struct {
++            xen_pfn_t mfn;      /* MACHINE page number of console page.   */
++            uint32_t  evtchn;   /* Event channel for console page.        */
++        } domU;
++        struct {
++            uint32_t info_off;  /* Offset of console_info struct.         */
++            uint32_t info_size; /* Size of console_info struct from start.*/
++        } dom0;
++    } console;
++    /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME).     */
++    unsigned long pt_base;      /* VIRTUAL address of page directory.     */
++    unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames.       */
++    unsigned long mfn_list;     /* VIRTUAL address of page-frame list.    */
++    unsigned long mod_start;    /* VIRTUAL address of pre-loaded module   */
++                                /* (PFN of pre-loaded module if           */
++                                /*  SIF_MOD_START_PFN set in flags).      */
++    unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
++    int8_t cmd_line[MAX_GUEST_CMDLINE];
++    /* The pfn range here covers both page table and p->m table frames.   */
++    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
++    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
   };
++typedef struct start_info start_info_t;
++
++/* New console union for dom0 introduced in 0x00030203. */
++#if __XEN_INTERFACE_VERSION__ < 0x00030203
++#define console_mfn    console.domU.mfn
++#define console_evtchn console.domU.evtchn
++#endif
   
   /* These flags are passed in the 'flags' field of start_info_t. */
   #define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
   #define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
++#define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
++#define SIF_MOD_START_PFN (1<<3)  /* Is mod_start a PFN? */
++#define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
   
--typedef uint64_t cpumap_t;
++/*
++ * A multiboot module is a package containing modules very similar to a
++ * multiboot module array. The only differences are:
++ * - the array of module descriptors is by convention simply at the beginning
++ *   of the multiboot module,
++ * - addresses in the module descriptors are based on the beginning of the
++ *   multiboot module,
++ * - the number of modules is determined by a termination descriptor that has
++ *   mod_start == 0.
++ *
++ * This permits both building it statically and referencing it in a
++ * configuration file, and lets the PV guest easily rebase the addresses to
++ * virtual addresses while counting the number of modules at the same time.
++ */
++struct xen_multiboot_mod_list
++{
++    /* Address of first byte of the module */
++    uint32_t mod_start;
++    /* Address of last byte of the module (inclusive) */
++    uint32_t mod_end;
++    /* Address of zero-terminated command line */
++    uint32_t cmdline;
++    /* Unused, must be zero */
++    uint32_t pad;
++};
++
++typedef struct dom0_vga_console_info {
++    uint8_t video_type; /* XEN_VGATYPE_??? (defined below). */
++#define XEN_VGATYPE_TEXT_MODE_3 0x03
++#define XEN_VGATYPE_VESA_LFB    0x23
++
++    union {
++        struct {
++            /* Font height, in pixels. */
++            uint16_t font_height;
++            /* Cursor location (column, row). */
++            uint16_t cursor_x, cursor_y;
++            /* Number of rows and columns (dimensions in characters). */
++            uint16_t rows, columns;
++        } text_mode_3;
++
++        struct {
++            /* Width and height, in pixels. */
++            uint16_t width, height;
++            /* Bytes per scan line. */
++            uint16_t bytes_per_line;
++            /* Bits per pixel. */
++            uint16_t bits_per_pixel;
++            /* LFB physical address, and size (in units of 64kB). */
++            uint32_t lfb_base;
++            uint32_t lfb_size;
++            /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
++            uint8_t  red_pos, red_size;
++            uint8_t  green_pos, green_size;
++            uint8_t  blue_pos, blue_size;
++            uint8_t  rsvd_pos, rsvd_size;
++#if __XEN_INTERFACE_VERSION__ >= 0x00030206
++            /* VESA capabilities (offset 0xa, VESA command 0x4f00). */
++            uint32_t gbl_caps;
++            /* Mode attributes (offset 0x0, VESA command 0x4f01). */
++            uint16_t mode_attrs;
++#endif
++        } vesa_lfb;
++    } u;
++} dom0_vga_console_info_t;
++#define xen_vga_console_info dom0_vga_console_info
++#define xen_vga_console_info_t dom0_vga_console_info_t
   
   typedef uint8_t xen_domain_handle_t[16];
   
@@@ -462,27 -461,6 +704,11 @@@
   #define __mk_unsigned_long(x) x ## UL
   #define mk_unsigned_long(x) __mk_unsigned_long(x)
   
- #define TMEM_SPEC_VERSION 1
- 
- struct tmem_op {
-       uint32_t cmd;
-       int32_t pool_id;
-       union {
-               struct {  /* for cmd == TMEM_NEW_POOL */
-                       uint64_t uuid[2];
-                       uint32_t flags;
-               } new;
-               struct {
-                       uint64_t oid[3];
-                       uint32_t index;
-                       uint32_t tmem_offset;
-                       uint32_t pfn_offset;
-                       uint32_t len;
-                       GUEST_HANDLE(void) gmfn; /* guest machine page frame */
-               } gen;
-       } u;
- };
++__DEFINE_XEN_GUEST_HANDLE(uint8,  uint8_t);
++__DEFINE_XEN_GUEST_HANDLE(uint16, uint16_t);
++__DEFINE_XEN_GUEST_HANDLE(uint32, uint32_t);
++__DEFINE_XEN_GUEST_HANDLE(uint64, uint64_t);
+ +
   #else /* __ASSEMBLY__ */
   
   /* In assembly code we cannot use C numeric constant suffixes. */
@@@ -490,4 -468,4 +716,23 @@@
   
   #endif /* !__ASSEMBLY__ */
   
++/* Default definitions for macros used by domctl/sysctl. */
++#if defined(__XEN__) || defined(__XEN_TOOLS__)
++
++#ifndef uint64_aligned_t
++#define uint64_aligned_t uint64_t
++#endif
++#ifndef XEN_GUEST_HANDLE_64
++#define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name)
++#endif
++
++#ifndef __ASSEMBLY__
++struct xenctl_cpumap {
++    XEN_GUEST_HANDLE_64(uint8) bitmap;
++    uint32_t nr_cpus;
++};
++#endif
++
++#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
++
   #endif /* __XEN_PUBLIC_XEN_H__ */
diff --cc include/xen/interface/xenoprof.h

index 0000000,0000000..346d6c5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/xenoprof.h
@@@ -1,0 -1,0 +1,152 @@@
++/******************************************************************************
++ * xenoprof.h
++ * 
++ * Interface for enabling system wide profiling based on hardware performance
++ * counters
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (C) 2005 Hewlett-Packard Co.
++ * Written by Aravind Menon & Jose Renato Santos
++ */
++
++#ifndef __XEN_PUBLIC_XENOPROF_H__
++#define __XEN_PUBLIC_XENOPROF_H__
++
++#include "xen.h"
++
++/*
++ * Commands to HYPERVISOR_xenoprof_op().
++ */
++#define XENOPROF_init                0
++#define XENOPROF_reset_active_list   1
++#define XENOPROF_reset_passive_list  2
++#define XENOPROF_set_active          3
++#define XENOPROF_set_passive         4
++#define XENOPROF_reserve_counters    5
++#define XENOPROF_counter             6
++#define XENOPROF_setup_events        7
++#define XENOPROF_enable_virq         8
++#define XENOPROF_start               9
++#define XENOPROF_stop               10
++#define XENOPROF_disable_virq       11
++#define XENOPROF_release_counters   12
++#define XENOPROF_shutdown           13
++#define XENOPROF_get_buffer         14
++#define XENOPROF_set_backtrace      15
++
++/* AMD IBS support */
++#define XENOPROF_get_ibs_caps       16
++#define XENOPROF_ibs_counter        17
++#define XENOPROF_last_op            17
++
++#define MAX_OPROF_EVENTS    32
++#define MAX_OPROF_DOMAINS   25
++#define XENOPROF_CPU_TYPE_SIZE 64
++
++/* Xenoprof performance events (not Xen events) */
++struct event_log {
++    uint64_t eip;
++    uint8_t mode;
++    uint8_t event;
++};
++
++/* PC value that indicates a special code. NOTE(review): ~0UL is narrower than uint64_t eip on 32-bit builds; confirm. */
++#define XENOPROF_ESCAPE_CODE ~0UL
++/* Transient events for the xenoprof->oprofile cpu buf */
++#define XENOPROF_TRACE_BEGIN 1
++
++/* Xenoprof buffer shared between Xen and domain - 1 per VCPU */
++struct xenoprof_buf {
++    uint32_t event_head;
++    uint32_t event_tail;
++    uint32_t event_size;
++    uint32_t vcpu_id;
++    uint64_t xen_samples;
++    uint64_t kernel_samples;
++    uint64_t user_samples;
++    uint64_t lost_samples;
++    struct event_log event_log[1];
++};
++#ifndef __XEN__
++typedef struct xenoprof_buf xenoprof_buf_t;
++DEFINE_XEN_GUEST_HANDLE(xenoprof_buf_t);
++#endif
++
++struct xenoprof_init {
++    int32_t  num_events;
++    int32_t  is_primary;
++    char cpu_type[XENOPROF_CPU_TYPE_SIZE];
++};
++typedef struct xenoprof_init xenoprof_init_t;
++DEFINE_XEN_GUEST_HANDLE(xenoprof_init_t);
++
++struct xenoprof_get_buffer {
++    int32_t  max_samples;
++    int32_t  nbuf;
++    int32_t  bufsize;
++    uint64_t buf_gmaddr;
++};
++typedef struct xenoprof_get_buffer xenoprof_get_buffer_t;
++DEFINE_XEN_GUEST_HANDLE(xenoprof_get_buffer_t);
++
++struct xenoprof_counter {
++    uint32_t ind;
++    uint64_t count;
++    uint32_t enabled;
++    uint32_t event;
++    uint32_t hypervisor;
++    uint32_t kernel;
++    uint32_t user;
++    uint64_t unit_mask;
++};
++typedef struct xenoprof_counter xenoprof_counter_t;
++DEFINE_XEN_GUEST_HANDLE(xenoprof_counter_t);
++
++typedef struct xenoprof_passive {
++    uint16_t domain_id;
++    int32_t  max_samples;
++    int32_t  nbuf;
++    int32_t  bufsize;
++    uint64_t buf_gmaddr;
++} xenoprof_passive_t;
++DEFINE_XEN_GUEST_HANDLE(xenoprof_passive_t);
++
++struct xenoprof_ibs_counter {
++    uint64_t op_enabled;
++    uint64_t fetch_enabled;
++    uint64_t max_cnt_fetch;
++    uint64_t max_cnt_op;
++    uint64_t rand_en;
++    uint64_t dispatched_ops;
++};
++typedef struct xenoprof_ibs_counter xenoprof_ibs_counter_t;
++DEFINE_XEN_GUEST_HANDLE(xenoprof_ibs_counter_t);
++
++#endif /* __XEN_PUBLIC_XENOPROF_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/xsm/acm.h

index 0000000,0000000..b6ac8d5

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/xsm/acm.h
@@@ -1,0 -1,0 +1,223 @@@
++/*
++ * acm.h: Xen access control module interface definitions
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Reiner Sailer <sailer@watson.ibm.com>
++ * Copyright (c) 2005, International Business Machines Corporation.
++ */
++
++#ifndef _XEN_PUBLIC_ACM_H
++#define _XEN_PUBLIC_ACM_H
++
++#include "../xen.h"
++
++/* default ssid reference value if not supplied */
++#define ACM_DEFAULT_SSID  0x0
++#define ACM_DEFAULT_LOCAL_SSID  0x0
++
++/* Internal ACM ERROR types */
++#define ACM_OK     0
++#define ACM_UNDEF   -1
++#define ACM_INIT_SSID_ERROR  -2
++#define ACM_INIT_SOID_ERROR  -3
++#define ACM_ERROR          -4
++
++/* External ACCESS DECISIONS */
++#define ACM_ACCESS_PERMITTED        0
++#define ACM_ACCESS_DENIED           -111
++#define ACM_NULL_POINTER_ERROR      -200
++
++/*
++   Error codes reported when trying to test for a new policy.
++   These error codes are reported in an array of tuples where
++   each error code is followed by a parameter describing the error
++   more closely, such as a domain id.
++*/
++#define ACM_EVTCHN_SHARING_VIOLATION       0x100
++#define ACM_GNTTAB_SHARING_VIOLATION       0x101
++#define ACM_DOMAIN_LOOKUP                  0x102
++#define ACM_CHWALL_CONFLICT                0x103
++#define ACM_SSIDREF_IN_USE                 0x104
++
++
++/* primary policy in lower 4 bits */
++#define ACM_NULL_POLICY 0
++#define ACM_CHINESE_WALL_POLICY 1
++#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
++#define ACM_POLICY_UNDEFINED 15
++
++/* combinations have secondary policy component in higher 4 bits */
++#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \
++    ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY)
++
++/* printable name of policy code X; note that X is evaluated more than once */
++#define ACM_POLICY_NAME(X) \
++    (((X) == (ACM_NULL_POLICY)) ? "NULL" :                       \
++    ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL" :        \
++    ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT" : \
++    ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT" : \
++     "UNDEFINED")
++
++/* the following policy versions must be increased
++ * whenever the interpretation of the related
++ * policy's data structure changes
++ */
++#define ACM_POLICY_VERSION 4
++#define ACM_CHWALL_VERSION 1
++#define ACM_STE_VERSION  1
++
++/* defines a ssid reference used by xen */
++typedef uint32_t ssidref_t;
++
++/* hooks that are known to domains */
++#define ACMHOOK_none          0
++#define ACMHOOK_sharing       1
++#define ACMHOOK_authorization 2
++#define ACMHOOK_conflictset   3
++
++/* -------security policy relevant type definitions-------- */
++
++/* type identifier; compares to "equal" or "not equal" */
++typedef uint16_t domaintype_t;
++
++/* CHINESE WALL POLICY DATA STRUCTURES
++ *
++ * current accumulated conflict type set:
++ * When a domain is started and has a type that is in
++ * a conflict set, the conflicting types are incremented in
++ * the aggregate set. When a domain is destroyed, the 
++ * conflicting types to its type are decremented.
++ * If a domain has multiple types, this procedure works over
++ * all those types.
++ *
++ * conflict_aggregate_set[i] holds the number of
++ *   running domains that have a conflict with type i.
++ *
++ * running_types[i] holds the number of running domains
++ *        that include type i in their ssidref-referenced type set
++ *
++ * conflict_sets[i][j] is "0" if type j has no conflict
++ *    with type i and is "1" otherwise.
++ */
++/* high-16 = version, low-16 = check magic */
++#define ACM_MAGIC  0x0001debc
++
++/* size of the SHA1 hash identifying the XML policy from which the
++   binary policy was created */
++#define ACM_SHA1_HASH_SIZE    20
++
++/* each offset in bytes from start of the struct they
++ * are part of */
++
++/* V3 of the policy buffer added a version structure */
++struct acm_policy_version
++{
++    uint32_t major;
++    uint32_t minor;
++};
++
++
++/* each buffer consists of all policy information for
++ * the respective policy given in the policy code
++ *
++ * acm_policy_buffer, acm_chwall_policy_buffer,
++ * and acm_ste_policy_buffer need to stay 32-bit aligned
++ * because we create binary policies also with external
++ * tools that assume packed representations (e.g. the java tool)
++ */
++struct acm_policy_buffer {
++    uint32_t magic;
++    uint32_t policy_version; /* ACM_POLICY_VERSION */
++    uint32_t len;
++    uint32_t policy_reference_offset;
++    uint32_t primary_policy_code;
++    uint32_t primary_buffer_offset;
++    uint32_t secondary_policy_code;
++    uint32_t secondary_buffer_offset;
++    struct acm_policy_version xml_pol_version; /* added in V3 */
++    uint8_t xml_policy_hash[ACM_SHA1_HASH_SIZE]; /* added in V4 */
++};
++
++
++struct acm_policy_reference_buffer {
++    uint32_t len;
++};
++
++struct acm_chwall_policy_buffer {
++    uint32_t policy_version; /* ACM_CHWALL_VERSION */
++    uint32_t policy_code;
++    uint32_t chwall_max_types;
++    uint32_t chwall_max_ssidrefs;
++    uint32_t chwall_max_conflictsets;
++    uint32_t chwall_ssid_offset;
++    uint32_t chwall_conflict_sets_offset;
++    uint32_t chwall_running_types_offset;
++    uint32_t chwall_conflict_aggregate_offset;
++};
++
++struct acm_ste_policy_buffer {
++    uint32_t policy_version; /* ACM_STE_VERSION */
++    uint32_t policy_code;
++    uint32_t ste_max_types;
++    uint32_t ste_max_ssidrefs;
++    uint32_t ste_ssid_offset;
++};
++
++struct acm_stats_buffer {
++    uint32_t magic;
++    uint32_t len;
++    uint32_t primary_policy_code;
++    uint32_t primary_stats_offset;
++    uint32_t secondary_policy_code;
++    uint32_t secondary_stats_offset;
++};
++
++struct acm_ste_stats_buffer {
++    uint32_t ec_eval_count;
++    uint32_t gt_eval_count;
++    uint32_t ec_denied_count;
++    uint32_t gt_denied_count;
++    uint32_t ec_cachehit_count;
++    uint32_t gt_cachehit_count;
++};
++
++struct acm_ssid_buffer {
++    uint32_t len;
++    ssidref_t ssidref;
++    uint32_t policy_reference_offset;
++    uint32_t primary_policy_code;
++    uint32_t primary_max_types;
++    uint32_t primary_types_offset;
++    uint32_t secondary_policy_code;
++    uint32_t secondary_max_types;
++    uint32_t secondary_types_offset;
++};
++
++#endif
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/xsm/acm_ops.h

index 0000000,0000000..1fef7a0

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/xsm/acm_ops.h
@@@ -1,0 -1,0 +1,159 @@@
++/*
++ * acm_ops.h: Xen access control module hypervisor commands
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Reiner Sailer <sailer@watson.ibm.com>
++ * Copyright (c) 2005,2006 International Business Machines Corporation.
++ */
++
++#ifndef __XEN_PUBLIC_ACM_OPS_H__
++#define __XEN_PUBLIC_ACM_OPS_H__
++
++#include "../xen.h"
++#include "acm.h"
++
++/*
++ * Make sure you increment the interface version whenever you modify this file!
++ * This makes sure that old versions of acm tools will stop working in a
++ * well-defined way (rather than crashing the machine, for instance).
++ */
++#define ACM_INTERFACE_VERSION   0xAAAA000A
++
++/************************************************************************/
++
++/*
++ * Prototype for this hypercall is:
++ *  int acm_op(int cmd, void *args)
++ * @cmd  == ACMOP_??? (access control module operation).
++ * @args == Operation-specific extra arguments (NULL if none).
++ */
++
++
++#define ACMOP_setpolicy         1
++struct acm_setpolicy {
++    /* IN */
++    XEN_GUEST_HANDLE_64(void) pushcache;
++    uint32_t pushcache_size;
++};
++
++
++#define ACMOP_getpolicy         2
++struct acm_getpolicy {
++    /* IN */
++    XEN_GUEST_HANDLE_64(void) pullcache;
++    uint32_t pullcache_size;
++};
++
++
++#define ACMOP_dumpstats         3
++struct acm_dumpstats {
++    /* IN */
++    XEN_GUEST_HANDLE_64(void) pullcache;
++    uint32_t pullcache_size;
++};
++
++
++#define ACMOP_getssid           4
++#define ACM_GETBY_ssidref  1
++#define ACM_GETBY_domainid 2
++struct acm_getssid {
++    /* IN */
++    uint32_t get_ssid_by; /* ACM_GETBY_* */
++    union {
++        domaintype_t domainid;
++        ssidref_t    ssidref;
++    } id;
++    XEN_GUEST_HANDLE_64(void) ssidbuf;
++    uint32_t ssidbuf_size;
++};
++
++#define ACMOP_getdecision      5
++struct acm_getdecision {
++    /* IN */
++    uint32_t get_decision_by1; /* ACM_GETBY_* */
++    uint32_t get_decision_by2; /* ACM_GETBY_* */
++    union {
++        domaintype_t domainid;
++        ssidref_t    ssidref;
++    } id1;
++    union {
++        domaintype_t domainid;
++        ssidref_t    ssidref;
++    } id2;
++    uint32_t hook;
++    /* OUT */
++    uint32_t acm_decision;
++};
++
++
++#define ACMOP_chgpolicy        6
++struct acm_change_policy {
++    /* IN */
++    XEN_GUEST_HANDLE_64(void) policy_pushcache;
++    uint32_t policy_pushcache_size;
++    XEN_GUEST_HANDLE_64(void) del_array;
++    uint32_t delarray_size;
++    XEN_GUEST_HANDLE_64(void) chg_array;
++    uint32_t chgarray_size;
++    /* OUT */
++    /* array with error code */
++    XEN_GUEST_HANDLE_64(void) err_array;
++    uint32_t errarray_size;
++};
++
++#define ACMOP_relabeldoms       7
++struct acm_relabel_doms {
++    /* IN */
++    XEN_GUEST_HANDLE_64(void) relabel_map;
++    uint32_t relabel_map_size;
++    /* OUT */
++    XEN_GUEST_HANDLE_64(void) err_array;
++    uint32_t errarray_size;
++};
++
++/* future interface to Xen */
++struct xen_acmctl {
++    uint32_t cmd;
++    uint32_t interface_version;
++    union {
++        struct acm_setpolicy     setpolicy;
++        struct acm_getpolicy     getpolicy;
++        struct acm_dumpstats     dumpstats;
++        struct acm_getssid       getssid;
++        struct acm_getdecision   getdecision;
++        struct acm_change_policy change_policy;
++        struct acm_relabel_doms  relabel_doms;
++    } u;
++};
++
++typedef struct xen_acmctl xen_acmctl_t;
++DEFINE_XEN_GUEST_HANDLE(xen_acmctl_t);
++
++#endif /* __XEN_PUBLIC_ACM_OPS_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
diff --cc include/xen/interface/xsm/flask_op.h

index 0000000,0000000..e2dd403

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/interface/xsm/flask_op.h
@@@ -1,0 -1,0 +1,61 @@@
++/*
++ *  This file contains the flask_op hypercall commands and definitions.
++ *
++ *  Author:  George Coker, <gscoker@alpha.ncsc.mil>
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __FLASK_OP_H__
++#define __FLASK_OP_H__
++
++#define FLASK_LOAD              1
++#define FLASK_GETENFORCE        2
++#define FLASK_SETENFORCE        3
++#define FLASK_CONTEXT_TO_SID    4
++#define FLASK_SID_TO_CONTEXT    5
++#define FLASK_ACCESS            6
++#define FLASK_CREATE            7
++#define FLASK_RELABEL           8
++#define FLASK_USER              9
++#define FLASK_POLICYVERS        10
++#define FLASK_GETBOOL           11
++#define FLASK_SETBOOL           12
++#define FLASK_COMMITBOOLS       13
++#define FLASK_MLS               14
++#define FLASK_DISABLE           15
++#define FLASK_GETAVC_THRESHOLD  16
++#define FLASK_SETAVC_THRESHOLD  17
++#define FLASK_AVC_HASHSTATS     18
++#define FLASK_AVC_CACHESTATS    19
++#define FLASK_MEMBER            20
++#define FLASK_ADD_OCONTEXT      21
++#define FLASK_DEL_OCONTEXT      22
++
++#define FLASK_LAST              FLASK_DEL_OCONTEXT
++
++typedef struct flask_op {
++    uint32_t  cmd;
++    uint32_t  size;
++    char      *buf;
++} flask_op_t;
++
++DEFINE_XEN_GUEST_HANDLE(flask_op_t);
++
++#endif
diff --cc include/xen/net-util.h

index 0000000,0000000..8561e2c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/net-util.h
@@@ -1,0 -1,0 +1,75 @@@
++#ifndef __XEN_NETUTIL_H__
++#define __XEN_NETUTIL_H__
++
++#include <linux/kernel.h>
++#include <linux/skbuff.h>
++#include <linux/tcp.h>
++#include <linux/udp.h>
++#include <net/ip.h>
++
++/*
++ * skb_checksum_setup - fill in checksum-offload metadata for an IPv4 skb
++ * @skb: packet whose data pointer is read as the start of the IPv4 header
++ * @fixup_counter: incremented for every GSO skb that arrives without
++ *                 CHECKSUM_PARTIAL and therefore has to be repaired here
++ *
++ * Sets skb->csum_start and skb->csum_offset for TCP and UDP packets.
++ * Returns 0 on success or when no setup is needed, -EPROTO if the frame is
++ * not IPv4 TCP/UDP or the checksum field lies beyond skb_tail_pointer().
++ */
++static inline int skb_checksum_setup(struct sk_buff *skb,
++                                   unsigned long *fixup_counter)
++{
++      struct iphdr *iph = (void *)skb->data;
++      unsigned char *th;
++      __be16 *csum = NULL;
++      bool recalculate = false;
++      int err = -EPROTO;
++
++      if (skb->ip_summed != CHECKSUM_PARTIAL) {
++              /* A non-CHECKSUM_PARTIAL SKB does not require setup. */
++              if (!skb_is_gso(skb))
++                      return 0;
++
++              /*
++               * A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
++               * peers can fail to set NETRXF_csum_blank when sending a GSO
++               * frame. In this case force the SKB to CHECKSUM_PARTIAL and
++               * recalculate the partial checksum.
++               */
++              ++*fixup_counter;
++              recalculate = true;
++      }
++
++      if (skb->protocol != htons(ETH_P_IP))
++              goto out;
++
++      th = skb->data + 4 * iph->ihl;
++      if (th >= skb_tail_pointer(skb))
++              goto out;
++
++      skb->csum_start = th - skb->head;
++      switch (iph->protocol) {
++      case IPPROTO_TCP:
++              skb->csum_offset = offsetof(struct tcphdr, check);
++              if (recalculate)
++                      csum = &((struct tcphdr *)th)->check;
++              break;
++      case IPPROTO_UDP:
++              skb->csum_offset = offsetof(struct udphdr, check);
++              if (recalculate)
++                      csum = &((struct udphdr *)th)->check;
++              break;
++      default:
++              if (net_ratelimit())
++                      pr_err("Attempting to checksum a non-"
++                             "TCP/UDP packet, dropping a protocol"
++                             " %d packet\n", iph->protocol);
++              goto out;
++      }
++
++      if ((th + skb->csum_offset + sizeof(*csum)) > skb_tail_pointer(skb))
++              goto out;
++
++      if (csum) {
++              /*
++               * Seed the checksum field with the pseudo-header sum of the
++               * protocol actually carried (TCP or UDP), not always TCP.
++               */
++              *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
++                                         skb->len - iph->ihl*4,
++                                         iph->protocol, 0);
++              skb->ip_summed = CHECKSUM_PARTIAL;
++      }
++
++      err = 0;
++out:
++      return err;
++}
++
++#endif /* __XEN_NETUTIL_H__ */
diff --cc include/xen/pcifront.h

index 0000000,0000000..bb5713f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/pcifront.h
@@@ -1,0 -1,0 +1,69 @@@
++/*
++ * PCI Frontend - arch-dependent declarations
++ *
++ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#ifndef __XEN_ASM_PCIFRONT_H__
++#define __XEN_ASM_PCIFRONT_H__
++
++#include <linux/spinlock.h>
++
++#ifdef __KERNEL__
++
++#ifndef __ia64__
++
++#include <asm/pci.h>
++
++struct pcifront_device;
++struct pci_bus;
++#define pcifront_sd pci_sysdata
++
++/* Return the pcifront device stored in the pdev field of @sd. */
++static inline struct pcifront_device *
++pcifront_get_pdev(struct pcifront_sd *sd)
++{
++      return sd->pdev;
++}
++
++/*
++ * Record @domain and @pdev in @sd.
++ * NOTE(review): @bus is accepted but not used in this variant.
++ */
++static inline void pcifront_init_sd(struct pcifront_sd *sd,
++                                  unsigned int domain, unsigned int bus,
++                                  struct pcifront_device *pdev)
++{
++      sd->domain = domain;
++      sd->pdev = pdev;
++}
++
++/* No-op in the non-ia64 variant; the ia64 variant does the real work. */
++static inline void pcifront_setup_root_resources(struct pci_bus *bus,
++                                               struct pcifront_sd *sd)
++{
++}
++
++#else /* __ia64__ */
++
++#include <linux/acpi.h>
++#include <asm/pci.h>
++#define pcifront_sd pci_controller
++
++extern void xen_add_resource(struct pci_controller *, unsigned int,
++                           unsigned int, struct acpi_resource *);
++extern void xen_pcibios_setup_root_windows(struct pci_bus *,
++                                         struct pci_controller *);
++
++/* Return the pcifront device kept in the platform_data field of @sd. */
++static inline struct pcifront_device *
++pcifront_get_pdev(struct pcifront_sd *sd)
++{
++      return (struct pcifront_device *)sd->platform_data;
++}
++
++/* Hand root bus setup for @bus/@sd to xen_pcibios_setup_root_windows(). */
++static inline void pcifront_setup_root_resources(struct pci_bus *bus,
++                                               struct pcifront_sd *sd)
++{
++      xen_pcibios_setup_root_windows(bus, sd);
++}
++
++#endif /* __ia64__ */
++
++extern struct rw_semaphore pci_bus_sem;
++
++#endif /* __KERNEL__ */
++
++#endif /* __XEN_ASM_PCIFRONT_H__ */
diff --cc include/xen/pcpu.h

index 0000000,0000000..2bf1871

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/pcpu.h
@@@ -1,0 -1,0 +1,18 @@@
++#ifndef _XEN_PCPU_H
++#define _XEN_PCPU_H
++
++#include <linux/notifier.h>
++
++int register_pcpu_notifier(struct notifier_block *);
++void unregister_pcpu_notifier(struct notifier_block *);
++
++#ifdef CONFIG_X86
++int __must_check rdmsr_safe_on_pcpu(unsigned int pcpu, u32 msr_no,
++                                  u32 *l, u32 *h);
++int __must_check wrmsr_safe_on_pcpu(unsigned int pcpu, u32 msr_no,
++                                  u32 l, u32 h);
++int __must_check rdmsr_safe_regs_on_pcpu(unsigned int pcpu, u32 *regs);
++int __must_check wrmsr_safe_regs_on_pcpu(unsigned int pcpu, u32 *regs);
++#endif
++
++#endif /* _XEN_PCPU_H */
diff --cc include/xen/privcmd.h

index 17857fb,17857fb..2a94439
--- 1/include/xen/privcmd.h
--- 2/include/xen/privcmd.h
+++ b/include/xen/privcmd.h
@@@ -1,77 -1,77 +1,3 @@@
--/******************************************************************************
-- * privcmd.h
-- *
-- * Interface to /proc/xen/privcmd.
-- *
-- * Copyright (c) 2003-2005, K A Fraser
-- *
-- * This program is free software; you can redistribute it and/or
-- * modify it under the terms of the GNU General Public License version 2
-- * as published by the Free Software Foundation; or, when distributed
-- * separately from the Linux kernel or incorporated into other
-- * software packages, subject to the following license:
-- *
-- * Permission is hereby granted, free of charge, to any person obtaining a copy
-- * of this source file (the "Software"), to deal in the Software without
-- * restriction, including without limitation the rights to use, copy, modify,
-- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
-- * and to permit persons to whom the Software is furnished to do so, subject to
-- * the following conditions:
-- *
-- * The above copyright notice and this permission notice shall be included in
-- * all copies or substantial portions of the Software.
-- *
-- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-- * IN THE SOFTWARE.
-- */
--
--#ifndef __LINUX_PUBLIC_PRIVCMD_H__
--#define __LINUX_PUBLIC_PRIVCMD_H__
--
--#include <linux/types.h>
--#include <linux/compiler.h>
--
--typedef unsigned long xen_pfn_t;
--
--struct privcmd_hypercall {
--      __u64 op;
--      __u64 arg[5];
--};
--
--struct privcmd_mmap_entry {
--      __u64 va;
--      __u64 mfn;
--      __u64 npages;
--};
--
--struct privcmd_mmap {
--      int num;
--      domid_t dom; /* target domain */
--      struct privcmd_mmap_entry __user *entry;
--};
--
--struct privcmd_mmapbatch {
--      int num;     /* number of pages to populate */
--      domid_t dom; /* target domain */
--      __u64 addr;  /* virtual address */
--      xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
--};
--
--/*
-- * @cmd: IOCTL_PRIVCMD_HYPERCALL
-- * @arg: &privcmd_hypercall_t
-- * Return: Value returned from execution of the specified hypercall.
-- */
--#define IOCTL_PRIVCMD_HYPERCALL                                       \
--      _IOC(_IOC_NONE, 'P', 0, sizeof(struct privcmd_hypercall))
--#define IOCTL_PRIVCMD_MMAP                                    \
--      _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap))
--#define IOCTL_PRIVCMD_MMAPBATCH                                       \
--      _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch))
--
--#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
++#if defined(CONFIG_PARAVIRT_XEN) || !defined(__KERNEL__)
++#include "public/privcmd.h"
++#endif
diff --cc include/xen/public/Kbuild

index 0000000,0000000..d4f1aa8

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/public/Kbuild
@@@ -1,0 -1,0 +1,5 @@@
++header-y += evtchn.h
++header-y += gntdev.h
++header-y += iomulti.h
++header-y += privcmd.h
++header-y += xenbus.h
diff --cc include/xen/public/evtchn.h

index 0000000,0000000..938d4da

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/public/evtchn.h
@@@ -1,0 -1,0 +1,88 @@@
++/******************************************************************************
++ * evtchn.h
++ * 
++ * Interface to /dev/xen/evtchn.
++ * 
++ * Copyright (c) 2003-2005, K A Fraser
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __LINUX_PUBLIC_EVTCHN_H__
++#define __LINUX_PUBLIC_EVTCHN_H__
++
++/*
++ * Bind a fresh port to VIRQ @virq.
++ * Return allocated port.
++ */
++#define IOCTL_EVTCHN_BIND_VIRQ                                \
++      _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
++struct ioctl_evtchn_bind_virq {
++      unsigned int virq;
++};
++
++/*
++ * Bind a fresh port to remote <@remote_domain, @remote_port>.
++ * Return allocated port.
++ */
++#define IOCTL_EVTCHN_BIND_INTERDOMAIN                 \
++      _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
++struct ioctl_evtchn_bind_interdomain {
++      unsigned int remote_domain, remote_port;
++};
++
++/*
++ * Allocate a fresh port for binding to @remote_domain.
++ * Return allocated port.
++ */
++#define IOCTL_EVTCHN_BIND_UNBOUND_PORT                        \
++      _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
++struct ioctl_evtchn_bind_unbound_port {
++      unsigned int remote_domain;
++};
++
++/*
++ * Unbind previously allocated @port.
++ */
++#define IOCTL_EVTCHN_UNBIND                           \
++      _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
++struct ioctl_evtchn_unbind {
++      unsigned int port;
++};
++
++/*
++ * Notify previously allocated @port.
++ */
++#define IOCTL_EVTCHN_NOTIFY                           \
++      _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
++struct ioctl_evtchn_notify {
++      unsigned int port;
++};
++
++/* Clear and reinitialise the event buffer. Clear error condition. */
++#define IOCTL_EVTCHN_RESET                            \
++      _IOC(_IOC_NONE, 'E', 5, 0)
++
++#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
diff --cc include/xen/public/gntdev.h

index 0000000,0000000..5304bd3

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/public/gntdev.h
@@@ -1,0 -1,0 +1,150 @@@
++/******************************************************************************
++ * gntdev.h
++ * 
++ * Interface to /dev/xen/gntdev.
++ * 
++ * Copyright (c) 2007, D G Murray
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __LINUX_PUBLIC_GNTDEV_H__
++#define __LINUX_PUBLIC_GNTDEV_H__
++
++struct ioctl_gntdev_grant_ref {
++      /* The domain ID of the grant to be mapped. */
++      uint32_t domid;
++      /* The grant reference of the grant to be mapped. */
++      uint32_t ref;
++};
++
++/*
++ * Inserts the grant references into the mapping table of an instance
++ * of gntdev. N.B. This does not perform the mapping, which is deferred
++ * until mmap() is called with @index as the offset.
++ */
++#define IOCTL_GNTDEV_MAP_GRANT_REF \
++_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
++struct ioctl_gntdev_map_grant_ref {
++      /* IN parameters */
++      /* The number of grants to be mapped. */
++      uint32_t count;
++      uint32_t pad;
++      /* OUT parameters */
++      /* The offset to be used on a subsequent call to mmap(). */
++      uint64_t index;
++      /* Variable IN parameter. */
++      /* Array of grant references, of size @count. */
++      struct ioctl_gntdev_grant_ref refs[1];
++};
++
++/*
++ * Removes the grant references from the mapping table of an instance
++ * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
++ * before this ioctl is called, or an error will result.
++ */
++#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
++_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
++struct ioctl_gntdev_unmap_grant_ref {
++      /* IN parameters */
++      /* The offset was returned by the corresponding map operation. */
++      uint64_t index;
++      /* The number of pages to be unmapped. */
++      uint32_t count;
++      uint32_t pad;
++};
++
++/*
++ * Returns the offset in the driver's address space that corresponds
++ * to @vaddr. This can be used to perform a munmap(), followed by an
++ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
++ * the caller. The number of pages that were allocated at the same time as
++ * @vaddr is returned in @count.
++ *
++ * N.B. Where more than one page has been mapped into a contiguous range, the
++ *      supplied @vaddr must correspond to the start of the range; otherwise
++ *      an error will result. It is only possible to munmap() the entire
++ *      contiguously-allocated range at once, and not any subrange thereof.
++ */
++#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
++_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
++struct ioctl_gntdev_get_offset_for_vaddr {
++      /* IN parameters */
++      /* The virtual address of the first mapped page in a range. */
++      uint64_t vaddr;
++      /* OUT parameters */
++      /* The offset that was used in the initial mmap() operation. */
++      uint64_t offset;
++      /* The number of pages mapped in the VM area that begins at @vaddr. */
++      uint32_t count;
++      uint32_t pad;
++};
++
++/*
++ * Sets the maximum number of grants that may be mapped at once by this gntdev
++ * instance.
++ *
++ * N.B. This must be called before any other ioctl is performed on the device.
++ */
++#define IOCTL_GNTDEV_SET_MAX_GRANTS \
++_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
++struct ioctl_gntdev_set_max_grants {
++      /* IN parameter */
++      /* The maximum number of grants that may be mapped at once. */
++      uint32_t count;
++};
++
++/*
++ * Sets up an unmap notification within the page, so that the other side can do
++ * cleanup if this side crashes. Required to implement cross-domain robust
++ * mutexes or close notification on communication channels.
++ *
++ * Each mapped page only supports one notification; multiple calls referring to
++ * the same page overwrite the previous notification. You must clear the
++ * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it
++ * to occur.
++ */
++#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \
++_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify))
++struct ioctl_gntdev_unmap_notify {
++      /* IN parameters */
++      /* Offset in the file descriptor for a byte within the page (same as
++       * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to
++       * be cleared. Otherwise, it can be any byte in the page whose
++       * notification we are adjusting.
++       */
++      uint64_t index;
++      /* Action(s) to take on unmap */
++      uint32_t action;
++      /* Event channel to notify */
++      uint32_t event_channel_port;
++};
++
++/* Clear (set to zero) the byte specified by index */
++#define UNMAP_NOTIFY_CLEAR_BYTE 0x1
++/* Send an interrupt on the indicated event channel */
++#define UNMAP_NOTIFY_SEND_EVENT 0x2
++
++#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
diff --cc include/xen/public/iomulti.h

index 0000000,0000000..ae973f6

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/public/iomulti.h
@@@ -1,0 -1,0 +1,50 @@@
++#ifndef __LINUX_PUBLIC_IOMULTI_H__
++#define __LINUX_PUBLIC_IOMULTI_H__
++/*
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
++ *
++ * Copyright (c) 2009 Isaku Yamahata
++ *                    VA Linux Systems Japan K.K.
++ */
++
++struct pci_iomul_setup {
++      uint16_t        segment;
++      uint8_t         bus;
++      uint8_t         dev;
++      uint8_t         func;
++};
++
++struct pci_iomul_in {
++      uint8_t         bar;
++      uint64_t        offset;
++
++      uint8_t         size;
++      uint32_t        value;
++};
++
++struct pci_iomul_out {
++      uint8_t         bar;
++      uint64_t        offset;
++
++      uint8_t         size;
++      uint32_t        value;
++};
++
++#define PCI_IOMUL_SETUP               _IOW ('P', 0, struct pci_iomul_setup)
++#define PCI_IOMUL_DISABLE_IO  _IO  ('P', 1)
++#define PCI_IOMUL_IN          _IOWR('P', 2, struct pci_iomul_in)
++#define PCI_IOMUL_OUT         _IOW ('P', 3, struct pci_iomul_out)
++
++#endif /* __LINUX_PUBLIC_IOMULTI_H__ */
diff --cc include/xen/public/privcmd.h

index 0000000,0000000..dba4e2e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/public/privcmd.h
@@@ -1,0 -1,0 +1,86 @@@
++/******************************************************************************
++ * privcmd.h
++ * 
++ * Interface to /proc/xen/privcmd.
++ * 
++ * Copyright (c) 2003-2005, K A Fraser
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __LINUX_PUBLIC_PRIVCMD_H__
++#define __LINUX_PUBLIC_PRIVCMD_H__
++
++#include <linux/types.h>
++#include <linux/compiler.h>
++
++typedef struct privcmd_hypercall
++{
++      __u64 op;
++      __u64 arg[5];
++} privcmd_hypercall_t;
++
++typedef struct privcmd_mmap_entry {
++      __u64 va;
++      __u64 mfn;
++      __u64 npages;
++} privcmd_mmap_entry_t; 
++
++typedef struct privcmd_mmap {
++      int num;
++      domid_t dom; /* target domain */
++      privcmd_mmap_entry_t __user *entry;
++} privcmd_mmap_t; 
++
++typedef struct privcmd_mmapbatch {
++      int num;     /* number of pages to populate */
++      domid_t dom; /* target domain */
++      __u64 addr;  /* virtual address */
++      xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
++} privcmd_mmapbatch_t; 
++
++typedef struct privcmd_mmapbatch_v2 {
++      unsigned int num; /* number of pages to populate */
++      domid_t dom;      /* target domain */
++      __u64 addr;       /* virtual address */
++      const xen_pfn_t __user *arr; /* array of mfns */
++      int __user *err;  /* array of error codes */
++} privcmd_mmapbatch_v2_t;
++
++/*
++ * @cmd: IOCTL_PRIVCMD_HYPERCALL
++ * @arg: &privcmd_hypercall_t
++ * Return: Value returned from execution of the specified hypercall.
++ */
++#define IOCTL_PRIVCMD_HYPERCALL                                       \
++      _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
++#define IOCTL_PRIVCMD_MMAP                                    \
++      _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
++#define IOCTL_PRIVCMD_MMAPBATCH                                       \
++      _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
++#define IOCTL_PRIVCMD_MMAPBATCH_V2                            \
++      _IOC(_IOC_NONE, 'P', 4, sizeof(privcmd_mmapbatch_v2_t))
++
++#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
diff --cc include/xen/public/xenbus.h

index 0000000,0000000..fd61373

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/public/xenbus.h
@@@ -1,0 -1,0 +1,52 @@@
++/******************************************************************************
++ * xenbus.h
++ * 
++ * Interface to /proc/xen/xenbus.
++ * 
++ * Copyright (c) 2008, Diego Ongaro <diego.ongaro@citrix.com>
++ * 
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __LINUX_PUBLIC_XENBUS_H__
++#define __LINUX_PUBLIC_XENBUS_H__
++
++#include <linux/types.h>
++
++typedef struct xenbus_alloc {
++      domid_t dom;
++      __u32 port;
++      __u32 grant_ref;
++} xenbus_alloc_t;
++
++/*
++ * @cmd: IOCTL_XENBUS_ALLOC
++ * @arg: &xenbus_alloc_t
++ * Return: 0, or -1 for error
++ */
++#define IOCTL_XENBUS_ALLOC                                    \
++      _IOC(_IOC_NONE, 'X', 0, sizeof(xenbus_alloc_t))
++
++#endif /* __LINUX_PUBLIC_XENBUS_H__ */
diff --cc include/xen/sysctl.h

index 0000000,0000000..7fe9250

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/sysctl.h
@@@ -1,0 -1,0 +1,11 @@@
++#ifndef _XEN_SYSCTL_H
++#define _XEN_SYSCTL_H
++
++/* CTL_XEN names: */
++enum
++{
++      CTL_XEN_INDEPENDENT_WALLCLOCK=1,
++      CTL_XEN_PERMITTED_CLOCK_JITTER=2,
++};
++
++#endif /* _XEN_SYSCTL_H */
diff --cc include/xen/xen.h

index a164024,a164024..edb2f5a
--- 1/include/xen/xen.h
--- 2/include/xen/xen.h
+++ b/include/xen/xen.h
@@@ -7,8 -7,8 +7,10 @@@ enum xen_domain_type 
         XEN_HVM_DOMAIN,         /* running in a Xen hvm domain */
   };
   
--#ifdef CONFIG_XEN
++#if defined(CONFIG_PARAVIRT_XEN)
   extern enum xen_domain_type xen_domain_type;
++#elif defined(CONFIG_XEN)
++#define xen_domain_type               XEN_PV_DOMAIN
   #else
   #define xen_domain_type               XEN_NATIVE
   #endif
@@@ -25,6 -25,6 +27,8 @@@
   
   #define xen_initial_domain()  (xen_pv_domain() && \
                                  xen_start_info->flags & SIF_INITDOMAIN)
++#elif defined(CONFIG_XEN)
++#define xen_initial_domain()  is_initial_xendomain()
   #else  /* !CONFIG_XEN_DOM0 */
   #define xen_initial_domain()  (0)
   #endif        /* CONFIG_XEN_DOM0 */
diff --cc include/xen/xen_proc.h

index 0000000,0000000..44af17c

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/xen_proc.h
@@@ -1,0 -1,0 +1,12 @@@
++
++#ifndef __ASM_XEN_PROC_H__
++#define __ASM_XEN_PROC_H__
++
++#include <linux/proc_fs.h>
++
++extern struct proc_dir_entry *create_xen_proc_entry(
++      const char *name, mode_t mode);
++extern void remove_xen_proc_entry(
++      const char *name);
++
++#endif /* __ASM_XEN_PROC_H__ */
diff --cc include/xen/xenbus.h

index 5467369,5467369..8bd4f2b
--- 1/include/xen/xenbus.h
--- 2/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@@ -40,6 -40,6 +40,7 @@@
   #include <linux/completion.h>
   #include <linux/init.h>
   #include <linux/slab.h>
++#include <linux/err.h>
   #include <xen/interface/xen.h>
   #include <xen/interface/grant_table.h>
   #include <xen/interface/io/xenbus.h>
@@@ -56,8 -56,8 +57,21 @@@ struct xenbus_watc
         /* Callback (executed in a process context with no locks held). */
         void (*callback)(struct xenbus_watch *,
                          const char **vec, unsigned int len);
++
++#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++      /* See XBWF_ definitions below. */
++      unsigned long flags;
++#endif
   };
   
++#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++/*
++ * Execute callback in its own kthread. Useful if the callback is long
++ * running or heavily serialised, to avoid taking out the main xenwatch thread
++ * for a long period of time (or even unwittingly causing a deadlock).
++ */
++#define XBWF_new_thread       1
++#endif
   
   /* A xenbus device. */
   struct xenbus_device {
@@@ -84,8 -84,8 +98,7 @@@ struct xenbus_device_i
   
   /* A xenbus driver. */
   struct xenbus_driver {
--      char *name;
--      struct module *owner;
++      const char *name;
         const struct xenbus_device_id *ids;
         int (*probe)(struct xenbus_device *dev,
                      const struct xenbus_device_id *id);
@@@ -93,6 -93,6 +106,9 @@@
                                  enum xenbus_state backend_state);
         int (*remove)(struct xenbus_device *dev);
         int (*suspend)(struct xenbus_device *dev);
++#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++      int (*suspend_cancel)(struct xenbus_device *dev);
++#endif
         int (*resume)(struct xenbus_device *dev);
         int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *);
         struct device_driver driver;
@@@ -112,7 -112,7 +128,6 @@@ int __must_check __xenbus_register_fron
   static inline int __must_check
   xenbus_register_frontend(struct xenbus_driver *drv)
   {
--      WARN_ON(drv->owner != THIS_MODULE);
         return __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME);
   }
   
@@@ -122,7 -122,7 +137,6 @@@ int __must_check __xenbus_register_back
   static inline int __must_check
   xenbus_register_backend(struct xenbus_driver *drv)
   {
--      WARN_ON(drv->owner != THIS_MODULE);
         return __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME);
   }
   
@@@ -165,7 -165,7 +179,6 @@@ int xenbus_printf(struct xenbus_transac
   int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
   
   /* notifer routines for when the xenstore comes up */
--extern int xenstored_ready;
   int register_xenstore_notifier(struct notifier_block *nb);
   void unregister_xenstore_notifier(struct notifier_block *nb);
   
@@@ -179,11 -179,11 +192,11 @@@ void xs_suspend_cancel(void)
   void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
   
   struct work_struct;
++void xenbus_probe(struct work_struct *);
   
   /* Prepare for domain suspend: then resume or cancel the suspend. */
   void xenbus_suspend(void);
   void xenbus_resume(void);
--void xenbus_probe(struct work_struct *);
   void xenbus_suspend_cancel(void);
   
   #define XENBUS_IS_ERR_READ(str) ({                    \
@@@ -196,38 -196,38 +209,127 @@@
   
   #define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
   
++
++/**
++ * Register a watch on the given path, using the given xenbus_watch structure
++ * for storage, and the given callback function as the callback.  Return 0 on
++ * success, or -errno on error.  On success, the given path will be saved as
++ * watch->node, and remains the caller's to free.  On error, watch->node will
++ * be NULL, the device will switch to XenbusStateClosing, and the error will
++ * be saved in the store.
++ */
   int xenbus_watch_path(struct xenbus_device *dev, const char *path,
                       struct xenbus_watch *watch,
                       void (*callback)(struct xenbus_watch *,
                                        const char **, unsigned int));
++
++
++#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++/**
++ * Register a watch on the given path/path2, using the given xenbus_watch
++ * structure for storage, and the given callback function as the callback.
++ * Return 0 on success, or -errno on error.  On success, the watched path
++ * (path/path2) will be saved as watch->node, and becomes the caller's to
++ * kfree().  On error, watch->node will be NULL, so the caller has nothing to
++ * free, the device will switch to XenbusStateClosing, and the error will be
++ * saved in the store.
++ */
++int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
++                     const char *path2, struct xenbus_watch *watch,
++                     void (*callback)(struct xenbus_watch *,
++                                      const char **, unsigned int));
++#else
   int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
                          void (*callback)(struct xenbus_watch *,
                                           const char **, unsigned int),
                          const char *pathfmt, ...)
         __attribute__ ((format (printf, 4, 5)));
++#endif
   
++/**
++ * Advertise in the store a change of the given driver to the given new_state.
++ * Return 0 on success, or -errno on error.  On error, the device will switch
++ * to XenbusStateClosing, and the error will be saved in the store.
++ */
   int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
++
++/**
++ * Grant access to the given ring_mfn to the peer of the given device.  Return
++ * 0 on success, or -errno on error.  On error, the device will switch to
++ * XenbusStateClosing, and the error will be saved in the store.
++ */
   int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
--int xenbus_map_ring_valloc(struct xenbus_device *dev,
--                         int gnt_ref, void **vaddr);
--int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
++
++/**
++ * Map a page of memory into this domain from another domain's grant table.
++ * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
++ * page to that address, and sets *vaddr to that address.
++ * Returns 0 on success, and GNTST_* (see include/xen/interface/grant_table.h)
++ * or -ENOMEM on error. If an error is returned, device will switch to
++ * XenbusStateClosing and the error message will be saved in XenStore.
++ */
++struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev,
++                                       grant_ref_t ref);
++int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t gnt_ref,
                            grant_handle_t *handle, void *vaddr);
   
--int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
++/**
++ * Unmap a page of memory in this domain that was imported from another domain
++ * and free the virtual address space.
++ * Returns 0 on success and returns GNTST_* on error
++ * (see include/xen/interface/grant_table.h).
++ */
++int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *);
   int xenbus_unmap_ring(struct xenbus_device *dev,
                       grant_handle_t handle, void *vaddr);
   
++/**
++ * Allocate an event channel for the given xenbus_device, assigning the newly
++ * created local port to *port.  Return 0 on success, or -errno on error.  On
++ * error, the device will switch to XenbusStateClosing, and the error will be
++ * saved in the store.
++ */
   int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
--int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
++
++
++/**
++ * Free an existing event channel. Returns 0 on success or -errno on error.
++ */
   int xenbus_free_evtchn(struct xenbus_device *dev, int port);
   
++
++/**
++ * Return the state of the driver rooted at the given store path, or
++ * XenbusStateUnknown if no state can be read.
++ */
   enum xenbus_state xenbus_read_driver_state(const char *path);
   
--void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...);
--void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...);
++
++/***
++ * Report the given negative errno into the store, along with the given
++ * formatted message.
++ */
++void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
++                    ...);
++
++
++/***
++ * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
++ * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
++ * closedown of this driver and its peer.
++ */
++void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
++                    ...);
++
++#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
++int xenbus_dev_init(void);
++#endif
   
   const char *xenbus_strstate(enum xenbus_state state);
   int xenbus_dev_is_online(struct xenbus_device *dev);
   int xenbus_frontend_closed(struct xenbus_device *dev);
   
++int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *));
++int xenbus_for_each_frontend(void *arg, int (*fn)(struct device *, void *));
++
   #endif /* _XEN_XENBUS_H */
diff --cc include/xen/xencons.h

index 0000000,0000000..c5a55d6

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/xencons.h
@@@ -1,0 -1,0 +1,17 @@@
++#ifndef __ASM_XENCONS_H__
++#define __ASM_XENCONS_H__
++
++struct dom0_vga_console_info;
++void dom0_init_screen_info(const struct dom0_vga_console_info *, size_t);
++
++void xencons_force_flush(void);
++void xencons_resume(void);
++
++/* Interrupt work hooks. Receive data, or kick data out. */
++void xencons_rx(char *buf, unsigned len);
++void xencons_tx(void);
++
++int xencons_ring_init(void);
++int xencons_ring_send(const char *data, unsigned len);
++
++#endif /* __ASM_XENCONS_H__ */
diff --cc include/xen/xenoprof.h

index 0000000,0000000..4c3ab0f

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/include/xen/xenoprof.h
@@@ -1,0 -1,0 +1,42 @@@
++/******************************************************************************
++ * xen/xenoprof.h
++ *
++ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
++ *                    VA Linux Systems Japan K.K.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++ *
++ */
++
++#ifndef __XEN_XENOPROF_H__
++#define __XEN_XENOPROF_H__
++#ifdef CONFIG_XEN
++
++#include <asm/xenoprof.h>
++
++struct oprofile_operations;
++int xenoprofile_init(struct oprofile_operations * ops);
++void xenoprofile_exit(void);
++
++struct xenoprof_shared_buffer {
++      char                                    *buffer;
++      struct xenoprof_arch_shared_buffer      arch;
++};
++#else
++#define xenoprofile_init(ops) (-ENOSYS)
++#define xenoprofile_exit()    do { } while (0)
++
++#endif /* CONFIG_XEN */
++#endif /* __XEN_XENOPROF_H__ */
diff --cc init/Kconfig
Simple merge
diff --cc init/main.c

index 494a2f7,4a9479e..cc00143
--- 1/init/main.c
--- 2/init/main.c
+++ b/init/main.c
@@@ -489,9 -487,7 +489,8 @@@ asmlinkage void __init start_kernel(voi
         printk(KERN_NOTICE "%s", linux_banner);
         setup_arch(&command_line);
         mm_init_owner(&init_mm, &init_task);
-       mm_init_cpumask(&init_mm);
         setup_command_line(command_line);
+ +      unwind_setup();
         setup_nr_cpu_ids();
         setup_per_cpu_areas();
         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
diff --cc kernel/Kconfig.preempt

index dfce8f5,bf987b9..154787d
--- 1/kernel/Kconfig.preempt
--- 2/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@@ -36,6 -35,6 +36,7 @@@ config PREEMPT_VOLUNTAR
   
   config PREEMPT
         bool "Preemptible Kernel (Low-Latency Desktop)"
++      depends on !XEN
         help
           This option reduces the latency of the kernel by making
           all kernel code (that is not executing in a critical section)
diff --cc kernel/Makefile
Simple merge
diff --cc kernel/capability.c

index c2c7ff3,32a80e0..25674a3
--- 1/kernel/capability.c
--- 2/kernel/capability.c
+++ b/kernel/capability.c
@@@ -22,10 -22,14 +22,14 @@@
    */
   
   const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
+ const kernel_cap_t __cap_full_set = CAP_FULL_SET;
+ const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
   
   EXPORT_SYMBOL(__cap_empty_set);
+ EXPORT_SYMBOL(__cap_full_set);
+ EXPORT_SYMBOL(__cap_init_eff_set);
   
- -int file_caps_enabled = 1;
+ +int file_caps_enabled;
   
   static int __init file_caps_disable(char *str)
   {
diff --cc kernel/irq/spurious.c

index dfbd550,dfbd550..edb0437
--- 1/kernel/irq/spurious.c
--- 2/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@@ -271,7 -271,7 +271,7 @@@ void note_interrupt(unsigned int irq, s
                  */
                 if (time_after(jiffies, desc->last_unhandled + HZ/10))
                         desc->irqs_unhandled = 1;
--              else
++              else if (!irq_ignore_unhandled(irq))
                         desc->irqs_unhandled++;
                 desc->last_unhandled = jiffies;
                 if (unlikely(action_ret != IRQ_NONE))
diff --cc kernel/kexec.c

index 8d814cb,87b77de..39394f8
--- 1/kernel/kexec.c
--- 2/kernel/kexec.c
+++ b/kernel/kexec.c
@@@ -41,12 -41,12 +41,18 @@@
   #include <asm/system.h>
   #include <asm/sections.h>
   
++#ifndef CONFIG_XEN
   /* Per cpu memory for storing cpu states in case of system crash. */
   note_buf_t __percpu *crash_notes;
++#endif
   
   /* vmcoreinfo stuff */
   static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
--u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
++u32
++#if defined(CONFIG_XEN) && defined(CONFIG_X86)
++__page_aligned_bss
++#endif
++vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
   size_t vmcoreinfo_size;
   size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
   
@@@ -357,13 -357,13 +363,26 @@@ static int kimage_is_destination_range(
         return 0;
   }
   
--static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
++static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order, unsigned long limit)
   {
         struct page *pages;
   
         pages = alloc_pages(gfp_mask, order);
         if (pages) {
                 unsigned int count, i;
++#ifdef CONFIG_XEN
++              int address_bits;
++
++              if (limit == ~0UL)
++                      address_bits = BITS_PER_LONG;
++              else
++                      address_bits = ilog2(limit);
++
++              if (xen_limit_pages_to_max_mfn(pages, order, address_bits) < 0) {
++                      __free_pages(pages, order);
++                      return NULL;
++              }
++#endif
                 pages->mapping = NULL;
                 set_page_private(pages, order);
                 count = 1 << order;
@@@ -427,10 -427,10 +446,10 @@@ static struct page *kimage_alloc_normal
         do {
                 unsigned long pfn, epfn, addr, eaddr;
   
--              pages = kimage_alloc_pages(GFP_KERNEL, order);
++              pages = kimage_alloc_pages(GFP_KERNEL, order, KEXEC_CONTROL_MEMORY_LIMIT);
                 if (!pages)
                         break;
--              pfn   = page_to_pfn(pages);
++              pfn   = kexec_page_to_pfn(pages);
                 epfn  = pfn + count;
                 addr  = pfn << PAGE_SHIFT;
                 eaddr = epfn << PAGE_SHIFT;
@@@ -464,6 -464,6 +483,7 @@@
         return pages;
   }
   
++#ifndef CONFIG_XEN
   static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
                                                       unsigned int order)
   {
@@@ -517,7 -517,7 +537,7 @@@
                 }
                 /* If I don't overlap any segments I have found my hole! */
                 if (i == image->nr_segments) {
--                      pages = pfn_to_page(hole_start >> PAGE_SHIFT);
++                      pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
                         break;
                 }
         }
@@@ -544,6 -544,6 +564,13 @@@ struct page *kimage_alloc_control_pages
   
         return pages;
   }
++#else /* !CONFIG_XEN */
++struct page *kimage_alloc_control_pages(struct kimage *image,
++                                       unsigned int order)
++{
++      return kimage_alloc_normal_control_pages(image, order);
++}
++#endif
   
   static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
   {
@@@ -559,7 -559,7 +586,7 @@@
                         return -ENOMEM;
   
                 ind_page = page_address(page);
--              *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
++              *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
                 image->entry = ind_page;
                 image->last_entry = ind_page +
                                       ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@@ -618,13 -618,13 +645,13 @@@ static void kimage_terminate(struct kim
   #define for_each_kimage_entry(image, ptr, entry) \
         for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
                 ptr = (entry & IND_INDIRECTION)? \
--                      phys_to_virt((entry & PAGE_MASK)): ptr +1)
++                      kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
   
   static void kimage_free_entry(kimage_entry_t entry)
   {
         struct page *page;
   
--      page = pfn_to_page(entry >> PAGE_SHIFT);
++      page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
         kimage_free_pages(page);
   }
   
@@@ -636,6 -636,6 +663,10 @@@ static void kimage_free(struct kimage *
         if (!image)
                 return;
   
++#ifdef CONFIG_XEN
++      xen_machine_kexec_unload(image);
++#endif
++
         kimage_free_extra_pages(image);
         for_each_kimage_entry(image, ptr, entry) {
                 if (entry & IND_INDIRECTION) {
@@@ -711,7 -711,7 +742,7 @@@ static struct page *kimage_alloc_page(s
          * have a match.
          */
         list_for_each_entry(page, &image->dest_pages, lru) {
--              addr = page_to_pfn(page) << PAGE_SHIFT;
++              addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
                 if (addr == destination) {
                         list_del(&page->lru);
                         return page;
@@@ -722,16 -722,16 +753,16 @@@
                 kimage_entry_t *old;
   
                 /* Allocate a page, if we run out of memory give up */
--              page = kimage_alloc_pages(gfp_mask, 0);
++              page = kimage_alloc_pages(gfp_mask, 0, KEXEC_SOURCE_MEMORY_LIMIT);
                 if (!page)
                         return NULL;
                 /* If the page cannot be used file it away */
--              if (page_to_pfn(page) >
++              if (kexec_page_to_pfn(page) >
                                 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
                         list_add(&page->lru, &image->unuseable_pages);
                         continue;
                 }
--              addr = page_to_pfn(page) << PAGE_SHIFT;
++              addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
   
                 /* If it is the destination page we want use it */
                 if (addr == destination)
@@@ -754,7 -754,7 +785,7 @@@
                         struct page *old_page;
   
                         old_addr = *old & PAGE_MASK;
--                      old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
++                      old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
                         copy_highpage(page, old_page);
                         *old = addr | (*old & ~PAGE_MASK);
   
@@@ -810,7 -810,7 +841,7 @@@ static int kimage_load_normal_segment(s
                         result  = -ENOMEM;
                         goto out;
                 }
--              result = kimage_add_page(image, page_to_pfn(page)
++              result = kimage_add_page(image, kexec_page_to_pfn(page)
                                                                 << PAGE_SHIFT);
                 if (result < 0)
                         goto out;
@@@ -842,6 -842,6 +873,7 @@@ out
         return result;
   }
   
++#ifndef CONFIG_XEN
   static int kimage_load_crash_segment(struct kimage *image,
                                         struct kexec_segment *segment)
   {
@@@ -864,7 -864,7 +896,7 @@@
                 char *ptr;
                 size_t uchunk, mchunk;
   
--              page = pfn_to_page(maddr >> PAGE_SHIFT);
++              page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
                 if (!page) {
                         result  = -ENOMEM;
                         goto out;
@@@ -913,6 -913,6 +945,13 @@@ static int kimage_load_segment(struct k
   
         return result;
   }
++#else /* CONFIG_XEN */
++static int kimage_load_segment(struct kimage *image,
++                              struct kexec_segment *segment)
++{
++      return kimage_load_normal_segment(image, segment);
++}
++#endif
   
   /*
    * Exec Kernel system call: for obvious reasons only root may call it.
@@@ -1016,6 -1016,6 +1055,13 @@@ SYSCALL_DEFINE4(kexec_load, unsigned lo
                 }
                 kimage_terminate(image);
         }
++#ifdef CONFIG_XEN
++      if (image) {
++              result = xen_machine_kexec_load(image);
++              if (result)
++                      goto out;
++      }
++#endif
         /* Install the new kernel, and  Uninstall the old */
         image = xchg(dest_image, image);
   
@@@ -1176,6 -1176,6 +1222,7 @@@ static void final_note(u32 *buf
         memcpy(buf, &note, sizeof(note));
   }
   
++#ifndef CONFIG_XEN
   void crash_save_cpu(struct pt_regs *regs, int cpu)
   {
         struct elf_prstatus prstatus;
@@@ -1201,9 -1201,9 +1248,11 @@@
                               &prstatus, sizeof(prstatus));
         final_note(buf);
   }
++#endif
   
   static int __init crash_notes_memory_init(void)
   {
++#ifndef CONFIG_XEN
         /* Allocate memory for saving cpu registers. */
         crash_notes = alloc_percpu(note_buf_t);
         if (!crash_notes) {
@@@ -1211,11 -1211,11 +1260,13 @@@
                 " states failed\n");
                 return -ENOMEM;
         }
++#endif
         return 0;
   }
   module_init(crash_notes_memory_init)
   
   
++#ifndef CONFIG_XEN
   /*
    * parsing the "crashkernel" commandline
    *
@@@ -1378,7 -1378,7 +1429,7 @@@ int __init parse_crashkernel(char                *c
   
         return 0;
   }
--
++#endif
   
   
   void crash_save_vmcoreinfo(void)
@@@ -1435,7 -1435,7 +1486,18 @@@ static int __init crash_save_vmcoreinfo
   
         VMCOREINFO_SYMBOL(init_uts_ns);
         VMCOREINFO_SYMBOL(node_online_map);
++#ifndef CONFIG_X86_XEN
+       VMCOREINFO_SYMBOL(swapper_pg_dir);
++#else
++/*
++ * Since for x86-32 Xen swapper_pg_dir is a pointer rather than an array,
++ * make the value stored consistent with native (i.e. the base address of
++ * the page directory).
++ */
++# define swapper_pg_dir *swapper_pg_dir
+ +      VMCOREINFO_SYMBOL(swapper_pg_dir);
++# undef swapper_pg_dir
++#endif
         VMCOREINFO_SYMBOL(_stext);
         VMCOREINFO_SYMBOL(vmlist);
   
diff --cc kernel/kmod.c
Simple merge
diff --cc kernel/ksysfs.c

index 292a4c6,0b624e7..9e672fa
--- 1/kernel/ksysfs.c
--- 2/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@@ -166,32 -157,7 +157,31 @@@ static struct bin_attribute notes_attr 
   struct kobject *kernel_kobj;
   EXPORT_SYMBOL_GPL(kernel_kobj);
   
+ +#ifdef CONFIG_ENTERPRISE_SUPPORT
+ +const char *supported_printable(int taint)
+ +{
+ +      int mask = TAINT_PROPRIETARY_MODULE|TAINT_NO_SUPPORT;
+ +      if ((taint & mask) == mask)
+ +              return "No, Proprietary and Unsupported modules are loaded";
+ +      else if (taint & TAINT_PROPRIETARY_MODULE)
+ +              return "No, Proprietary modules are loaded";
+ +      else if (taint & TAINT_NO_SUPPORT)
+ +              return "No, Unsupported modules are loaded";
+ +      else if (taint & TAINT_EXTERNAL_SUPPORT)
+ +              return "Yes, External";
+ +      else
+ +              return "Yes";
+ +}
+ +
+ +static ssize_t supported_show(struct kobject *kobj,
+ +                            struct kobj_attribute *attr, char *buf)
+ +{
+ +      return sprintf(buf, "%s\n", supported_printable(get_taint()));
+ +}
+ +KERNEL_ATTR_RO(supported);
+ +#endif
+ +
   static struct attribute * kernel_attrs[] = {
-       &fscaps_attr.attr,
   #if defined(CONFIG_HOTPLUG)
         &uevent_seqnum_attr.attr,
         &uevent_helper_attr.attr,
diff --cc kernel/module.c
Simple merge
diff --cc kernel/power/Kconfig

index 87f4d24,6de9a8f..255c0c6
--- 1/kernel/power/Kconfig
--- 2/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@@ -135,7 -141,7 +141,7 @@@ config PM_ADVANCED_DEBU
   
   config PM_TEST_SUSPEND
         bool "Test suspend/resume and wakealarm during bootup"
--      depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
++      depends on SUSPEND && PM_DEBUG && RTC_CLASS=y && !XEN_UNPRIVILEGED_GUEST
         ---help---
         This option will let you suspend your machine during bootup, and
         make it wake up a few seconds later using an RTC wakeup alarm.
@@@ -166,7 -172,7 +172,7 @@@ config PM_TRAC
   config PM_TRACE_RTC
         bool "Suspend/resume event tracing"
         depends on CAN_PM_TRACE
--      depends on X86
++      depends on X86 && !XEN_UNPRIVILEGED_GUEST
         select PM_TRACE
         ---help---
         This enables some cheesy code to save the last PM event point in the
diff --cc kernel/printk.c
Simple merge
diff --cc kernel/sched.c

index cbb3a0e,312f8b9..20eafe7
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -4288,30 -4161,6 +4161,12 @@@ need_resched
   EXPORT_SYMBOL(schedule);
   
   #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
++#include <asm/mutex.h>
+ +
- static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
- {
-       bool ret = false;
- 
-       rcu_read_lock();
-       if (lock->owner != owner)
-               goto fail;
- 
-       /*
-        * Ensure we emit the owner->on_cpu, dereference _after_ checking
-        * lock->owner still matches owner, if that fails, owner might
-        * point to free()d memory, if it still matches, the rcu_read_lock()
-        * ensures the memory stays valid.
-        */
-       barrier();
- 
-       ret = owner->on_cpu;
- fail:
-       rcu_read_unlock();
- 
-       return ret;
- }
++#ifndef arch_cpu_is_running
++#define arch_cpu_is_running(cpu) true
++#endif
+ +
   /*
    * Look out! "owner" is an entirely speculative pointer
    * access and not reliable.
@@@ -4321,20 -4173,58 +4179,59 @@@ int mutex_spin_on_owner(struct mutex *l
         if (!sched_feat(OWNER_SPIN))
                 return 0;
   
-       while (owner_running(lock, owner)) {
-               if (need_resched())
-                       return 0;
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+       /*
+        * Need to access the cpu field knowing that
+        * DEBUG_PAGEALLOC could have unmapped it if
+        * the mutex owner just released it and exited.
+        */
+       if (probe_kernel_address(&owner->cpu, cpu))
+               return 0;
+ #else
+       cpu = owner->cpu;
+ #endif
   
-               arch_mutex_cpu_relax();
-       }
+       /*
+        * Even if the access succeeded (likely case),
+        * the cpu field may no longer be valid.
+        */
+       if (cpu >= nr_cpumask_bits)
+               return 0;
   
         /*
-        * If the owner changed to another task there is likely
-        * heavy contention, stop spinning.
+        * We need to validate that we can do a
+        * get_cpu() and that we have the percpu area.
          */
-       if (lock->owner)
+       if (!cpu_online(cpu))
                 return 0;
   
+       rq = cpu_rq(cpu);
+ 
+       for (;;) {
+               /*
+                * Owner changed, break to re-assess state.
+                */
+               if (lock->owner != owner) {
+                       /*
+                        * If the lock has switched to a different owner,
+                        * we likely have heavy contention. Return 0 to quit
+                        * optimistic spinning and not contend further:
+                        */
+                       if (lock->owner)
+                               return 0;
+                       break;
+               }
+ 
+               /*
+                * Is that owner really running on that cpu?
+                */
- -              if (task_thread_info(rq->curr) != owner || need_resched())
++              if (task_thread_info(rq->curr) != owner || need_resched()
++                  || !arch_cpu_is_running(cpu))
+                       return 0;
+ 
+               arch_mutex_cpu_relax();
+       }
+ 
         return 1;
   }
   #endif
diff --cc kernel/sysctl.c

index a8a366d,c0bb324..80772c1
--- 1/kernel/sysctl.c
--- 2/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@@ -855,7 -829,7 +845,7 @@@ static struct ctl_table kern_table[] = 
                 .proc_handler   = proc_dointvec,
         },
   #endif
--#if   defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
++#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) && !defined(CONFIG_ACPI_PV_SLEEP)
         {
                 .procname       = "acpi_video_flags",
                 .data           = &acpi_realmode_flags,
@@@ -1342,6 -1316,6 +1332,17 @@@ static struct ctl_table vm_table[] = 
                 .mode           = 0644,
                 .proc_handler   = scan_unevictable_handler,
         },
++#ifdef CONFIG_PRESWAP
++      {
++              .procname       = "preswap",
++              .data           = NULL,
++              .maxlen         = sizeof(unsigned long),
++              .mode           = 0644,
++              .proc_handler   = preswap_sysctl_handler,
++              .extra1         = (void *)&preswap_zero,
++              .extra2         = (void *)&preswap_infinity,
++      },
++#endif
   #ifdef CONFIG_MEMORY_FAILURE
         {
                 .procname       = "memory_failure_early_kill",
diff --cc kernel/sysctl_binary.c

index df4f94d,3b8e028..2078f4d
--- 1/kernel/sysctl_binary.c
--- 2/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@@ -873,6 -872,6 +873,15 @@@ static const struct bin_table bin_bus_t
   };
   
   
++#ifdef CONFIG_XEN
++#include <xen/sysctl.h>
++static const struct bin_table bin_xen_table[] = {
++      { CTL_INT,      CTL_XEN_INDEPENDENT_WALLCLOCK,  "independent_wallclock" },
++      { CTL_ULONG,    CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" },
++      {}
++};
++#endif
++
   static const struct bin_table bin_s390dbf_table[] = {
         { CTL_INT,      5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
         { CTL_INT,      5679 /* CTL_S390DBF_ACTIVE */,    "debug_active" },
@@@ -912,6 -911,6 +921,9 @@@ static const struct bin_table bin_root_
         { CTL_DIR,      CTL_BUS,        "bus",          bin_bus_table },
         { CTL_DIR,      CTL_ABI,        "abi" },
         /* CTL_CPU not used */
++#ifdef CONFIG_XEN
++      { CTL_DIR,      CTL_XEN,        "xen",          bin_xen_table },
++#endif
         /* CTL_ARLAN "arlan" no longer used */
         { CTL_DIR,      CTL_S390DBF,    "s390dbf",      bin_s390dbf_table },
         { CTL_DIR,      CTL_SUNRPC,     "sunrpc",       bin_sunrpc_table },
diff --cc lib/Kconfig.debug
Simple merge
diff --cc lib/swiotlb-xen.c

index 0000000,0000000..2edbb5d

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/lib/swiotlb-xen.c
@@@ -1,0 -1,0 +1,803 @@@
++/*
++ * Dynamic DMA mapping support.
++ *
++ * This implementation is a fallback for platforms that do not support
++ * I/O TLBs (aka DMA address translation hardware).
++ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
++ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
++ * Copyright (C) 2000, 2003 Hewlett-Packard Co
++ *    David Mosberger-Tang <davidm@hpl.hp.com>
++ * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
++ * 08/12/11 beckyb    Add highmem support
++ */
++
++#include <linux/cache.h>
++#include <linux/mm.h>
++#include <linux/module.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include <linux/string.h>
++#include <linux/swiotlb.h>
++#include <linux/pfn.h>
++#include <linux/types.h>
++#include <linux/ctype.h>
++#include <linux/init.h>
++#include <linux/bootmem.h>
++#include <linux/iommu-helper.h>
++#include <linux/highmem.h>
++#include <linux/gfp.h>
++
++#include <asm/io.h>
++#include <asm/pci.h>
++#include <asm/dma.h>
++#include <asm/uaccess.h>
++#include <xen/gnttab.h>
++#include <xen/interface/memory.h>
++#include <asm/gnttab_dma.h>
++
++#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
++
++int swiotlb;
++int swiotlb_force;
++
++/*
++ * Used to do a quick range check in swiotlb_tbl_unmap_single and
++ * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
++ * API.
++ */
++static char *io_tlb_start, *io_tlb_end;
++
++/*
++ * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
++ * io_tlb_end.  This is command line adjustable via setup_io_tlb_npages.
++ */
++static unsigned long io_tlb_nslabs;
++
++/*
++ * When the IOMMU overflows we return a fallback buffer. This sets the size.
++ */
++static unsigned long io_tlb_overflow = 32*1024;
++
++static void *io_tlb_overflow_buffer;
++
++/*
++ * This is a free list describing the number of free entries available from
++ * each index
++ */
++static unsigned int *io_tlb_list;
++static unsigned int io_tlb_index;
++
++/*
++ * We need to save away the original address corresponding to a mapped entry
++ * for the sync operations.
++ */
++static phys_addr_t *io_tlb_orig_addr;
++
++/*
++ * Protect the above data structures in the map and unmap calls
++ */
++static DEFINE_SPINLOCK(io_tlb_lock);
++
++static unsigned int dma_bits;
++static unsigned int __initdata max_dma_bits = 32;
++static int __init
++setup_dma_bits(char *str)
++{
++      max_dma_bits = simple_strtoul(str, NULL, 0);
++      return 0;
++}
++__setup("dma_bits=", setup_dma_bits);
++
++static int __init
++setup_io_tlb_npages(char *str)
++{
++      /* Unlike ia64, the size is the aperture in megabytes, not 'slabs'! */
++      if (isdigit(*str)) {
++              io_tlb_nslabs = simple_strtoul(str, &str, 0) <<
++                      (20 - IO_TLB_SHIFT);
++              io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
++      }
++      if (*str == ',')
++              ++str;
++      /*
++         * NB. 'force' enables the swiotlb, but doesn't force its use for
++         * every DMA like it does on native Linux. 'off' forcibly disables
++         * use of the swiotlb.
++         */
++      if (!strcmp(str, "force"))
++              swiotlb_force = 1;
++      else if (!strcmp(str, "off"))
++              swiotlb_force = -1;
++
++      return 1;
++}
++__setup("swiotlb=", setup_io_tlb_npages);
++/* make io_tlb_overflow tunable too? */
++
++/* Note that this doesn't work with highmem pages */
++static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
++                                    volatile void *address)
++{
++      return phys_to_dma(hwdev, virt_to_phys(address));
++}
++
++void swiotlb_print_info(void)
++{
++      unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
++
++      printk(KERN_INFO "Software IO TLB enabled: \n"
++             " Aperture:     %lu megabytes\n"
++             " Address size: %u bits\n"
++             " Kernel range: %p - %p\n",
++             bytes >> 20, dma_bits,
++             io_tlb_start, io_tlb_end);
++}
++
++void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
++{
++      unsigned long i, bytes;
++      int rc;
++
++      bytes = nslabs << IO_TLB_SHIFT;
++
++      io_tlb_nslabs = nslabs;
++      io_tlb_start = tlb;
++      dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
++      for (nslabs = 0; nslabs < io_tlb_nslabs; nslabs += IO_TLB_SEGSIZE) {
++              do {
++                      rc = xen_create_contiguous_region(
++                              (unsigned long)io_tlb_start + (nslabs << IO_TLB_SHIFT),
++                              get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
++                              dma_bits);
++              } while (rc && dma_bits++ < max_dma_bits);
++              if (rc) {
++                      if (nslabs == 0)
++                              panic("No suitable physical memory available for SWIOTLB buffer!\n"
++                                    "Use dom0_mem Xen boot parameter to reserve\n"
++                                    "some DMA memory (e.g., dom0_mem=-128M).\n");
++                      io_tlb_nslabs = nslabs;
++                      i = nslabs << IO_TLB_SHIFT;
++                      free_bootmem(__pa(io_tlb_start + i), bytes - i);
++                      bytes = i;
++                      for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) {
++                              unsigned int bits = fls64(virt_to_bus(io_tlb_start + i - 1));
++
++                              if (bits > dma_bits)
++                                      dma_bits = bits;
++                      }
++                      break;
++              }
++      }
++      io_tlb_end = io_tlb_start + bytes;
++
++      /*
++       * Allocate and initialize the free list array.  This array is used
++       * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
++       */
++      io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
++      for (i = 0; i < io_tlb_nslabs; i++)
++              io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
++      io_tlb_index = 0;
++      io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
++
++      /*
++       * Get the overflow emergency buffer
++       */
++      io_tlb_overflow_buffer = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_overflow));
++      if (!io_tlb_overflow_buffer)
++              panic("Cannot allocate SWIOTLB overflow buffer!\n");
++
++      do {
++              rc = xen_create_contiguous_region(
++                      (unsigned long)io_tlb_overflow_buffer,
++                      get_order(io_tlb_overflow),
++                      dma_bits);
++      } while (rc && dma_bits++ < max_dma_bits);
++      if (rc)
++              panic("No suitable physical memory available for SWIOTLB overflow buffer!\n");
++      if (verbose)
++              swiotlb_print_info();
++}
++
++/*
++ * Statically reserve bounce buffer space and initialize bounce buffer data
++ * structures for the software IO TLB used to implement the DMA API.
++ */
++void __init
++swiotlb_init_with_default_size(size_t default_size, int verbose)
++{
++      unsigned long bytes;
++
++      if (!io_tlb_nslabs) {
++              io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
++              io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
++      }
++
++      bytes = io_tlb_nslabs << IO_TLB_SHIFT;
++
++      /*
++       * Get IO TLB memory from the low pages
++       */
++      io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes));
++      if (!io_tlb_start)
++              panic("Cannot allocate SWIOTLB buffer");
++
++      swiotlb_init_with_tbl(io_tlb_start, io_tlb_nslabs, verbose);
++}
++
++void __init
++swiotlb_init(int verbose)
++{
++      unsigned long ram_end;
++      size_t defsz = 64 << 20; /* 64MB default size */
++
++      if (swiotlb_force == 1) {
++              swiotlb = 1;
++      } else if ((swiotlb_force != -1) &&
++                 is_running_on_xen() &&
++                 is_initial_xendomain()) {
++              /* Domain 0 always has a swiotlb. */
++              ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
++              if (ram_end <= 0x1ffff)
++                      defsz = 2 << 20; /* 2MB on <512MB systems. */
++              else if (ram_end <= 0x3ffff)
++                      defsz = 4 << 20; /* 4MB on <1GB systems. */
++              else if (ram_end <= 0x7ffff)
++                      defsz = 8 << 20; /* 8MB on <2GB systems. */
++              swiotlb = 1;
++      }
++
++      if (swiotlb)
++              swiotlb_init_with_default_size(defsz, verbose);
++      else
++              printk(KERN_INFO "Software IO TLB disabled\n");
++}
++
++static inline int range_needs_mapping(phys_addr_t pa, size_t size)
++{
++      return range_straddles_page_boundary(pa, size);
++}
++
++static int is_swiotlb_buffer(dma_addr_t addr)
++{
++      unsigned long pfn = mfn_to_local_pfn(PFN_DOWN(addr));
++      phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
++
++      return paddr >= virt_to_phys(io_tlb_start) &&
++              paddr < virt_to_phys(io_tlb_end);
++}
++
++/*
++ * Bounce: copy the swiotlb buffer back to the original dma location
++ *
++ * We use __copy_to_user_inatomic to transfer to the host buffer because the
++ * buffer may be mapped read-only (e.g., in blkback driver) but lower-level
++ * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
++ * unnecessary copy from the aperture to the host buffer, and a page fault.
++ */
++void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
++                  enum dma_data_direction dir)
++{
++      unsigned long pfn = PFN_DOWN(phys);
++
++      if (PageHighMem(pfn_to_page(pfn))) {
++              /* The buffer does not have a mapping.  Map it in and copy */
++              unsigned int offset = phys & ~PAGE_MASK;
++              char *buffer;
++              unsigned int sz = 0;
++              unsigned long flags;
++
++              while (size) {
++                      sz = min_t(size_t, PAGE_SIZE - offset, size);
++
++                      local_irq_save(flags);
++                      buffer = kmap_atomic(pfn_to_page(pfn),
++                                           KM_BOUNCE_READ);
++                      if (dir == DMA_TO_DEVICE)
++                              memcpy(dma_addr, buffer + offset, sz);
++                      else if (__copy_to_user_inatomic(buffer + offset,
++                                                       dma_addr, sz))
++                              /* inaccessible */;
++                      kunmap_atomic(buffer, KM_BOUNCE_READ);
++                      local_irq_restore(flags);
++
++                      size -= sz;
++                      pfn++;
++                      dma_addr += sz;
++                      offset = 0;
++              }
++      } else {
++              if (dir == DMA_TO_DEVICE)
++                      memcpy(dma_addr, phys_to_virt(phys), size);
++              else if (__copy_to_user_inatomic(phys_to_virt(phys),
++                                               dma_addr, size))
++                      /* inaccessible */;
++      }
++}
++EXPORT_SYMBOL_GPL(swiotlb_bounce);
++
++void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr,
++                           phys_addr_t phys, size_t size,
++                           enum dma_data_direction dir)
++{
++      unsigned long flags;
++      char *dma_addr;
++      unsigned int nslots, stride, index, wrap;
++      int i;
++      unsigned long mask;
++      unsigned long offset_slots;
++      unsigned long max_slots;
++
++      mask = dma_get_seg_boundary(hwdev);
++      offset_slots = -IO_TLB_SEGSIZE;
++
++      /*
++       * Carefully handle integer overflow which can occur when mask == ~0UL.
++       */
++      max_slots = mask + 1
++                  ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
++                  : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
++
++      /*
++       * For mappings greater than a page, we limit the stride (and
++       * hence alignment) to a page size.
++       */
++      nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
++      if (size > PAGE_SIZE)
++              stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
++      else
++              stride = 1;
++
++      BUG_ON(!nslots);
++
++      /*
++       * Find a suitable run of contiguous IO TLB entries that will fit this
++       * request and allocate a buffer from that IO TLB pool.
++       */
++      spin_lock_irqsave(&io_tlb_lock, flags);
++      index = ALIGN(io_tlb_index, stride);
++      if (index >= io_tlb_nslabs)
++              index = 0;
++      wrap = index;
++
++      do {
++              while (iommu_is_span_boundary(index, nslots, offset_slots,
++                                            max_slots)) {
++                      index += stride;
++                      if (index >= io_tlb_nslabs)
++                              index = 0;
++                      if (index == wrap)
++                              goto not_found;
++              }
++
++              /*
++               * If we find a slot that indicates we have 'nslots' number of
++               * contiguous buffers, we allocate the buffers from that slot
++               * and mark the entries as '0' indicating unavailable.
++               */
++              if (io_tlb_list[index] >= nslots) {
++                      int count = 0;
++
++                      for (i = index; i < (int) (index + nslots); i++)
++                              io_tlb_list[i] = 0;
++                      for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
++                              io_tlb_list[i] = ++count;
++                      dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
++
++                      /*
++                       * Update the indices to avoid searching in the next
++                       * round.
++                       */
++                      io_tlb_index = ((index + nslots) < io_tlb_nslabs
++                                      ? (index + nslots) : 0);
++
++                      goto found;
++              }
++              index += stride;
++              if (index >= io_tlb_nslabs)
++                      index = 0;
++      } while (index != wrap);
++
++not_found:
++      spin_unlock_irqrestore(&io_tlb_lock, flags);
++      return NULL;
++found:
++      spin_unlock_irqrestore(&io_tlb_lock, flags);
++
++      /*
++       * Save away the mapping from the original address to the DMA address.
++       * This is needed when we sync the memory.  Then we sync the buffer if
++       * needed.
++       */
++      for (i = 0; i < nslots; i++)
++              io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
++      if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
++              swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
++
++      return dma_addr;
++}
++EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single);
++
++/*
++ * Allocates bounce buffer and returns its kernel virtual address.
++ */
++
++static void *
++map_single(struct device *hwdev, phys_addr_t phys, size_t size,
++         enum dma_data_direction dir)
++{
++      dma_addr_t start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start);
++
++      return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir);
++}
++
++/*
++ * dma_addr is the kernel virtual address of the bounce buffer to unmap.
++ */
++void
++swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
++                      enum dma_data_direction dir)
++{
++      unsigned long flags;
++      int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
++      int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
++      phys_addr_t phys = io_tlb_orig_addr[index];
++
++      /*
++       * First, sync the memory before unmapping the entry
++       */
++      if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
++              swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
++
++      /*
++       * Return the buffer to the free list by setting the corresponding
++       * entries to indicate the number of contiguous entries available.
++       * While returning the entries to the free list, we merge the entries
++       * with slots below and above the pool being returned.
++       */
++      spin_lock_irqsave(&io_tlb_lock, flags);
++      {
++              count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
++                       io_tlb_list[index + nslots] : 0);
++              /*
++               * Step 1: return the slots to the free list, merging the
++               * slots with succeeding slots
++               */
++              for (i = index + nslots - 1; i >= index; i--)
++                      io_tlb_list[i] = ++count;
++              /*
++               * Step 2: merge the returned slots with the preceding slots,
++               * if available (non zero)
++               */
++              for (i = index - 1;
++                   (OFFSET(i, IO_TLB_SEGSIZE) !=
++                    IO_TLB_SEGSIZE -1) && io_tlb_list[i];
++                   i--)
++                      io_tlb_list[i] = ++count;
++      }
++      spin_unlock_irqrestore(&io_tlb_lock, flags);
++}
++EXPORT_SYMBOL_GPL(swiotlb_tbl_unmap_single);
++
++void
++swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size,
++                      enum dma_data_direction dir,
++                      enum dma_sync_target target)
++{
++      int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
++      phys_addr_t phys = io_tlb_orig_addr[index];
++
++      phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
++
++      switch (target) {
++      case SYNC_FOR_CPU:
++              if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
++                      swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
++              else
++                      BUG_ON(dir != DMA_TO_DEVICE);
++              break;
++      case SYNC_FOR_DEVICE:
++              if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
++                      swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
++              else
++                      BUG_ON(dir != DMA_FROM_DEVICE);
++              break;
++      default:
++              BUG();
++      }
++}
++EXPORT_SYMBOL_GPL(swiotlb_tbl_sync_single);
++
++static void
++swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
++           int do_panic)
++{
++      /*
++       * Ran out of IOMMU space for this operation. This is very bad.
++       * Unfortunately the drivers cannot handle this operation properly
++       * unless they check for pci_dma_mapping_error (most don't).
++       * When the mapping is small enough return a static buffer to limit
++       * the damage, or panic when the transfer is too big.
++       */
++      printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %zu bytes at "
++             "device %s\n", size, dev ? dev_name(dev) : "?");
++
++      if (size <= io_tlb_overflow || !do_panic)
++              return;
++
++      if (dir == DMA_BIDIRECTIONAL)
++              panic("DMA: Random memory could be DMA accessed\n");
++      if (dir == DMA_FROM_DEVICE)
++              panic("DMA: Random memory could be DMA written\n");
++      if (dir == DMA_TO_DEVICE)
++              panic("DMA: Random memory could be DMA read\n");
++}
++
++/*
++ * Map a single buffer of the indicated size for DMA in streaming mode.  The
++ * PCI address to use is returned.
++ *
++ * Once the device is given the dma address, the device owns this memory until
++ * either swiotlb_unmap_page or swiotlb_sync_single_for_cpu is performed.
++ */
++dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
++                          unsigned long offset, size_t size,
++                          enum dma_data_direction dir,
++                          struct dma_attrs *attrs)
++{
++      phys_addr_t phys = page_to_pseudophys(page) + offset;
++      dma_addr_t dev_addr = gnttab_dma_map_page(page) + offset;
++      void *map;
++
++      BUG_ON(dir == DMA_NONE);
++
++      /*
++       * If the address happens to be in the device's DMA window,
++       * we can safely return the device addr and not worry about bounce
++       * buffering it.
++       */
++      if (dma_capable(dev, dev_addr, size) &&
++          !range_needs_mapping(phys, size))
++              return dev_addr;
++
++      /*
++       * Oh well, have to allocate and map a bounce buffer.
++       */
++      gnttab_dma_unmap_page(dev_addr);
++      map = map_single(dev, phys, size, dir);
++      if (!map) {
++              swiotlb_full(dev, size, dir, 1);
++              map = io_tlb_overflow_buffer;
++      }
++
++      dev_addr = swiotlb_virt_to_bus(dev, map);
++
++      /*
++       * Ensure that the address returned is DMA'ble
++       */
++      if (!dma_capable(dev, dev_addr, size)) {
++              swiotlb_tbl_unmap_single(dev, map, size, dir);
++              dev_addr = swiotlb_virt_to_bus(dev, io_tlb_overflow_buffer);
++      }
++
++      return dev_addr;
++}
++EXPORT_SYMBOL_GPL(swiotlb_map_page);
++
++/*
++ * Unmap a single streaming mode DMA translation.  The dma_addr and size must
++ * match what was provided for in a previous swiotlb_map_page call.  All
++ * other usages are undefined.
++ *
++ * After this call, reads by the cpu to the buffer are guaranteed to see
++ * whatever the device wrote there.
++ */
++static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
++                       size_t size, enum dma_data_direction dir)
++{
++      phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
++
++      BUG_ON(dir == DMA_NONE);
++
++      if (is_swiotlb_buffer(dev_addr)) {
++              swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
++              return;
++      }
++
++      gnttab_dma_unmap_page(dev_addr);
++}
++
++/*
++ * Exported entry point for unmapping a single streaming mode DMA
++ * translation.  Forwards to unmap_single(); the attrs argument is not used.
++ */
++void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
++                      size_t size, enum dma_data_direction dir,
++                      struct dma_attrs *attrs)
++{
++      unmap_single(hwdev, dev_addr, size, dir);
++}
++EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
++
++/*
++ * Make physical memory consistent for a single streaming mode DMA translation
++ * after a transfer.
++ *
++ * After a swiotlb_map_page(), if the cpu needs to look at the buffer while
++ * the PCI dma mapping stays in place, this function must be called first.
++ * Before the PCI dma address is given back to the card, a
++ * swiotlb_dma_sync_for_device must be performed; after that the device owns
++ * the buffer again.
++ *
++ * Only addresses inside the swiotlb pool are passed on to
++ * swiotlb_tbl_sync_single(); for any other address this does nothing beyond
++ * the direction check.
++ */
++static void
++swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
++                  size_t size, enum dma_data_direction dir,
++                  enum dma_sync_target target)
++{
++      phys_addr_t bounce_phys = dma_to_phys(hwdev, dev_addr);
++
++      BUG_ON(dir == DMA_NONE);
++
++      if (!is_swiotlb_buffer(dev_addr))
++              return;
++
++      swiotlb_tbl_sync_single(hwdev, phys_to_virt(bounce_phys), size, dir,
++                              target);
++}
++
++/*
++ * Sync a single streaming mode DMA translation for cpu access: forwards to
++ * swiotlb_sync_single() with target SYNC_FOR_CPU.
++ */
++void
++swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++                          size_t size, enum dma_data_direction dir)
++{
++      swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
++}
++EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
++
++/*
++ * Sync a single streaming mode DMA translation for device access: forwards
++ * to swiotlb_sync_single() with target SYNC_FOR_DEVICE.
++ */
++void
++swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
++                             size_t size, enum dma_data_direction dir)
++{
++      swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
++}
++EXPORT_SYMBOL(swiotlb_sync_single_for_device);
++
++/*
++ * Scatter-gather version of the swiotlb_map_page interface: map a set of
++ * buffers described by a scatterlist in streaming mode for DMA.  Each
++ * scatterlist element is tagged with its dma address and length, which are
++ * obtained via sg_dma_{address,length}(SG).
++ *
++ * NOTE: An implementation may be able to use a smaller number of
++ *       DMA address/length pairs than there are SG table elements.
++ *       (for example via virtual mapping capabilities)
++ *       The routine returns the number of addr/length pairs actually
++ *       used, at most nents.
++ *
++ * This implementation returns nelems on success.  When no bounce buffer can
++ * be obtained for an element, the elements handled so far are unmapped
++ * again, sgl[0].dma_length is set to 0 and 0 is returned.
++ *
++ * Device ownership issues as mentioned for swiotlb_map_page are the same
++ * here.
++ */
++int
++swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
++                   enum dma_data_direction dir, struct dma_attrs *attrs)
++{
++      struct scatterlist *sg;
++      int i;
++
++      BUG_ON(dir == DMA_NONE);
++
++      for_each_sg(sgl, sg, nelems, i) {
++              dma_addr_t dev_addr = gnttab_dma_map_page(sg_page(sg))
++                                    + sg->offset;
++              phys_addr_t paddr = page_to_pseudophys(sg_page(sg))
++                                 + sg->offset;
++              void *bounce;
++
++              /* Use the address as is when no bounce buffer is required. */
++              if (!range_needs_mapping(paddr, sg->length) &&
++                  dma_capable(hwdev, dev_addr, sg->length)) {
++                      sg->dma_address = dev_addr;
++                      sg->dma_length = sg->length;
++                      continue;
++              }
++
++              /* Otherwise give up the direct mapping and bounce instead. */
++              gnttab_dma_unmap_page(dev_addr);
++              bounce = map_single(hwdev, paddr, sg->length, dir);
++              if (!bounce) {
++                      /* Don't panic here, we expect map_sg users
++                         to do proper error handling. */
++                      swiotlb_full(hwdev, sg->length, dir, 0);
++                      swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs);
++                      sgl[0].dma_length = 0;
++                      return 0;
++              }
++              sg->dma_address = swiotlb_virt_to_bus(hwdev, bounce);
++              sg->dma_length = sg->length;
++      }
++      return nelems;
++}
++EXPORT_SYMBOL(swiotlb_map_sg_attrs);
++
++/*
++ * Map a scatterlist without DMA attributes: forwards to
++ * swiotlb_map_sg_attrs() with attrs == NULL and returns its result.
++ */
++int
++swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
++             enum dma_data_direction dir)
++{
++      return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
++}
++EXPORT_SYMBOL(swiotlb_map_sg);
++
++/*
++ * Unmap a set of streaming mode DMA translations by calling unmap_single()
++ * on the first nelems elements of the scatterlist.  The cpu read rules are
++ * the same as for swiotlb_unmap_page().  The attrs argument is not used.
++ */
++void
++swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
++                     int nelems, enum dma_data_direction dir, struct dma_attrs *attrs)
++{
++      struct scatterlist *entry;
++      int idx;
++
++      BUG_ON(dir == DMA_NONE);
++
++      for_each_sg(sgl, entry, nelems, idx) {
++              unmap_single(hwdev, entry->dma_address, entry->dma_length,
++                           dir);
++      }
++}
++EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
++
++/*
++ * Unmap a scatterlist without DMA attributes: forwards to
++ * swiotlb_unmap_sg_attrs() with attrs == NULL.
++ */
++void
++swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
++               enum dma_data_direction dir)
++{
++      /* void function: call the helper, do not "return" its (void) value */
++      swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
++}
++EXPORT_SYMBOL(swiotlb_unmap_sg);
++
++/*
++ * Scatter-gather flavour of swiotlb_sync_single_*: make physical memory
++ * consistent for a set of streaming mode DMA translations after a transfer
++ * by syncing each of the first nelems scatterlist elements in turn.  Same
++ * rules and usage as the single-buffer variant.
++ */
++static void
++swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
++              int nelems, enum dma_data_direction dir,
++              enum dma_sync_target target)
++{
++      struct scatterlist *entry;
++      int idx;
++
++      for_each_sg(sgl, entry, nelems, idx) {
++              swiotlb_sync_single(hwdev, entry->dma_address,
++                                  entry->dma_length, dir, target);
++      }
++}
++
++/*
++ * Sync a scatterlist of streaming mode DMA translations for cpu access:
++ * forwards to swiotlb_sync_sg() with target SYNC_FOR_CPU.
++ */
++void
++swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
++                      int nelems, enum dma_data_direction dir)
++{
++      swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
++}
++EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
++
++/*
++ * Sync a scatterlist of streaming mode DMA translations for device access:
++ * forwards to swiotlb_sync_sg() with target SYNC_FOR_DEVICE.
++ */
++void
++swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
++                         int nelems, enum dma_data_direction dir)
++{
++      swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
++}
++EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
++
++/*
++ * Report whether dma_addr equals the bus address of io_tlb_overflow_buffer.
++ * Returns non-zero if it does, 0 otherwise.
++ */
++int
++swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
++{
++      dma_addr_t overflow_addr =
++              swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer);
++
++      return dma_addr == overflow_addr;
++}
++EXPORT_SYMBOL(swiotlb_dma_mapping_error);
++
++/*
++ * Return whether the given PCI device DMA address mask can be supported
++ * properly.  For example, if your device can only drive the low 24-bits
++ * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
++ * this function.
++ *
++ * The mask is accepted when it covers at least the low dma_bits address
++ * bits.  The hwdev argument is not used.
++ */
++int
++swiotlb_dma_supported (struct device *hwdev, u64 mask)
++{
++      /*
++       * Shift in 64 bits to match the u64 mask: 1UL << dma_bits is
++       * undefined once dma_bits reaches the width of long (32 on 32-bit
++       * builds).
++       */
++      return (mask >= ((1ULL << dma_bits) - 1));
++}
++EXPORT_SYMBOL(swiotlb_dma_supported);
diff --cc mm/Kconfig

index 8ca47a5,e9c0c61..ae29ea6
--- 1/mm/Kconfig
--- 2/mm/Kconfig
+++ b/mm/Kconfig
@@@ -304,7 -304,7 +304,7 @@@ config NOMMU_INITIAL_TRIM_EXCES
   
   config TRANSPARENT_HUGEPAGE
         bool "Transparent Hugepage Support"
--      depends on X86 && MMU
++      depends on X86 && !XEN && MMU
         select COMPACTION
         help
           Transparent Hugepages allows the kernel to use huge pages and
@@@ -347,26 -347,3 +347,31 @@@ config NEED_PER_CPU_K
         depends on !SMP
         bool
         default y
+ +
- config CLEANCACHE
-       bool "Enable cleancache driver to cache clean pages if tmem is present"
-       default n
++#
++# support for transcendent memory
++#
++config TMEM
++      bool
++      help
++        In a virtualized environment, allows unused and underutilized
++        system physical memory to be made accessible through a narrow
++        well-defined page-copy-based API.  If unsure, say Y.
++
++config PRECACHE
++      bool "Cache clean pages in transcendent memory"
++      depends on XEN
++      select TMEM
++      help
++        Allows the transcendent memory pool to be used to store clean
++        page-cache pages which, under some circumstances, will greatly
++        reduce paging and thus improve performance.  If unsure, say Y.
++
++config PRESWAP
++      bool "Swap pages to transcendent memory"
++      depends on XEN
++      select TMEM
+ +      help
-         Cleancache can be thought of as a page-granularity victim cache
-         for clean pages that the kernel's pageframe replacement algorithm
-         (PFRA) would like to keep around, but can't since there isn't enough
-         memory.  So when the PFRA "evicts" a page, it first attempts to use
-         cleancacne code to put the data contained in that page into
-         "transcendent memory", memory that is not directly accessible or
-         addressable by the kernel and is of unknown and possibly
-         time-varying size.  And when a cleancache-enabled
-         filesystem wishes to access a page in a file on disk, it first
-         checks cleancache to see if it already contains it; if it does,
-         the page is copied into the kernel and a disk access is avoided.
-         When a transcendent memory driver is available (such as zcache or
-         Xen transcendent memory), a significant I/O reduction
-         may be achieved.  When none is available, all cleancache calls
-         are reduced to a single pointer-compare-against-NULL resulting
-         in a negligible performance hit.
- 
-         If unsure, say Y to enable cleancache
++        Allows the transcendent memory pool to be used as a pseudo-swap
++        device which, under some circumstances, will greatly reduce
++        swapping and thus improve performance.  If unsure, say Y.
diff --cc mm/Makefile

index 836e416,42a8326..ac62a1c
--- 1/mm/Makefile
--- 2/mm/Makefile
+++ b/mm/Makefile
@@@ -25,6 -25,6 +25,9 @@@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock
   
   obj-$(CONFIG_BOUNCE)  += bounce.o
   obj-$(CONFIG_SWAP)    += page_io.o swap_state.o swapfile.o thrash.o
++obj-$(CONFIG_TMEM)    += tmem.o
++obj-$(CONFIG_PRESWAP) += preswap.o
++obj-$(CONFIG_PRECACHE)        += precache.o
   obj-$(CONFIG_HAS_DMA) += dmapool.o
   obj-$(CONFIG_HUGETLBFS)       += hugetlb.o
   obj-$(CONFIG_NUMA)    += mempolicy.o
diff --cc mm/filemap.c

index d7b1057,c641edf..61b8408
--- 1/mm/filemap.c
--- 2/mm/filemap.c
+++ b/mm/filemap.c
@@@ -33,8 -33,7 +33,8 @@@
   #include <linux/cpuset.h>
   #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
   #include <linux/memcontrol.h>
++#include <linux/precache.h>
   #include <linux/mm_inline.h> /* for page_is_file_cache() */
- #include <linux/cleancache.h>
   #include "internal.h"
   
   /*
@@@ -119,16 -118,6 +119,16 @@@ void __delete_from_page_cache(struct pa
   {
         struct address_space *mapping = page->mapping;
   
+ +      /*
-        * if we're uptodate, flush out into the cleancache, otherwise
-        * invalidate any existing cleancache entries.  We can't leave
-        * stale data around in the cleancache once our page is gone
++       * if we're uptodate, flush out into the precache, otherwise
++       * invalidate any existing precache entries.  We can't leave
++       * stale data around in the precache once our page is gone
+ +       */
+ +      if (PageUptodate(page) && PageMappedToDisk(page))
-               cleancache_put_page(page);
++              precache_put(mapping, page->index, page);
+ +      else
-               cleancache_flush_page(mapping, page);
++              precache_flush(mapping, page->index);
+ +
         radix_tree_delete(&mapping->page_tree, page->index);
         page->mapping = NULL;
         mapping->nrpages--;
diff --cc mm/init-mm.c

index 4019979,1d29cdf..78de859
--- 1/mm/init-mm.c
--- 2/mm/init-mm.c
+++ b/mm/init-mm.c
@@@ -13,6 -13,6 +13,10 @@@
   #define INIT_MM_CONTEXT(name)
   #endif
   
++#ifdef CONFIG_X86_XEN
++#define swapper_pg_dir ((pgd_t *)NULL)
++#endif
++
   struct mm_struct init_mm = {
         .mm_rb          = RB_ROOT,
         .pgd            = swapper_pg_dir,
diff --cc mm/memory.c

index 6953d39,61e66f0..9edd166
--- 1/mm/memory.c
--- 2/mm/memory.c
+++ b/mm/memory.c
@@@ -801,6 -604,6 +604,12 @@@ struct page *vm_normal_page(struct vm_a
   {
         unsigned long pfn = pte_pfn(pte);
   
++#if defined(CONFIG_XEN) && defined(CONFIG_X86)
++      /* XEN: Covers user-space grant mappings (even of local pages). */
++      if (unlikely(vma->vm_flags & VM_FOREIGN))
++              return NULL;
++#endif
++
         if (HAVE_PTE_SPECIAL) {
                 if (likely(!pte_special(pte)))
                         goto check_pfn;
@@@ -832,6 -635,6 +641,9 @@@
                 return NULL;
   check_pfn:
         if (unlikely(pfn > highest_memmap_pfn)) {
++#ifdef CONFIG_XEN
++              if (!(vma->vm_flags & VM_RESERVED))
++#endif
                 print_bad_pte(vma, addr, pte, NULL);
                 return NULL;
         }
@@@ -1146,8 -951,8 +960,14 @@@ static unsigned long zap_pte_range(stru
                                      page->index > details->last_index))
                                         continue;
                         }
--                      ptent = ptep_get_and_clear_full(mm, addr, pte,
--                                                      tlb->fullmm);
++#ifdef CONFIG_XEN
++                      if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte))
++                              ptent = vma->vm_ops->zap_pte(vma, addr, pte,
++                                                           tlb->fullmm);
++                      else
++#endif
++                              ptent = ptep_get_and_clear_full(mm, addr, pte,
++                                                              tlb->fullmm);
                         tlb_remove_tlb_entry(tlb, pte, addr);
                         if (unlikely(!page))
                                 continue;
@@@ -1387,12 -1222,13 +1237,14 @@@ unsigned long zap_page_range(struct vm_
         unsigned long nr_accounted = 0;
   
         lru_add_drain();
-       tlb_gather_mmu(&tlb, mm, 0);
+       tlb = tlb_gather_mmu(mm, 0);
         update_hiwater_rss(mm);
         end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
-       tlb_finish_mmu(&tlb, address, end);
+       if (tlb)
+               tlb_finish_mmu(tlb, address, end);
         return end;
   }
++EXPORT_SYMBOL(zap_page_range);
   
   /**
    * zap_vma_ptes - remove ptes mapping the vma
@@@ -1703,6 -1539,6 +1555,28 @@@ int __get_user_pages(struct task_struc
                         goto next_page;
                 }
   
++#ifdef CONFIG_XEN
++              if (vma && (vma->vm_flags & VM_FOREIGN)) {
++                      struct vm_foreign_map *foreign_map =
++                              vma->vm_private_data;
++                      struct page **map = foreign_map->map;
++                      int offset = (start - vma->vm_start) >> PAGE_SHIFT;
++                      if (map[offset] != NULL) {
++                              if (pages) {
++                                      struct page *page = map[offset];
++
++                                      pages[i] = page;
++                                      get_page(page);
++                              }
++                              if (vmas)
++                                      vmas[i] = vma;
++                              i++;
++                              start += PAGE_SIZE;
++                              nr_pages--;
++                              continue;
++                      }
++              }
++#endif
                 if (!vma ||
                     (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
                     !(vm_flags & vma->vm_flags))
@@@ -2333,6 -2169,6 +2207,10 @@@ int apply_to_page_range(struct mm_struc
         unsigned long end = addr + size;
         int err;
   
++#ifdef CONFIG_XEN
++      if (!mm)
++              mm = &init_mm;
++#endif
         BUG_ON(addr >= end);
         pgd = pgd_offset(mm, addr);
         do {
diff --cc mm/mmap.c

index bbdc9af,772140c..58c93ff
--- 1/mm/mmap.c
--- 2/mm/mmap.c
+++ b/mm/mmap.c
@@@ -1900,11 -1927,11 +1927,19 @@@ static void unmap_region(struct mm_stru
         update_hiwater_rss(mm);
         unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
         vm_unacct_memory(nr_accounted);
-       free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
-                                next ? next->vm_start : 0);
-       tlb_finish_mmu(&tlb, start, end);
+       free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+                                next? next->vm_start: 0);
+       tlb_finish_mmu(tlb, start, end);
+ }
+ 
++/*
++ * Give the vma's owner a chance to react to the vma being unmapped: with
++ * CONFIG_XEN, call vma->vm_ops->unmap(vma) if that hook is set.  Without
++ * CONFIG_XEN this is an empty function.
++ */
++static inline void unmap_vma(struct vm_area_struct *vma)
++{
++#ifdef CONFIG_XEN
++      if (unlikely(vma->vm_ops && vma->vm_ops->unmap))
++              vma->vm_ops->unmap(vma);
++#endif
+ +}
+ +
   /*
    * Create a list of vma's touched by the unmap, removing them from the mm's
    * vma list as we go..
@@@ -1921,6 -1948,6 +1956,7 @@@ detach_vmas_to_be_unmapped(struct mm_st
         vma->vm_prev = NULL;
         do {
                 rb_erase(&vma->vm_rb, &mm->mm_rb);
++              unmap_vma(vma);
                 mm->map_count--;
                 tail_vma = vma;
                 vma = vma->vm_next;
@@@ -2263,6 -2290,6 +2299,11 @@@ void exit_mmap(struct mm_struct *mm
   
         arch_exit_mmap(mm);
   
++#ifdef CONFIG_XEN
++      for (vma = mm->mmap; vma; vma = vma->vm_next)
++              unmap_vma(vma);
++#endif
++
         vma = mm->mmap;
         if (!vma)       /* Can happen if dup_mmap() received an OOM */
                 return;
diff --cc mm/page_alloc.c

index 73d5fa0,3f8bce2..f4bf704
--- 1/mm/page_alloc.c
--- 2/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@@ -653,6 -650,6 +650,13 @@@ static bool free_pages_prepare(struct p
         int i;
         int bad = 0;
   
++#ifdef CONFIG_XEN
++      if (PageForeign(page)) {
++              PageForeignDestructor(page, order);
++              return false;
++      }
++#endif
++
         trace_mm_page_free_direct(page, order);
         kmemcheck_free_shadow(page, order);
   
@@@ -679,6 -676,6 +683,9 @@@ static void __free_pages_ok(struct pag
         unsigned long flags;
         int wasMlocked = __TestClearPageMlocked(page);
   
++#ifdef CONFIG_XEN
++      WARN_ON(PageForeign(page) && wasMlocked);
++#endif
         if (!free_pages_prepare(page, order))
                 return;
   
@@@ -1169,6 -1166,6 +1176,9 @@@ void free_hot_cold_page(struct page *pa
         int migratetype;
         int wasMlocked = __TestClearPageMlocked(page);
   
++#ifdef CONFIG_XEN
++      WARN_ON(PageForeign(page) && wasMlocked);
++#endif
         if (!free_pages_prepare(page, 0))
                 return;
   
@@@ -2224,7 -2176,27 +2189,33 @@@ rebalance
         }
   
   nopage:
-       warn_alloc_failed(gfp_mask, order, NULL);
+       if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
+               unsigned int filter = SHOW_MEM_FILTER_NODES;
+ 
+               /*
+                * This documents exceptions given to allocations in certain
+                * contexts that are allowed to allocate outside current's set
+                * of allowed nodes.
+                */
+               if (!(gfp_mask & __GFP_NOMEMALLOC))
+                       if (test_thread_flag(TIF_MEMDIE) ||
+                           (current->flags & (PF_MEMALLOC | PF_EXITING)))
+                               filter &= ~SHOW_MEM_FILTER_NODES;
+               if (in_interrupt() || !wait)
+                       filter &= ~SHOW_MEM_FILTER_NODES;
+ 
- -              pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n",
++              if (!wait) {
++                      pr_info("The following is only an harmless informational message.\n");
++                      pr_info("Unless you get a _continuous_flood_ of these messages it means\n");
++                      pr_info("everything is working fine. Allocations from irqs cannot be\n");
++                      pr_info("perfectly reliable and the kernel is designed to handle that.\n");
++              }
++              pr_info("%s: page allocation failure. order:%d, mode:0x%x\n",
+                       current->comm, order, gfp_mask);
+               dump_stack();
+               if (!should_suppress_show_mem())
+                       show_mem(filter);
+       }
         return page;
   got_pg:
         if (kmemcheck_enabled)
@@@ -5113,6 -5074,6 +5093,22 @@@ void setup_per_zone_wmarks(void
                 spin_unlock_irqrestore(&zone->lock, flags);
         }
   
++#ifdef CONFIG_XEN
++      for_each_populated_zone(zone) {
++              unsigned int cpu;
++
++              for_each_online_cpu(cpu) {
++                      unsigned long high;
++
++                      high = percpu_pagelist_fraction
++                             ? zone->present_pages / percpu_pagelist_fraction
++                             : 5 * zone_batchsize(zone);
++                      setup_pagelist_highmark(
++                              per_cpu_ptr(zone->pageset, cpu), high);
++              }
++      }
++#endif
++
         /* update totalreserve_pages */
         calculate_totalreserve_pages();
   }
diff --cc mm/page_io.c

index dc76b4d,dc76b4d..5ee0adb
--- 1/mm/page_io.c
--- 2/mm/page_io.c
+++ b/mm/page_io.c
@@@ -98,6 -98,6 +98,14 @@@ int swap_writepage(struct page *page, s
                 unlock_page(page);
                 goto out;
         }
++
++      if (preswap_put(page) == 1) {
++              set_page_writeback(page);
++              unlock_page(page);
++              end_page_writeback(page);
++              goto out;
++      }
++
         bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
         if (bio == NULL) {
                 set_page_dirty(page);
@@@ -122,6 -122,6 +130,13 @@@ int swap_readpage(struct page *page
   
         VM_BUG_ON(!PageLocked(page));
         VM_BUG_ON(PageUptodate(page));
++
++      if (preswap_get(page) == 1) {
++              SetPageUptodate(page);
++              unlock_page(page);
++              goto out;
++      }
++
         bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
         if (bio == NULL) {
                 unlock_page(page);
diff --cc mm/precache.c

index 0000000,0000000..6fe975e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/mm/precache.c
@@@ -1,0 -1,0 +1,179 @@@
++/*
++ * linux/mm/precache.c
++ *
++ * Implements "precache" for filesystems/pagecache on top of transcendent
++ * memory ("tmem") API.  A filesystem creates an "ephemeral tmem pool"
++ * and retains the returned pool_id in its superblock.  Clean pages evicted
++ * from pagecache may be "put" into the pool and associated with a "handle"
++ * consisting of the pool_id, an object (inode) id, and an index (page offset).
++ * Note that the page is copied to tmem; no kernel mappings are changed.
++ * If the page is later needed, the filesystem (or VFS) issues a "get", passing
++ * the same handle and an empty pageframe.  If successful, the page is copied
++ * into the pageframe and a disk read is avoided.  But since the tmem pool
++ * is of indeterminate size, a "put" page has indeterminate longevity
++ * ("ephemeral"), and the "get" may fail, in which case the filesystem must
++ * read the page from disk as before.  Note that the filesystem/pagecache are
++ * responsible for maintaining coherency between the pagecache, precache,
++ * and the disk, for which "flush page" and "flush object" actions are
++ * provided.  And when a filesystem is unmounted, it must "destroy" the pool.
++ *
++ * Two types of pools may be created for a precache: "private" or "shared".
++ * For a private pool, a successful "get" always flushes, implementing
++ * exclusive semantics; for a "shared" pool (which is intended for use by
++ * co-resident nodes of a cluster filesystem), the "flush" is not guaranteed.
++ * In either case, a failed "duplicate" put (overwrite) always guarantees
++ * the old data is flushed.
++ *
++ * Note also that multiple accesses to a tmem pool may be concurrent and any
++ * ordering must be guaranteed by the caller.
++ *
++ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
++ */
++
++#include <linux/precache.h>
++#include <linux/exportfs.h>
++#include <linux/module.h>
++#include "tmem.h"
++
++static int precache_auto_allocate; /* set to 1 to auto_allocate */
++
++/*
++ * Object key handed to tmem.  precache_get_key() fills it either through
++ * fh (the u32 words written by the filesystem's encode_fh) or through
++ * oid (inode number and generation); the tmem calls always receive .oid.
++ */
++union precache_filekey {
++      struct tmem_oid oid;
++      u32 fh[0];
++};
++
++/*
++ * Build the tmem object key for an inode.
++ *
++ * If the filesystem uses exportable filehandles, use the filehandle as
++ * the key, else use the inode number (and generation).
++ *
++ * Returns 0 with *key filled in, the negative value returned by encode_fh,
++ * or -EPERM when encode_fh reports a type >= 255 or needs more room than
++ * the key provides.
++ */
++static int precache_get_key(struct inode *inode, union precache_filekey *key)
++{
++#define PRECACHE_KEY_MAX (sizeof(key->oid) / sizeof(*key->fh))
++      struct super_block *sb = inode->i_sb;
++
++      /* Clear the whole key; sizeof(key) would only be the pointer size. */
++      memset(key, 0, sizeof(*key));
++      if (sb->s_export_op) {
++              int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
++
++              fhfn = sb->s_export_op->encode_fh;
++              if (fhfn) {
++                      struct dentry *d;
++                      int ret, maxlen = PRECACHE_KEY_MAX;
++
++                      /*
++                       * NOTE(review): list_first_entry() does not check for
++                       * an empty list - confirm every caller guarantees
++                       * that the inode still has a dentry alias.
++                       */
++                      d = list_first_entry(&inode->i_dentry,
++                                           struct dentry, d_alias);
++                      ret = fhfn(d, key->fh, &maxlen, 0);
++                      if (ret < 0)
++                              return ret;
++                      if (ret >= 255 || maxlen > PRECACHE_KEY_MAX)
++                              return -EPERM;
++                      if (maxlen > 0)
++                              return 0;
++              }
++      }
++      /* No usable filehandle: key on inode number and generation. */
++      key->oid.oid[0] = inode->i_ino;
++      key->oid.oid[1] = inode->i_generation;
++      return 0;
++#undef PRECACHE_KEY_MAX
++}
++
++/*
++ * Offer a page to the precache pool of the page's filesystem.
++ *
++ * If the superblock has no pool (negative pool id), a pool is created on
++ * demand only when precache_auto_allocate is set; otherwise nothing is done.
++ * Returns 0 when the put is skipped (no pool, index does not fit in 32 bits,
++ * or no key could be built), else the result of tmem_put_page().
++ */
++int precache_put(struct address_space *mapping, unsigned long index,
++               struct page *page)
++{
++      u32 tmem_pool = mapping->host->i_sb->precache_poolid;
++      union precache_filekey key;
++      u32 ind = (u32) index;
++      unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
++      int ret;
++
++      if ((s32)tmem_pool < 0) {
++              if (!precache_auto_allocate)
++                      return 0;
++              /* a put on a non-existent precache may auto-allocate one */
++              ret = tmem_new_pool(0, 0, 0);
++              if (ret < 0)
++                      return 0;
++              /* switch to the new pool; tmem_pool still held the old id */
++              tmem_pool = ret;
++              pr_info("Mapping superblock for s_id=%s to precache_id=%d\n",
++                      mapping->host->i_sb->s_id, tmem_pool);
++              mapping->host->i_sb->precache_poolid = tmem_pool;
++      }
++      if (ind != index || precache_get_key(mapping->host, &key))
++              return 0;
++      mb(); /* ensure page is quiescent; tmem may address it with an alias */
++      return tmem_put_page(tmem_pool, key.oid, ind, mfn);
++}
++
++/*
++ * Try to fill empty_page from the precache pool of the mapping's filesystem.
++ *
++ * Returns 0 without calling tmem when the superblock has no pool (negative
++ * pool id), the index does not fit in 32 bits, or no key could be built;
++ * otherwise returns the result of tmem_get_page().
++ */
++int precache_get(struct address_space *mapping, unsigned long index,
++               struct page *empty_page)
++{
++      u32 pool = mapping->host->i_sb->precache_poolid;
++      union precache_filekey key;
++      u32 index32 = (u32) index;
++      unsigned long mfn = pfn_to_mfn(page_to_pfn(empty_page));
++
++      if ((s32)pool < 0 || index32 != index ||
++          precache_get_key(mapping->host, &key))
++              return 0;
++
++      return tmem_get_page(pool, key.oid, index32, mfn);
++}
++EXPORT_SYMBOL(precache_get);
++
++/*
++ * Flush one page (identified by mapping and index) from the precache pool
++ * of the mapping's filesystem.
++ *
++ * Returns 0 without calling tmem when the superblock has no pool (negative
++ * pool id), the index does not fit in 32 bits, or no key could be built;
++ * otherwise returns the result of tmem_flush_page().
++ */
++int precache_flush(struct address_space *mapping, unsigned long index)
++{
++      u32 pool = mapping->host->i_sb->precache_poolid;
++      union precache_filekey key;
++      u32 index32 = (u32) index;
++
++      if ((s32)pool < 0 || index32 != index ||
++          precache_get_key(mapping->host, &key))
++              return 0;
++
++      return tmem_flush_page(pool, key.oid, index32);
++}
++EXPORT_SYMBOL(precache_flush);
++
++/*
++ * Flush the whole tmem object belonging to the mapping's inode.
++ *
++ * Returns 0 without calling tmem when the superblock has no pool (negative
++ * pool id) or no key could be built; otherwise returns the result of
++ * tmem_flush_object().
++ */
++int precache_flush_inode(struct address_space *mapping)
++{
++      u32 pool = mapping->host->i_sb->precache_poolid;
++      union precache_filekey key;
++
++      if ((s32)pool < 0)
++              return 0;
++      if (precache_get_key(mapping->host, &key))
++              return 0;
++
++      return tmem_flush_object(pool, key.oid);
++}
++EXPORT_SYMBOL(precache_flush_inode);
++
++/*
++ * Destroy the precache pool of a superblock.
++ *
++ * Returns 0 when the superblock has no pool (negative pool id) or
++ * tmem_destroy_pool() returns 0; otherwise marks the superblock as having
++ * no pool and returns 1.
++ */
++int precache_flush_filesystem(struct super_block *sb)
++{
++      u32 tmem_pool = sb->precache_poolid;
++      int ret;
++
++      if ((s32)tmem_pool < 0)
++              return 0;
++      ret = tmem_destroy_pool(tmem_pool);
++      if (!ret)
++              return 0;
++      /* log the pool id that was unmapped, matching the label */
++      pr_info("Unmapping superblock for s_id=%s from precache_id=%d\n",
++              sb->s_id, tmem_pool);
++      /*
++       * Every precache entry point treats a negative id as "no pool";
++       * 0 would pass those checks and keep being used as a pool id.
++       */
++      sb->precache_poolid = -1;
++      return 1;
++}
++EXPORT_SYMBOL(precache_flush_filesystem);
++
++/*
++ * Create a precache pool for a superblock: stores the result of
++ * tmem_new_pool(0, 0, 0) in sb->precache_poolid without checking it.  The
++ * other precache entry points treat a negative id as "no pool".
++ */
++void precache_init(struct super_block *sb)
++{
++      sb->precache_poolid = tmem_new_pool(0, 0, 0);
++}
++EXPORT_SYMBOL(precache_init);
++
++/*
++ * Create a shared precache pool for a superblock.  The 16 bytes at uuid are
++ * read as two u64 values (bytes 0-7 and 8-15) and passed to tmem_new_pool()
++ * together with TMEM_POOL_SHARED; the result is stored in
++ * sb->precache_poolid without being checked.
++ *
++ * NOTE(review): the u64 loads go through a cast of a char pointer, so this
++ * presumably relies on uuid being suitably aligned - confirm against callers.
++ */
++void shared_precache_init(struct super_block *sb, char *uuid)
++{
++      u64 uuid_lo = *(u64 *)uuid;
++      u64 uuid_hi = *(u64 *)(&uuid[8]);
++      sb->precache_poolid = tmem_new_pool(uuid_lo, uuid_hi, TMEM_POOL_SHARED);
++}
++EXPORT_SYMBOL(shared_precache_init);
diff --cc mm/preswap.c

index 0000000,0000000..756f4bc

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/mm/preswap.c
@@@ -1,0 -1,0 +1,183 @@@
++/*
++ * linux/mm/preswap.c
++ *
++ * Implements a fast "preswap" on top of the transcendent memory ("tmem") API.
++ * When a swapdisk is enabled (with swapon), a "private persistent tmem pool"
++ * is created along with a bit-per-page preswap_map.  When swapping occurs
++ * and a page is about to be written to disk, a "put" into the pool may first
++ * be attempted by passing the pageframe to be swapped, along with a "handle"
++ * consisting of a pool_id, an object id, and an index.  Since the pool is of
++ * indeterminate size, the "put" may be rejected, in which case the page
++ * is swapped to disk as normal.  If the "put" is successful, the page is
++ * copied to tmem and the preswap_map records the success.  Later, when
++ * the page needs to be swapped in, the preswap_map is checked and, if set,
++ * the page may be obtained with a "get" operation.  Note that the swap
++ * subsystem is responsible for: maintaining coherency between the swapcache,
++ * preswap, and the swapdisk; for evicting stale pages from preswap; and for
++ * emptying preswap when swapoff is performed. The "flush page" and "flush
++ * object" actions are provided for this.
++ *
++ * Note that if a "duplicate put" is performed to overwrite a page and
++ * the "put" operation fails, the page (and old data) is flushed and lost.
++ * Also note that multiple accesses to a tmem pool may be concurrent and
++ * any ordering must be guaranteed by the caller.
++ *
++ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
++ */
++
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/sysctl.h>
++#include <linux/swap.h>
++#include <linux/swapops.h>
++#include <linux/proc_fs.h>
++#include <linux/security.h>
++#include <linux/capability.h>
++#include <linux/uaccess.h>
++#include "tmem.h"
++
++static u32 preswap_poolid = -1; /* if negative, preswap will never call tmem */
++
++const unsigned long preswap_zero = 0, preswap_infinity = ~0UL; /* for sysctl */
++
++/*
++ * Swizzling increases objects per swaptype, increasing tmem concurrency
++ * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
++ *
++ * oswiz() builds the object id from the swap type and the low SWIZ_BITS
++ * bits of the page index; iswiz() yields the remaining high bits, which are
++ * used as the index within that object.  Macro arguments are parenthesized
++ * so that expression arguments expand with the intended precedence.
++ */
++#define SWIZ_BITS             4
++#define SWIZ_MASK             ((1 << SWIZ_BITS) - 1)
++#define oswiz(_type, _ind)    ((struct tmem_oid){ \
++      .oid[0] = ((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK) })
++#define iswiz(_ind)           ((_ind) >> SWIZ_BITS)
++
++/*
++ * preswap_map test/set/clear operations (must be atomic)
++ */
++
++/*
++ * Return the preswap_map bit for @offset, or 0 when @sis has no
++ * preswap_map.
++ */
++int preswap_test(struct swap_info_struct *sis, unsigned long offset)
++{
++      unsigned long *word;
++
++      if (sis->preswap_map == NULL)
++              return 0;
++      word = &sis->preswap_map[offset / BITS_PER_LONG];
++      return test_bit(offset % BITS_PER_LONG, word);
++}
++
++/* Set the preswap_map bit for @offset; no-op when @sis has no map. */
++static inline void preswap_set(struct swap_info_struct *sis,
++                              unsigned long offset)
++{
++      unsigned long *word;
++
++      if (sis->preswap_map == NULL)
++              return;
++      word = &sis->preswap_map[offset / BITS_PER_LONG];
++      set_bit(offset % BITS_PER_LONG, word);
++}
++
++/* Clear the preswap_map bit for @offset; no-op when @sis has no map. */
++static inline void preswap_clear(struct swap_info_struct *sis,
++                              unsigned long offset)
++{
++      unsigned long *word;
++
++      if (sis->preswap_map == NULL)
++              return;
++      word = &sis->preswap_map[offset / BITS_PER_LONG];
++      clear_bit(offset % BITS_PER_LONG, word);
++}
++
++/*
++ * preswap tmem operations
++ */
++
++/*
++ * Try to put the page for the swap entry stored in page_private(@page)
++ * into preswap.
++ *
++ * returns 1 if the page was successfully put into preswap, 0 if the page
++ * was declined, and -ERRNO for a specific error.  0 is also returned,
++ * without calling tmem, when preswap is disabled (negative pool id) or the
++ * swap offset does not fit in 32 bits.
++ */
++int preswap_put(struct page *page)
++{
++      swp_entry_t entry = { .val = page_private(page), };
++      unsigned type = swp_type(entry);
++      pgoff_t offset = swp_offset(entry);
++      u64 ind64 = (u64)offset;
++      u32 ind = (u32)offset;
++      unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
++      struct swap_info_struct *sis = get_swap_info_struct(type);
++      int dup = 0, ret;
++
++      if ((s32)preswap_poolid < 0)
++              return 0;
++      /* the tmem index is 32 bits; skip offsets that would be truncated */
++      if (ind64 != ind)
++              return 0;
++      /* a set bit means this put overwrites a page already in preswap */
++      if (preswap_test(sis, offset))
++              dup = 1;
++      mb(); /* ensure page is quiescent; tmem may address it with an alias */
++      ret = tmem_put_page(preswap_poolid, oswiz(type, ind), iswiz(ind), mfn);
++      if (ret == 1) {
++              preswap_set(sis, offset);
++              /* only a first-time put adds to the per-area page count */
++              if (!dup)
++                      sis->preswap_pages++;
++      } else if (dup) {
++              /* failed dup put always results in an automatic flush of
++               * the (older) page from preswap */
++              preswap_clear(sis, offset);
++              sis->preswap_pages--;
++      }
++      return ret;
++}
++
++/*
++ * Fetch the page for the swap entry stored in page_private(@page) back
++ * from preswap.
++ *
++ * Returns 1 on success, 0 if the page was not present (should never
++ * happen!), and -ERRNO for a specific error.  0 is also returned, without
++ * calling tmem, when preswap is disabled, the swap offset does not fit in
++ * 32 bits, or the preswap_map bit for the offset is clear.
++ */
++int preswap_get(struct page *page)
++{
++      swp_entry_t swp = { .val = page_private(page), };
++      unsigned area = swp_type(swp);
++      pgoff_t slot = swp_offset(swp);
++      u32 idx = (u32)slot;
++      unsigned long frame = pfn_to_mfn(page_to_pfn(page));
++      struct swap_info_struct *info = get_swap_info_struct(area);
++
++      if ((s32)preswap_poolid < 0 || (u64)slot != idx)
++              return 0;
++      if (!preswap_test(info, slot))
++              return 0;
++      return tmem_get_page(preswap_poolid, oswiz(area, idx), iswiz(idx),
++                           frame);
++}
++
++/*
++ * flush a single page from preswap
++ *
++ * Does nothing when preswap is disabled, when @offset does not fit in 32
++ * bits, or when the preswap_map bit for @offset is clear.  The result of
++ * the tmem flush is deliberately discarded: the map bit and the page count
++ * are updated regardless.
++ */
++void preswap_flush(unsigned type, unsigned long offset)
++{
++      u64 ind64 = (u64)offset;
++      u32 ind = (u32)offset;
++      struct swap_info_struct *sis = get_swap_info_struct(type);
++
++      if ((s32)preswap_poolid < 0)
++              return;
++      if (ind64 != ind)
++              return;
++      if (preswap_test(sis, offset)) {
++              (void)tmem_flush_page(preswap_poolid,
++                                      oswiz(type, ind), iswiz(ind));
++              sis->preswap_pages--;
++              preswap_clear(sis, offset);
++      }
++}
++
++/*
++ * Flush every preswap page of swap area @type: one tmem object flush per
++ * swizzle value (SWIZ_MASK down to 0), then reset the area's page count.
++ * Does nothing when preswap is disabled.
++ */
++void preswap_flush_area(unsigned type)
++{
++      struct swap_info_struct *info = get_swap_info_struct(type);
++      int swiz = SWIZ_MASK;
++
++      if ((s32)preswap_poolid < 0)
++              return;
++      while (swiz >= 0) {
++              (void)tmem_flush_object(preswap_poolid, oswiz(type, swiz));
++              swiz--;
++      }
++      info->preswap_pages = 0;
++}
++
++/*
++ * Create the persistent tmem pool used by preswap, unless a pool with a
++ * non-negative id already exists.  A single pool serves all swap types, so
++ * @type is not used.
++ */
++void preswap_init(unsigned type)
++{
++      if ((s32)preswap_poolid < 0)
++              preswap_poolid = tmem_new_pool(0, 0, TMEM_POOL_PERSIST);
++}
diff --cc mm/swapfile.c

index d537d29,8c6b3ce..5e5a9d0
--- 1/mm/swapfile.c
--- 2/mm/swapfile.c
+++ b/mm/swapfile.c
@@@ -557,6 -556,6 +556,7 @@@ static unsigned char swap_entry_free(st
                         swap_list.next = p->type;
                 nr_swap_pages++;
                 p->inuse_pages--;
++              preswap_flush(p->type, offset);
                 if ((p->flags & SWP_BLKDEV) &&
                                 disk->fops->swap_slot_free_notify)
                         disk->fops->swap_slot_free_notify(p->bdev, offset);
@@@ -1022,7 -1021,7 +1022,7 @@@ static int unuse_mm(struct mm_struct *m
    * Recycle to start on reaching the end, returning 0 when empty.
    */
   static unsigned int find_next_to_unuse(struct swap_info_struct *si,
--                                      unsigned int prev)
++                              unsigned int prev, unsigned int preswap)
   {
         unsigned int max = si->max;
         unsigned int i = prev;
@@@ -1048,6 -1047,6 +1048,12 @@@
                         prev = 0;
                         i = 1;
                 }
++              if (preswap) {
++                      if (preswap_test(si, i))
++                              break;
++                      else
++                              continue;
++              }
                 count = si->swap_map[i];
                 if (count && swap_count(count) != SWAP_MAP_BAD)
                         break;
@@@ -1059,8 -1058,8 +1065,12 @@@
    * We completely avoid races by reading each swap page in advance,
    * and then search for the process using it.  All the necessary
    * page table adjustments can then be made atomically.
++ *
++ * if the boolean preswap is true, only unuse pages_to_unuse pages;
++ * pages_to_unuse==0 means all pages
    */
--static int try_to_unuse(unsigned int type)
++static int try_to_unuse(unsigned int type, unsigned int preswap,
++              unsigned long pages_to_unuse)
   {
         struct swap_info_struct *si = swap_info[type];
         struct mm_struct *start_mm;
@@@ -1093,7 -1092,7 +1103,7 @@@
          * one pass through swap_map is enough, but not necessarily:
          * there are races when an instance of an entry might be missed.
          */
--      while ((i = find_next_to_unuse(si, i)) != 0) {
++      while ((i = find_next_to_unuse(si, i, preswap)) != 0) {
                 if (signal_pending(current)) {
                         retval = -EINTR;
                         break;
@@@ -1260,6 -1259,6 +1270,8 @@@
                  * interactive performance.
                  */
                 cond_resched();
++              if (preswap && pages_to_unuse && !--pages_to_unuse)
++                      break;
         }
   
         mmput(start_mm);
@@@ -1518,8 -1517,8 +1530,12 @@@ bad_bmap
         goto out;
   }
   
++#ifndef CONFIG_PRESWAP
++#define enable_swap_info(p, prio, sm, psm) enable_swap_info(p, prio, sm)
++#endif
   static void enable_swap_info(struct swap_info_struct *p, int prio,
--                              unsigned char *swap_map)
++                           unsigned char *swap_map,
++                           unsigned long *preswap_map)
   {
         int i, prev;
   
@@@ -1529,6 -1528,6 +1545,9 @@@
         else
                 p->prio = --least_priority;
         p->swap_map = swap_map;
++#ifdef CONFIG_PRESWAP
++      p->preswap_map = preswap_map;
++#endif
         p->flags |= SWP_WRITEOK;
         nr_swap_pages += p->pages;
         total_swap_pages += p->pages;
@@@ -1545,6 -1544,6 +1564,7 @@@
                 swap_list.head = swap_list.next = p->type;
         else
                 swap_info[prev]->next = p->type;
++      preswap_init(p->type);
         spin_unlock(&swap_lock);
   }
   
@@@ -1615,9 -1613,9 +1634,9 @@@ SYSCALL_DEFINE1(swapoff, const char __u
         p->flags &= ~SWP_WRITEOK;
         spin_unlock(&swap_lock);
   
-       oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
-       err = try_to_unuse(type);
-       test_set_oom_score_adj(oom_score_adj);
+       current->flags |= PF_OOM_ORIGIN;
- -      err = try_to_unuse(type);
++      err = try_to_unuse(type, 0, 0);
+       current->flags &= ~PF_OOM_ORIGIN;
   
         if (err) {
                 /*
@@@ -1627,7 -1625,7 +1646,7 @@@
                  * sys_swapoff for this swap_info_struct at this point.
                  */
                 /* re-insert swap space back into swap_list */
--              enable_swap_info(p, p->prio, p->swap_map);
++              enable_swap_info(p, p->prio, p->swap_map, p->preswap_map);
                 goto out_dput;
         }
   
@@@ -1653,9 -1651,9 +1672,14 @@@
         swap_map = p->swap_map;
         p->swap_map = NULL;
         p->flags = 0;
++      preswap_flush_area(type);
         spin_unlock(&swap_lock);
         mutex_unlock(&swapon_mutex);
         vfree(swap_map);
++#ifdef CONFIG_PRESWAP
++      if (p->preswap_map)
++              vfree(p->preswap_map);
++#endif
         /* Destroy swap account informatin */
         swap_cgroup_swapoff(type);
   
@@@ -2028,6 -2026,6 +2052,7 @@@ SYSCALL_DEFINE2(swapon, const char __us
         sector_t span;
         unsigned long maxpages;
         unsigned char *swap_map = NULL;
++      unsigned long *preswap_map = NULL;
         struct page *page = NULL;
         struct inode *inode = NULL;
   
@@@ -2098,6 -2096,6 +2123,10 @@@
                 goto bad_swap;
         }
   
++#ifdef CONFIG_PRESWAP
++      /*
++       * preswap_map holds one bit per swap page and is accessed one
++       * unsigned long at a time, so size it in whole longs, rounded up.
++       */
++      preswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
++#endif
++
         error = swap_cgroup_swapon(p->type, maxpages);
         if (error)
                 goto bad_swap;
@@@ -2123,7 -2121,7 +2152,7 @@@
         if (swap_flags & SWAP_FLAG_PREFER)
                 prio =
                   (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
--      enable_swap_info(p, prio, swap_map);
++      enable_swap_info(p, prio, swap_map, preswap_map);
   
         printk(KERN_INFO "Adding %uk swap on %s.  "
                         "Priority:%d extents:%d across:%lluk %s%s\n",
@@@ -2151,6 -2149,6 +2180,7 @@@ bad_swap
         p->swap_file = NULL;
         p->flags = 0;
         spin_unlock(&swap_lock);
++      vfree(preswap_map);
         vfree(swap_map);
         if (swap_file) {
                 if (inode && S_ISREG(inode->i_mode)) {
@@@ -2321,6 -2319,6 +2351,10 @@@ int valid_swaphandles(swp_entry_t entry
                 base++;
   
         spin_lock(&swap_lock);
++      if (preswap_test(si, target)) {
++              spin_unlock(&swap_lock);
++              return 0;
++      }
         if (end > si->max)      /* don't go beyond end of map */
                 end = si->max;
   
@@@ -2331,6 -2329,6 +2365,9 @@@
                         break;
                 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
                         break;
++              /* Don't read in preswap pages */
++              if (preswap_test(si, toff))
++                      break;
         }
         /* Count contiguous allocated slots below our target */
         for (toff = target; --toff >= base; nr_pages++) {
@@@ -2339,6 -2337,6 +2376,9 @@@
                         break;
                 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
                         break;
++              /* Don't read in preswap pages */
++              if (preswap_test(si, toff))
++                      break;
         }
         spin_unlock(&swap_lock);
   
@@@ -2565,3 -2563,3 +2605,98 @@@ static void free_swap_count_continuatio
                 }
         }
   }
++
++#ifdef CONFIG_PRESWAP
++/*
++ * preswap infrastructure functions
++ */
++
++/*
++ * Return the swap_info[] entry for swap area @type.  @type is used as an
++ * index into swap_info[], so it must be strictly below MAX_SWAPFILES;
++ * anything else is a caller bug.
++ */
++struct swap_info_struct *get_swap_info_struct(unsigned int type)
++{
++      BUG_ON(type >= MAX_SWAPFILES);
++      return swap_info[type];
++}
++
++/*
++ * Try to reduce the total number of preswap pages, summed over all swap
++ * areas on swap_list, to at most @target_pages by calling try_to_unuse()
++ * in preswap mode.  Makes at most four passes; each pass handles one swap
++ * area.
++ *
++ * code structure leveraged from sys_swapoff
++ */
++void preswap_shrink(unsigned long target_pages)
++{
++      struct swap_info_struct *si = NULL;
++      unsigned long total_pages = 0, total_pages_to_unuse;
++      unsigned long pages = 0, unuse_pages = 0;
++      int type;
++      int wrapped = 0;
++
++      do {
++              /*
++               * we don't want to hold swap_lock while doing a very
++               * lengthy try_to_unuse, but swap_list may change
++               * so restart scan from swap_list.head each time
++               */
++              spin_lock(&swap_lock);
++              total_pages = 0;
++              for (type = swap_list.head; type >= 0; type = si->next) {
++                      si = swap_info[type];
++                      total_pages += si->preswap_pages;
++              }
++              /* already at or below the target: nothing to do */
++              if (total_pages <= target_pages) {
++                      spin_unlock(&swap_lock);
++                      return;
++              }
++              total_pages_to_unuse = total_pages - target_pages;
++              /*
++               * pick the first swap area for which the memory check below
++               * succeeds (returns 0)
++               */
++              for (type = swap_list.head; type >= 0; type = si->next) {
++                      si = swap_info[type];
++                      if (total_pages_to_unuse < si->preswap_pages)
++                              pages = unuse_pages = total_pages_to_unuse;
++                      else {
++                              pages = si->preswap_pages;
++                              unuse_pages = 0; /* unuse all */
++                      }
++                      if (security_vm_enough_memory_kern(pages))
++                              continue;
++                      vm_unacct_memory(pages);
++                      break;
++              }
++              spin_unlock(&swap_lock);
++              /* the scan ran off the end of the list: no area qualified */
++              if (type < 0)
++                      return;
++              current->flags |= PF_OOM_ORIGIN;
++              (void)try_to_unuse(type, 1, unuse_pages);
++              current->flags &= ~PF_OOM_ORIGIN;
++              wrapped++;
++      } while (wrapped <= 3);
++}
++
++
++#ifdef CONFIG_SYSCTL
++/*
++ * cat /proc/sys/vm/preswap provides total number of pages in preswap
++ * across all swaptypes.  echo N > /proc/sys/vm/preswap attempts to shrink
++ * preswap page usage to N (usually 0)
++ *
++ * Returns 0 on success or the error returned by proc_doulongvec_minmax();
++ * on error no shrink is attempted.
++ */
++int preswap_sysctl_handler(ctl_table *table, int write,
++      void __user *buffer, size_t *length, loff_t *ppos)
++{
++      /*
++       * Default to "no limit" so that a write which stores no value
++       * never shrinks preswap to an arbitrary target.
++       */
++      unsigned long npages = ~0UL;
++      int type, ret;
++      unsigned long totalpages = 0;
++      struct swap_info_struct *si = NULL;
++
++      /* modeled after hugetlb_sysctl_handler in mm/hugetlb.c */
++      if (!write) {
++              spin_lock(&swap_lock);
++              for (type = swap_list.head; type >= 0; type = si->next) {
++                      si = swap_info[type];
++                      totalpages += si->preswap_pages;
++              }
++              spin_unlock(&swap_lock);
++              npages = totalpages;
++      }
++      /*
++       * NOTE(review): this points the shared ctl_table at a stack
++       * variable; concurrent callers would race on table->data - confirm
++       * whether that needs a private table copy.
++       */
++      table->data = &npages;
++      table->maxlen = sizeof(unsigned long);
++      ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
++      if (ret)
++              return ret;
++
++      if (write)
++              preswap_shrink(npages);
++
++      return 0;
++}
++#endif
++#endif /* CONFIG_PRESWAP */
diff --cc mm/tmem-xen.c

index 0000000,0000000..d79398a

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/mm/tmem-xen.c
@@@ -1,0 -1,0 +1,56 @@@
++/*
++ * Xen implementation for transcendent memory (tmem)
++ *
++ * Dan Magenheimer <dan.magenheimer@oracle.com> 2009
++ */
++
++#include <linux/types.h>
++#include <xen/interface/xen.h>
++#include <asm/hypervisor.h>
++#include "tmem.h"
++
++/*
++ * Fill in a generic tmem request from the arguments and hand it to the
++ * hypervisor; returns whatever HYPERVISOR_tmem_op() returns.
++ */
++int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid, u32 index,
++      unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
++{
++      struct tmem_op req;
++
++      /* the object id is copied verbatim, so both layouts must agree */
++      BUILD_BUG_ON(sizeof(req.u.gen.oid) != sizeof(oid.oid));
++
++      req.cmd = tmem_cmd;
++      req.pool_id = tmem_pool;
++      memcpy(req.u.gen.oid, oid.oid, sizeof(req.u.gen.oid));
++      req.u.gen.index = index;
++      req.u.gen.tmem_offset = tmem_offset;
++      req.u.gen.pfn_offset = pfn_offset;
++      req.u.gen.len = len;
++      req.u.gen.cmfn = gmfn;
++
++      return HYPERVISOR_tmem_op(&req);
++}
++
++/*
++ * Build a pool-creation request from @uuid and @flags and hand it to the
++ * hypervisor; returns whatever HYPERVISOR_tmem_op() returns.
++ *
++ * When TMEM_SPEC_VERSION is defined, a zero version field in @flags is
++ * filled in with TMEM_SPEC_VERSION, a matching version is accepted as is,
++ * and any other version triggers a warning and returns -ENOSYS.
++ */
++int xen_tmem_new_pool(uint32_t tmem_cmd, struct tmem_pool_uuid uuid,
++      uint32_t flags)
++{
++      struct tmem_op req;
++
++      req.cmd = tmem_cmd;
++      req.u.creat.uuid[0] = uuid.lo;
++      req.u.creat.uuid[1] = uuid.hi;
++#ifdef TMEM_SPEC_VERSION
++      switch (flags >> TMEM_POOL_VERSION_SHIFT) {
++      case TMEM_SPEC_VERSION:
++              break;
++      case 0:
++              flags |= TMEM_SPEC_VERSION << TMEM_POOL_VERSION_SHIFT;
++              break;
++      default:
++              WARN(1, "TMEM: Bogus version %u, expecting %u\n",
++                   flags >> TMEM_POOL_VERSION_SHIFT, TMEM_SPEC_VERSION);
++              return -ENOSYS;
++      }
++#endif
++      req.u.creat.flags = flags;
++
++      return HYPERVISOR_tmem_op(&req);
++}
diff --cc mm/tmem.h

index 0000000,0000000..c57a23b

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/mm/tmem.h
@@@ -1,0 -1,0 +1,71 @@@
++/*
++ * linux/mm/tmem.h
++ *
++ * Interface to transcendent memory, used by mm/precache.c and mm/preswap.c
++ * Currently implemented on XEN, but may be implemented elsewhere in future.
++ *
++ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
++ */
++
++#ifdef CONFIG_XEN
++#include <xen/interface/xen.h>
++
++/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
++#define TMEM_POOL_MIN_PAGESHIFT   12
++/* page-size field for new pools: PAGE_SHIFT relative to the minimum shift */
++#define TMEM_POOL_PAGEORDER       (PAGE_SHIFT - TMEM_POOL_MIN_PAGESHIFT)
++
++/* 128-bit pool uuid, passed to xen_tmem_new_pool() as two 64-bit halves */
++struct tmem_pool_uuid {
++      u64 lo;
++      u64 hi;
++};
++
++/* tmem object id: three 64-bit words, copied verbatim into the request */
++struct tmem_oid {
++      u64 oid[3];
++};
++
++/* issue a generic tmem request; defined in mm/tmem-xen.c */
++extern int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid, u32 index,
++      unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len);
++/* issue a pool-creation request; defined in mm/tmem-xen.c */
++extern int xen_tmem_new_pool(u32 tmem_cmd, struct tmem_pool_uuid, u32 flags);
++
++/* Issue TMEM_PUT_PAGE for frame @gmfn under (@pool_id, @oid, @index). */
++static inline int tmem_put_page(u32 pool_id, struct tmem_oid oid, u32 index,
++      unsigned long gmfn)
++{
++      int rc = xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index, gmfn, 0, 0, 0);
++
++      return rc;
++}
++
++/* Issue TMEM_GET_PAGE for frame @gmfn under (@pool_id, @oid, @index). */
++static inline int tmem_get_page(u32 pool_id, struct tmem_oid oid, u32 index,
++      unsigned long gmfn)
++{
++      int rc = xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index, gmfn, 0, 0, 0);
++
++      return rc;
++}
++
++/* Issue TMEM_FLUSH_PAGE for the page at (@pool_id, @oid, @index). */
++static inline int tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index)
++{
++      int rc = xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index, 0, 0, 0, 0);
++
++      return rc;
++}
++
++/* Issue TMEM_FLUSH_OBJECT for object @oid in pool @pool_id. */
++static inline int tmem_flush_object(u32 pool_id, struct tmem_oid oid)
++{
++      int rc = xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0);
++
++      return rc;
++}
++
++/*
++ * Request a new tmem pool identified by (@uuid_lo, @uuid_hi).  The page
++ * order is merged into @flags before the request is issued; the build
++ * fails if that order is negative or not below TMEM_POOL_PAGESIZE_MASK.
++ * Returns the result of xen_tmem_new_pool().
++ */
++static inline int tmem_new_pool(u64 uuid_lo, u64 uuid_hi, u32 flags)
++{
++      struct tmem_pool_uuid pool_uuid = { .lo = uuid_lo, .hi = uuid_hi };
++      u32 pool_flags;
++
++      BUILD_BUG_ON((TMEM_POOL_PAGEORDER < 0) ||
++              (TMEM_POOL_PAGEORDER >= TMEM_POOL_PAGESIZE_MASK));
++      pool_flags = flags | (TMEM_POOL_PAGEORDER << TMEM_POOL_PAGESIZE_SHIFT);
++      return xen_tmem_new_pool(TMEM_NEW_POOL, pool_uuid, pool_flags);
++}
++
++/* Issue TMEM_DESTROY_POOL for @pool_id, passing an all-zero object id. */
++static inline int tmem_destroy_pool(u32 pool_id)
++{
++      static const struct tmem_oid zero_oid = {};
++      int rc = xen_tmem_op(TMEM_DESTROY_POOL, pool_id, zero_oid, 0, 0, 0, 0, 0);
++
++      return rc;
++}
++#endif
diff --cc mm/truncate.c

index b37ba65,a956675..31cee33
--- 1/mm/truncate.c
--- 2/mm/truncate.c
+++ b/mm/truncate.c
@@@ -16,6 -16,6 +16,7 @@@
   #include <linux/pagemap.h>
   #include <linux/highmem.h>
   #include <linux/pagevec.h>
++#include <linux/precache.h>
   #include <linux/task_io_accounting_ops.h>
   #include <linux/buffer_head.h>        /* grr. try_to_release_page,
                                    do_invalidatepage */
@@@ -52,7 -51,6 +52,7 @@@ void do_invalidatepage(struct page *pag
   static inline void truncate_partial_page(struct page *page, unsigned partial)
   {
         zero_user_segment(page, partial, PAGE_CACHE_SIZE);
-       cleancache_flush_page(page->mapping, page);
++      precache_flush(page->mapping, page->index);
         if (page_has_private(page))
                 do_invalidatepage(page, partial);
   }
@@@ -216,7 -214,6 +216,7 @@@ void truncate_inode_pages_range(struct 
         pgoff_t next;
         int i;
   
-       cleancache_flush_inode(mapping);
++      precache_flush_inode(mapping);
         if (mapping->nrpages == 0)
                 return;
   
@@@ -294,14 -291,6 +294,14 @@@
                 pagevec_release(&pvec);
                 mem_cgroup_uncharge_end();
         }
-       cleancache_flush_inode(mapping);
+ +      /*
+ +       * Cycle the tree_lock to make sure all __delete_from_page_cache()
+ +       * calls run from page reclaim have finished as well (this handles the
+ +       * case when page reclaim took the last page from our range).
+ +       */
+ +      spin_lock_irq(&mapping->tree_lock);
+ +      spin_unlock_irq(&mapping->tree_lock);
++      precache_flush_inode(mapping);
   }
   EXPORT_SYMBOL(truncate_inode_pages_range);
   
@@@ -451,7 -440,6 +451,7 @@@ int invalidate_inode_pages2_range(struc
         int did_range_unmap = 0;
         int wrapped = 0;
   
-       cleancache_flush_inode(mapping);
++      precache_flush_inode(mapping);
         pagevec_init(&pvec, 0);
         next = start;
         while (next <= end && !wrapped &&
@@@ -510,7 -498,6 +510,7 @@@
                 mem_cgroup_uncharge_end();
                 cond_resched();
         }
-       cleancache_flush_inode(mapping);
++      precache_flush_inode(mapping);
         return ret;
   }
   EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --cc mm/vmalloc.c

index 1d34d75,5d60302..0e775c7
--- 1/mm/vmalloc.c
--- 2/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@@ -1537,7 -1537,6 +1537,13 @@@ static void *__vmalloc_area_node(struc
         struct page **pages;
         unsigned int nr_pages, array_size, i;
         gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
++#ifdef CONFIG_XEN
++      gfp_t dma_mask = gfp_mask & (__GFP_DMA | __GFP_DMA32);
++
++      BUILD_BUG_ON((__GFP_DMA | __GFP_DMA32) != (__GFP_DMA + __GFP_DMA32));
++      if (dma_mask == (__GFP_DMA | __GFP_DMA32))
++              gfp_mask &= ~(__GFP_DMA | __GFP_DMA32);
++#endif
   
         nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
         array_size = (nr_pages * sizeof(struct page *));
@@@ -1574,6 -1572,6 +1579,16 @@@
                         goto fail;
                 }
                 area->pages[i] = page;
++#ifdef CONFIG_XEN
++              if (dma_mask) {
++                      if (xen_limit_pages_to_max_mfn(page, 0, 32)) {
++                              area->nr_pages = i + 1;
++                              goto fail;
++                      }
++                      if (gfp_mask & __GFP_ZERO)
++                              clear_highpage(page);
++              }
++#endif
         }
   
         if (map_vm_area(area, prot, &pages))
@@@ -1786,6 -1781,6 +1798,8 @@@ void *vmalloc_exec(unsigned long size
   #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
   #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
   #define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL
++#elif defined(CONFIG_XEN)
++#define GFP_VMALLOC32 __GFP_DMA | __GFP_DMA32 | GFP_KERNEL
   #else
   #define GFP_VMALLOC32 GFP_KERNEL
   #endif
diff --cc net/bridge/br_if.c

index 3c79b92,718b603..14a4c1c
--- 1/net/bridge/br_if.c
--- 2/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@@ -239,17 -290,29 +290,36 @@@ int br_add_bridge(struct net *net, cons
         if (!dev)
                 return -ENOMEM;
   
+ +      if (!try_module_get(THIS_MODULE)) {
+ +              free_netdev(dev);
+ +              return -ENOENT;
+ +      }
+ +
-       dev_net_set(dev, net);
+       rtnl_lock();
+       if (strchr(dev->name, '%')) {
+               ret = dev_alloc_name(dev, dev->name);
+               if (ret < 0)
+                       goto out_free;
+       }
+ 
+       SET_NETDEV_DEVTYPE(dev, &br_type);
   
-       ret = register_netdev(dev);
+       ret = register_netdevice(dev);
+       if (ret)
+               goto out_free;
+ 
+       ret = br_sysfs_addbr(dev);
+       if (ret)
+               unregister_netdevice(dev);
+  out:
+       rtnl_unlock();
+ +      if (ret)
+ +              module_put(THIS_MODULE);
         return ret;
+ 
+ out_free:
+       free_netdev(dev);
+       goto out;
   }
   
   int br_del_bridge(struct net *net, const char *name)
diff --cc net/ipv6/addrconf.c

index 498b927,a7bda07..fed3c17
--- 1/net/ipv6/addrconf.c
--- 2/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@@ -2835,6 -2845,6 +2845,7 @@@ static void addrconf_dad_start(struct i
                 goto out;
   
         if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
++          !(dev->flags&IFF_MULTICAST) ||
             idev->cnf.accept_dad < 1 ||
             !(ifp->flags&IFA_F_TENTATIVE) ||
             ifp->flags & IFA_F_NODAD) {
@@@ -2938,6 -2948,6 +2949,7 @@@ static void addrconf_dad_completed(stru
              ifp->idev->cnf.forwarding == 2) &&
             ifp->idev->cnf.rtr_solicits > 0 &&
             (dev->flags&IFF_LOOPBACK) == 0 &&
++          (dev->flags&IFF_MULTICAST) &&
             (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
                 /*
                  *      If a host as already performed a random delay
diff --cc scripts/Makefile.build

index d169a79,d5f925a..2716ca1
--- 1/scripts/Makefile.build
--- 2/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@@ -126,6 -110,6 +110,21 @@@ ifndef ob
   $(warning kbuild: Makefile.build is included improperly)
   endif
   
++ifeq ($(CONFIG_XEN),y)
++Makefile.xen := $(if $(KBUILD_EXTMOD),$(KBUILD_EXTMOD),$(objtree)/scripts)/Makefile.xen
++$(Makefile.xen): $(srctree)/scripts/Makefile.xen.awk $(srctree)/scripts/Makefile.build
++      @echo '  Updating $@'
++      $(if $(shell echo a | $(AWK) '{ print gensub(/a/, "AA", "g"); }'),\
++        ,$(error 'Your awk program does not define gensub.  Use gawk or another awk with gensub'))
++      @$(AWK) -f $< $(filter-out $<,$^) >$@
++
++xen-src-single-used-m := $(patsubst $(srctree)/%,%,$(wildcard $(addprefix $(srctree)/,$(single-used-m:.o=-xen.c))))
++xen-single-used-m     := $(xen-src-single-used-m:-xen.c=.o)
++single-used-m         := $(filter-out $(xen-single-used-m),$(single-used-m))
++
++-include $(Makefile.xen)
++endif
++
   # ===========================================================================
   
   ifneq ($(strip $(lib-y) $(lib-m) $(lib-n) $(lib-)),)
@@@ -302,16 -279,14 +295,16 @@@ define rule_cc_o_
   endef
   
   # Built-in and composite module parts
- $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE
+ $(obj)/%.o: $(src)/%.c FORCE
         $(call cmd,force_checksrc)
+ +      $(call cmd,force_check_kmsg)
         $(call if_changed_rule,cc_o_c)
   
   # Single-part modules are special since we need to mark them in $(MODVERDIR)
   
- $(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE
+ $(single-used-m): $(obj)/%.o: $(src)/%.c FORCE
         $(call cmd,force_checksrc)
+ +      $(call cmd,force_check_kmsg)
         $(call if_changed_rule,cc_o_c)
         @{ echo $(@:.o=.ko); echo $@; } > $(MODVERDIR)/$(@F:.o=.mod)
   
diff --cc scripts/Makefile.lib

index 93b2b59,1c702ca..d782a86
--- 1/scripts/Makefile.lib
--- 2/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@@ -22,6 -22,6 +22,12 @@@ obj-m := $(filter-out $(obj-y),$(obj-m)
   
   lib-y := $(filter-out $(obj-y), $(sort $(lib-y) $(lib-m)))
   
++# Remove objects forcibly disabled
++
++obj-y := $(filter-out $(disabled-obj-y),$(obj-y))
++obj-m := $(filter-out $(disabled-obj-y),$(obj-m))
++lib-y := $(filter-out $(disabled-obj-y),$(lib-y))
++
   
   # Handle objects in subdirs
   # ---------------------------------------------------------------------------
diff --cc scripts/Makefile.xen.awk

index 0000000,0000000..1f7cf1e

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/scripts/Makefile.xen.awk
@@@ -1,0 -1,0 +1,34 @@@
++# Filter for makefile text: for every pattern rule whose prerequisite is
++# $(src)/%.c or $(src)/%.S, print a copy of the rule line with the
++# prerequisite renamed to %-xen.c / %-xen.S (and "single-used-m" renamed to
++# "xen-single-used-m"), followed by the lines that belong to that rule.
++# Everything else is dropped.
++BEGIN {
++      # is_rule is 1 while the lines being read belong to a copied rule
++      is_rule = 0
++}
++
++# comment lines are never copied
++/^[[:space:]]*#/ {
++      next
++}
++
++# a blank line ends the current rule; emit one blank line as separator
++/^[[:space:]]*$/ {
++      if (is_rule)
++              print("")
++      is_rule = 0
++      next
++}
++
++# rule line with a $(src)/%.c or $(src)/%.S prerequisite: rewrite and copy
++/:[[:space:]]*\$\(src\)\/%\.[cS][[:space:]]/ {
++      line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
++      line = gensub(/(single-used-m)/, "xen-\\1", "g", line)
++      print line
++      is_rule = 1
++      next
++}
++
++# NOTE(review): as written this only matches lines consisting of exactly one
++# non-tab character; if the intent is "any line not starting with a tab ends
++# the rule", the trailing $ looks unintended - confirm.
++/^[^\t]$/ {
++      if (is_rule)
++              print("")
++      is_rule = 0
++      next
++}
++
++# any other line is copied only while inside a copied rule
++is_rule {
++      print $0
++      next
++}
diff --cc scripts/kconfig/Makefile
Simple merge
diff --cc scripts/mkcompile_h

index f221ddf,50ad317..71c57ca
--- 1/scripts/mkcompile_h
--- 2/scripts/mkcompile_h
+++ b/scripts/mkcompile_h
@@@ -73,8 -63,20 +63,10 @@@ UTS_TRUNCATE="cut -b -$UTS_LEN
   
     echo \#define UTS_VERSION \"`echo $UTS_VERSION | $UTS_TRUNCATE`\"
   
-   echo \#define LINUX_COMPILE_BY \"`echo $LINUX_COMPILE_BY | $UTS_TRUNCATE`\"
-   echo \#define LINUX_COMPILE_HOST \"`echo $LINUX_COMPILE_HOST | $UTS_TRUNCATE`\"
+   echo \#define LINUX_COMPILE_TIME \"`date +%T`\"
- -  echo \#define LINUX_COMPILE_BY \"`whoami`\"
- -  echo \#define LINUX_COMPILE_HOST \"`hostname | $UTS_TRUNCATE`\"
- -
- -  domain=`dnsdomainname 2> /dev/null`
- -  if [ -z "$domain" ]; then
- -    domain=`domainname 2> /dev/null`
- -  fi
- -
- -  if [ -n "$domain" ]; then
- -    echo \#define LINUX_COMPILE_DOMAIN \"`echo $domain | $UTS_TRUNCATE`\"
- -  else
- -    echo \#define LINUX_COMPILE_DOMAIN
- -  fi
++  echo \#define LINUX_COMPILE_BY \"geeko\"
++  echo \#define LINUX_COMPILE_HOST \"buildhost\"
++  echo \#define LINUX_COMPILE_DOMAIN \"suse.de\"
   
     echo \#define LINUX_COMPILER \"`$CC -v 2>&1 | tail -n 1`\"
   ) > .tmpcompile
diff --cc scripts/mod/modpost.c
Simple merge
diff --cc security/apparmor/match.c
Simple merge
diff --cc security/apparmor/policy_unpack.c
Simple merge
diff --cc sound/pci/hda/hda_intel.c
Simple merge
diff --cc virt/kvm/ioapic.c
Simple merge
author	Jeff Mahoney <jeffm@suse.com>
	Tue, 31 May 2011 03:09:44 +0000 (23:09 -0400)
committer	Jeff Mahoney <jeffm@suse.com>
	Tue, 31 May 2011 03:09:44 +0000 (23:09 -0400)